def update_features(self, indices):
    self.minimalp = 0
    self.minimaln = 0
    activation1, activation2 = self.get_activations()
    print("activation1")
    print(activation1)
    print("activation2")
    print(activation2)
    dat_znorm_p = znorm(activation1[indices])
    dat_znorm_n = znorm(activation2[indices])
    symRep_p = ts_to_string(dat_znorm_p, cuts_for_asize(self.symbols))
    symRep_n = ts_to_string(dat_znorm_n, cuts_for_asize(self.symbols))
    # transfer result to feature (string to tuple)
    feature_p = tuple(symRep_p)
    feature_n = tuple(symRep_n)
    print("found symbolic feature for SQ-P:", symRep_p)
    if feature_p in self.testObjective.feature_p:
        self.minimalp = 1
        self.testObjective.feature_p.remove(feature_p)
    self.coverage_p = 1 - len(self.testObjective.feature_p) / self.testObjective.originalNumOfFeature
    self.displayCoverage1()
    print("found symbolic feature for SQ-N:", symRep_n)
    if feature_n in self.testObjective.feature_n:
        self.minimaln = 1
        self.testObjective.feature_n.remove(feature_n)
    self.coverage_n = 1 - len(self.testObjective.feature_n) / self.testObjective.originalNumOfFeature
    self.displayCoverage2()
    return self.coverage_p, self.coverage_n
def maximize_level_node(self, max_level):
    """
    Try to maximize the level of the node
    :param max_level:
    :return:
    """
    values_group = list(self.group.values())
    original_level = self.level
    equal = True
    while equal and self.level < max_level:
        temp_level = self.level + 1
        data = np.array(values_group[0])
        data_znorm = znorm(data)
        data_paa = paa(data_znorm, self.paa_value)
        pr = ts_to_string(data_paa, cuts_for_asize(temp_level))
        for index in range(1, len(values_group)):
            data = np.array(values_group[index])
            data_znorm = znorm(data)
            data_paa = paa(data_znorm, self.paa_value)
            pr_2 = ts_to_string(data_paa, cuts_for_asize(temp_level))
            if pr_2 != pr:
                equal = False
        if equal:
            self.level = temp_level
    if original_level != self.level:
        logger.info("New level for node: {}".format(self.level))
        data = np.array(values_group[0])
        data_znorm = znorm(data)
        data_paa = paa(data_znorm, self.paa_value)
        self.pattern_representation = ts_to_string(data_paa, cuts_for_asize(self.level))
    else:
        logger.info("Can't split again, max level already reached")
def row_pattern_loss(row: np.ndarray, pr: Tuple[str, int]):
    pattern = []
    cuts = cuts_for_asize(pr[1] + 1)[1:]
    for c in pr[0]:
        n = ord(c) - 97
        pattern.append(cuts[n])
    if len(pattern) != len(row):
        normalized_row = paa(znorm(row), len(pattern))
    else:
        normalized_row = znorm(row)
    return distance(normalized_row, pattern)
def sax_via_window(series, win_size, paa_size, alphabet_size=3,
                   nr_strategy='exact', z_threshold=0.01):
    """Simple via window conversion implementation."""
    cuts = cuts_for_asize(alphabet_size)
    sax = defaultdict(list)

    prev_word = ''
    for i in range(0, len(series) - win_size):
        sub_section = series[i:(i + win_size)]
        zn = znorm(sub_section, z_threshold)
        paa_rep = paa(zn, paa_size)
        curr_word = ts_to_string(paa_rep, cuts)
        if '' != prev_word:
            if 'exact' == nr_strategy and prev_word == curr_word:
                continue
            elif 'mindist' == nr_strategy and \
                    is_mindist_zero(prev_word, curr_word):
                continue
        prev_word = curr_word
        sax[curr_word].append(i)

    return sax
def apply_adaptive_sax(ts, win_size, paa_size, alphabet_size, z_threshold):
    """
    This function applies the SAX transformation to a 1-dim time series
    using adaptive break-points
    :param ts: 1-dim time series
    :type ts: 1D array
    :param win_size: size of the sliding window that generates each sax word
    :type win_size: int
    :param paa_size: number of characters in a single sax word
    :type paa_size: int
    :param alphabet_size: number of unique characters to use in the sax representation
    :type alphabet_size: int
    :param z_threshold: z_threshold for the znorm method from saxpy
    :type z_threshold: float
    :return: the sax sequence, a list of strings, where each string represents a single sax word
    :rtype: list of str
    """
    sax_sequence = []
    cuts = cuts_for_asize(alphabet_size)
    for t in range(0, len(ts) - win_size + 1):
        ts_win = ts[t:(t + win_size)]
        ts_win_znormed = znorm(ts_win, z_threshold)
        paa_rep = paa(ts_win_znormed, paa_size)
        sax_word = ts_to_string(paa_rep, cuts)
        sax_sequence.append(sax_word)
    return sax_sequence
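# A minimal usage sketch for the sliding-window helper above. The sine-wave
# input and the parameter values are illustrative assumptions, not part of the
# original code; saxpy's znorm/paa/ts_to_string are assumed to be imported.
import numpy as np

ts = np.sin(np.linspace(0, 6 * np.pi, 300))  # hypothetical 1-dim time series
words = apply_adaptive_sax(ts, win_size=50, paa_size=5, alphabet_size=4, z_threshold=0.01)
# One 5-character SAX word per window position, len(ts) - win_size + 1 words in total.
print(len(words), words[:3])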
def find_discords_brute_force(series, win_size, num_discords=2,
                              znorm_threshold=0.01):
    """Early-abandoned distance-based discord discovery."""
    discords = list()

    globalRegistry = VisitRegistry(len(series) - win_size + 1)

    znorms = np.array([
        znorm(series[pos:pos + win_size], znorm_threshold)
        for pos in range(len(series) - win_size + 1)
    ])

    while len(discords) < num_discords:

        bestDiscord = find_best_discord_brute_force(series, win_size,
                                                    globalRegistry, znorms)

        if -1 == bestDiscord[0]:
            break

        discords.append(bestDiscord)

        mark_start = max(0, bestDiscord[0] - win_size + 1)
        mark_end = bestDiscord[0] + win_size
        globalRegistry.mark_visited_range(mark_start, mark_end)

    return discords
def test_znorm():
    """Test the znorm implementation."""
    # test std is 1 and mean is 0
    ts = array([-1., -2., -1., 0., 2., 1., 1., 0.])
    z_thrsh = 0.001

    x_scaled = [x / 100.0 for x in ts]

    assert pytest.approx(1.0, 0.000001) == std(znorm.znorm(x_scaled, z_thrsh))
    assert pytest.approx(0.0, 0.000001) == mean(znorm.znorm(x_scaled, z_thrsh))

    # test std and mean wouldn't change on high threshold
    ts = array([-0.1, -0.2, 0.2, 0.1])
    z_thrsh = 0.5

    ts_mean = mean(ts)
    ts_sd = std(ts)

    assert ts_mean == mean(znorm.znorm(ts, z_thrsh))
    assert ts_sd == std(znorm.znorm(ts, z_thrsh))
def find_best_discord_brute_force(series, win_size, global_registry,
                                  z_threshold=0.01):
    """Early-abandoned distance-based discord discovery."""
    best_so_far_distance = -1.0
    best_so_far_index = -1

    outerRegistry = global_registry.clone()

    outer_idx = outerRegistry.get_next_unvisited()

    while ~np.isnan(outer_idx):

        outerRegistry.mark_visited(outer_idx)

        candidate_seq = znorm(series[outer_idx:(outer_idx + win_size)], z_threshold)

        nnDistance = np.inf
        innerRegistry = VisitRegistry(len(series) - win_size)

        inner_idx = innerRegistry.get_next_unvisited()

        while ~np.isnan(inner_idx):
            innerRegistry.mark_visited(inner_idx)

            if abs(inner_idx - outer_idx) > win_size:
                curr_seq = znorm(series[inner_idx:(inner_idx + win_size)], z_threshold)
                dist = early_abandoned_dist(candidate_seq, curr_seq, nnDistance)
                if (~np.isnan(dist)) and (dist < nnDistance):
                    nnDistance = dist

            inner_idx = innerRegistry.get_next_unvisited()

        if ~(np.inf == nnDistance) and (nnDistance > best_so_far_distance):
            best_so_far_distance = nnDistance
            best_so_far_index = outer_idx

        outer_idx = outerRegistry.get_next_unvisited()

    return best_so_far_index
def SAX(sequence: np.ndarray, alphabet_size: int, length: int = 0) -> str:
    """
    Computes SAX string of a sequence of numbers with specified alphabet size.
    Length of the output string may be specified; length 0 will generate a
    string as long as the sequence.
    """
    debug("Calculating SAX of {}, with alphabet of size {}".format(
        sequence, alphabet_size))

    if alphabet_size == 1:
        if length == 0:
            return "a" * len(sequence)
        else:
            return "a" * length
    else:
        if length == 0 or length == len(sequence):
            return ts_to_string(znorm(sequence), cuts_for_asize(alphabet_size))
        else:
            return ts_to_string(paa(znorm(sequence), length),
                                cuts_for_asize(alphabet_size))
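# A hedged usage sketch for SAX() above; the input sequence and sizes are
# made-up examples, not taken from the original code.
import numpy as np

seq = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])
print(SAX(seq, alphabet_size=4))            # full-length word, one letter per point
print(SAX(seq, alphabet_size=4, length=4))  # PAA-compressed to 4 letters
print(SAX(seq, alphabet_size=1, length=3))  # degenerate alphabet -> "aaa"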
def _preprocess_ts(time_series_df):
    """ z-normalizes the time series' numeric values """
    del time_series_df["time_delta_in_days"]
    time_series_df["value_representation"] = znorm(time_series_df.numeric_value)
    del time_series_df["numeric_value"]
    return time_series_df
def saxrepresentation(matrix):
    result = []
    index_ts = 0
    for ts in matrix.T:
        sax_representation = znorm(ts)
        dat_paa_3 = paa(sax_representation, 3)
        a = ts_to_string(dat_paa_3, cuts_for_asize(3))
        result.append(a)
    return result
def update_features(self, data):
    self.hidden = self.state_manager.get_hidden_state(data)
    activation = self.get_activation()
    dat_znorm = znorm(activation[self.indices])
    sym_rep = ts_to_string(dat_znorm, cuts_for_asize(self.symbols))
    feature = tuple(sym_rep)
    if feature in self.feature:
        index = self.feature.index(feature)
        self.covered_dict[index] = True
def update_features(self, data):
    self.hidden = self.state_manager.get_hidden_state(data)
    activation_p, activation_n = self.get_activation()
    dat_znorm_p = znorm(activation_p[self.indices])
    dat_znorm_n = znorm(activation_n[self.indices])
    sym_rep_p = ts_to_string(dat_znorm_p, cuts_for_asize(self.symbols))
    sym_rep_n = ts_to_string(dat_znorm_n, cuts_for_asize(self.symbols))
    feature_p = tuple(sym_rep_p)
    feature_n = tuple(sym_rep_n)
    if feature_p in self.feature_p:
        index = self.feature_p.index(feature_p)
        self.covered_dict_p[index] = True
        self.frequency_dict_p[index] += 1
    if feature_n in self.feature_n:
        index = self.feature_n.index(feature_n)
        self.covered_dict_n[index] = True
        self.frequency_dict_n[index] += 1
def ppa_representation(data, seq_len):
    data_reduced = np.zeros(shape=(int(data.shape[0] / seq_len), data.shape[1]))
    paa_segment = int(data.shape[0] / seq_len)
    for i in tqdm(range(data.shape[1])):
        dat_znorm = znorm(data[:, i])
        data_reduced[:, i] = paa(dat_znorm, paa_segment)
    return data_reduced
def find_discords_hotsax(series, win_size=100, num_discords=2, alphabet_size=3,
                         paa_size=3, znorm_threshold=0.01, sax_type='unidim'):
    """HOT-SAX-driven discords discovery."""
    discords = list()

    global_registry = set()

    # Z-normalized versions for every subsequence.
    znorms = np.array([
        znorm(series[pos:pos + win_size], znorm_threshold)
        for pos in range(len(series) - win_size + 1)
    ])

    # SAX words for every subsequence.
    sax_data = sax_via_window(series, win_size=win_size, paa_size=paa_size,
                              alphabet_size=alphabet_size, nr_strategy=None,
                              znorm_threshold=0.01, sax_type=sax_type)

    """[2.0] build the 'magic' array"""
    magic_array = list()
    for k, v in sax_data.items():
        magic_array.append((k, len(v)))

    """[2.1] sort it ascending by the number of occurrences"""
    magic_array = sorted(magic_array, key=lambda tup: tup[1])

    while len(discords) < num_discords:

        best_discord = find_best_discord_hotsax(series, win_size, global_registry,
                                                sax_data, magic_array, znorms)

        if -1 == best_discord[0]:
            break

        discords.append(best_discord)

        mark_start = max(0, best_discord[0] - win_size + 1)
        mark_end = best_discord[0] + win_size

        for i in range(mark_start, mark_end):
            global_registry.add(i)

    return discords
def discretize_data(data, w, features, start_date, end_date, dataset, plotting):
    """
    Function that performs SAX discretization on the signals
    :param data: the signals to be discretized
    :param w: the number of PAA segments to represent the initial time series
    :param features: the name of the signals to be discretized
    :param start_date: date used mostly for visualization reasons
    :param end_date: date used mostly for visualization reasons
    :param dataset: the type of the dataset (used for saving reasons)
    :param plotting: if True then the discretized signals are plotted
    :return: the discretized series with the indices of each PAA segment
    """
    alphabet = 5  # the length of the alphabet to be used

    # dictionaries used for plotting reasons
    symbol_to_number = {'a': -1.5, 'b': -0.75, 'c': 0, 'd': 0.75, 'e': 1.5}  # just for visualization purposes
    number_to_symbol = {'0': 'a', '1': 'b', '2': 'c', '3': 'd', '4': 'e'}

    sax_seqs = {}
    sax_indices = {}
    for feature in features:
        print('------------------------------------- Discretizing %s -------------------------------------' % feature)
        sax_str, real_indices = sax.to_letter_rep(np.array(data[feature]), w, alphabet)  # SAX discretization
        sax_seqs[feature] = sax_str          # store the discretized series
        sax_indices[feature] = real_indices  # store the indices of the real blocks

        # plotting part
        if plotting:
            normalized_signal = znorm(np.array(data[feature]))
            discrete = uncompress_labels(sax_str, real_indices)
            discrete = [symbol_to_number[number_to_symbol[d]] for d in discrete]
            fig = plt.figure()
            ax = fig.add_subplot(111)
            ax.plot(pd.DataFrame(normalized_signal, index=data.index).loc[start_date:end_date],
                    label='normalized signal')
            ax.plot(pd.DataFrame(discrete, index=data.index).loc[start_date:end_date],
                    label='discretized signal')
            ax.xaxis.set_major_locator(mdates.DayLocator([5, 10, 15, 20, 25, 30]))
            ax.xaxis.set_major_formatter(mdates.DateFormatter('%d/%m'))
            plt.xlabel("time")
            plt.ylabel(feature)
            plt.xticks(rotation=45)
            plt.yticks(np.array(list(symbol_to_number.values())), tuple(symbol_to_number.keys()))
            plt.grid()
            plt.legend(loc='lower right')
            plt.savefig('plots/sax/%s_discretization_%s.png' % (dataset, feature), bbox_inches='tight')

    return sax_seqs, sax_indices
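# A hypothetical call to discretize_data() above with plotting disabled; the
# DataFrame, column name and dates are invented for illustration, and it assumes
# the external `sax` module used inside the function is importable.
import numpy as np
import pandas as pd

idx = pd.date_range('2019-01-01', periods=240, freq='H')
df = pd.DataFrame({'power': np.random.randn(240).cumsum()}, index=idx)
sax_seqs, sax_indices = discretize_data(df, w=24, features=['power'],
                                        start_date='2019-01-02', end_date='2019-01-08',
                                        dataset='demo', plotting=False)
print(sax_seqs['power'])  # the discretized string for the 'power' signal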
def extract_features(song_name):
    # returns mfcc and chroma features in SAX representation
    try:
        x, fs = librosa.load(song_name)
    except:
        return None
    mfccs = librosa.feature.mfcc(x, sr=fs, n_mfcc=39)
    chroma = librosa.feature.chroma_stft(x, sr=fs)
    feature_matrix = np.concatenate((mfccs, chroma))
    sax_rep = [
        ts_to_string(paa(znorm(feat), SAX_VOCAB_LENGTH), cuts_for_asize(SAX_VOCAB_LENGTH))
        for feat in feature_matrix
    ]
    return sax_rep
def sax_via_window(series, win_size, paa_size, alphabet_size=3,
                   nr_strategy='exact', z_threshold=0.01):
    """Simple via window conversion implementation."""
    # Generate the alphabet cuts for the given alphabet size
    cuts = cuts_for_asize(alphabet_size)
    # Initialize the SAX dictionary
    sax = defaultdict(list)

    prev_word = ''
    for i in range(0, len(series) - win_size):
        # Sub-section of the series covered by the current window
        sub_section = series[i:(i + win_size)]
        # Z-normalize
        zn = znorm(sub_section, z_threshold)
        # PAA (piecewise aggregate approximation): reduce the sub-section to paa_size points
        paa_rep = paa(zn, paa_size)
        # Convert the PAA sequence into a SAX string
        curr_word = ts_to_string(paa_rep, cuts)
        if '' != prev_word:
            if 'exact' == nr_strategy and prev_word == curr_word:
                continue
            elif 'mindist' == nr_strategy and \
                    is_mindist_zero(prev_word, curr_word):
                continue
        prev_word = curr_word
        sax[curr_word].append(i)

    return sax
def data_creator(ticker):
    import fix_yahoo_finance as yf
    from saxpy.znorm import znorm
    hist = yf.download(tickers=ticker, period='max')
    hist = hist["Close"]
    pc = [(hist[i + 1] - hist[i]) / hist[i] for i in range(len(hist) - 1)]
    pc2 = znorm(pc)
    pc3 = [np.floor(c) for c in pc2]
    X = ent.util_pattern_space(pc2, lag=1, dim=21)
    X.shape
    trainY = X[:, -1]
    trainX = X[:, :-1]
    trainY = np.where(trainY <= -4, True, False)
    drops = np.where(trainY == True)
    return trainX, trainY, drops


tick_list = ["VTI", "VOO", "VEA", "VWO", "VTV", "VUG",
             "VO", "VB", "VEU", "VIG", "VHT", "VFH", "VPL",
             "VPU", "VSS", "VGK", "VOT", "VSS", "VAS", "VGT",
             "EFA", "EWA", "EWH", "EWG", "EWU", "EWQ", "EWL", "EWP",
             "EWD", "EWN", "EWI", "ERUS", "UAE", "EIS", "INDA"]
def znorm_paa_sax(time_series, alpha, w=3, missing='z'):
    """Takes an array containing real values, z-normalizes, reduces dimensionality
    to w, and finally returns a SAX representation over an alphabet of size alpha

    time_series: array holding a time series of one measurement for one patient
    w: the dimensionality to reduce to using PAA, set to len(time_series) in plain
    alpha: the number of discretized segments that the SAX rep will reflect,
        set to 2, 3 or 5 in plain using the RDS algo
    """
    # If time_series is a string, make it into list format e.g. 'abc' -> ['a', 'b', 'c']
    # why? because it's the structure we require for below and i CBA to change it
    if isinstance(time_series, str):
        time_series = list(time_series)

    if len(time_series) > 0:
        # normalizing one time series, time series as numpy array (np.array([]))
        normalized_time_series = znorm(np.array(time_series))
        # dimensionality reduction of time series according to w
        paa_norm_time_series = paa(normalized_time_series, w)
        # turning a discretized and reduced time series into a sequence of characters
        return ts_to_string(paa_norm_time_series, cuts_for_asize(alpha))
    else:
        return missing
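# A small usage sketch for znorm_paa_sax() above; the input values and
# parameters are invented for illustration.
vals = [36.5, 36.7, 37.1, 38.2, 38.9, 37.5, 36.9, 36.6]
print(znorm_paa_sax(vals, alpha=3, w=3))  # prints a 3-letter SAX word
print(znorm_paa_sax([], alpha=3))         # empty series falls back to the missing marker 'z'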
def PAA_aggregation(v):
    dat_znorm = znorm(v)
    r = paa(dat_znorm, len(v))
    # r = znorm(v)
    print("MIN MAX", min(r), max(r))
    return r
def sax_via_window(series, win_size, paa_size, alphabet_size=3,
                   nr_strategy='exact', znorm_threshold=0.01, sax_type='unidim'):
    """Simple via window conversion implementation.

    # SAX-ENERGY
    >>> sax_via_window([[1, 2, 3], [4, 5, 6]], win_size=1, paa_size=3, sax_type='energy', nr_strategy=None)['abc']
    [0, 1]
    >>> sax_via_window([[1, 2, 3, 4], [4, 5, 6, 7]], win_size=1, paa_size=4, sax_type='energy', nr_strategy=None)['aacc']
    [0, 1]
    >>> sax_via_window([[1, 2, 3, 4], [4, 5, 6, 7]], win_size=2, paa_size=4, sax_type='energy', nr_strategy=None)['aaccaacc']
    [0]

    # SAX-REPEAT
    >>> sax_via_window([[1, 2, 3], [4, 5, 6], [7, 8, 9]], win_size=2, paa_size=2, sax_type='repeat', nr_strategy=None)['ab']
    [0, 1]
    >>> sax_via_window([[1, 2, 3], [4, 5, 6], [7, 8, 9]], win_size=1, paa_size=1, sax_type='repeat', nr_strategy=None)['a']
    [0, 1, 2]

    # SAX-INDEPENDENT
    >>> sax_via_window([[1, 2, 3, 4], [4, 5, 6, 7]], win_size=2, paa_size=2, sax_type='independent', nr_strategy=None)['acacacac']
    [0]
    >>> sax_via_window([[1, 2], [4, 5], [7, 8]], win_size=2, paa_size=2, sax_type='independent', nr_strategy=None)['acac']
    [0, 1]
    >>> sax_via_window([[1, 2], [4, 8], [7, 5]], win_size=2, paa_size=2, sax_type='independent', nr_strategy=None)['acac']
    [0]
    >>> sax_via_window([[1, 2], [4, 8], [7, 5]], win_size=2, paa_size=2, sax_type='independent', nr_strategy=None)['acca']
    [1]
    """
    # Convert to numpy array.
    series = np.array(series)

    # Check on dimensions.
    if len(series.shape) > 2:
        raise ValueError('Please reshape time-series to stack dimensions along the 2nd dimension, '
                         'so that the array shape is a 2-tuple.')

    # PAA size is the length of the PAA sequence.
    if sax_type != 'energy' and paa_size > win_size:
        raise ValueError('PAA size cannot be greater than the window size.')

    if sax_type == 'energy' and len(series.shape) == 1:
        raise ValueError('Must pass a multidimensional time-series to SAX-ENERGY.')

    # Breakpoints.
    cuts = cuts_for_asize(alphabet_size)

    # Dictionary mapping SAX words to indices.
    sax = defaultdict(list)

    if sax_type == 'repeat':
        # Maps indices to multi-dimensional SAX words.
        multidim_sax_dict = []

        # List of all the multi-dimensional SAX words.
        multidim_sax_list = []

        # Sliding window across time dimension.
        for i in range(series.shape[0] - win_size + 1):
            # Subsection starting at this index.
            sub_section = series[i: i + win_size]

            # Z-normalized subsection.
            if win_size == 1:
                zn = sub_section
            else:
                zn = znorm(sub_section, znorm_threshold)

            # PAA representation of subsection.
            paa_rep = paa(zn, paa_size, 'repeat')

            # SAX representation of subsection, but in terms of multi-dimensional vectors.
            multidim_sax = get_sax_list(paa_rep, cuts)

            # Update data-structures.
            multidim_sax_dict.append(multidim_sax)
            multidim_sax_list.extend(multidim_sax)

        # Cluster with k-means++.
        kmeans = KMeans(n_clusters=alphabet_size, random_state=0).fit(multidim_sax_list)

        # Cluster indices in sorted order.
        order = np.lexsort(np.rot90(kmeans.cluster_centers_))

        # Sliding window across time dimension.
        prev_word = ''
        for i in range(series.shape[0] - win_size + 1):
            # Map cluster indices to new SAX letters.
            curr_word_list = map(lambda cluster_index: idx2letter(order[cluster_index]),
                                 kmeans.predict(multidim_sax_dict[i]))
            curr_word = ''.join(curr_word_list)

            if '' != prev_word:
                if 'exact' == nr_strategy and prev_word == curr_word:
                    continue
                elif 'mindist' == nr_strategy and is_mindist_zero(prev_word, curr_word):
                    continue

            prev_word = curr_word
            sax[curr_word].append(i)

    else:
        # Sliding window across time dimension.
        prev_word = ''
        for i in range(series.shape[0] - win_size + 1):
            # Subsection starting at this index.
            sub_section = series[i: i + win_size]

            if sax_type == 'energy':
                curr_word = ''
                for energy_dist in sub_section:
                    # Normalize energy distribution.
                    energy_zn = znorm(energy_dist, znorm_threshold)

                    # PAA representation of energy distribution.
                    paa_rep = paa(energy_zn, paa_size, 'unidim')
                    # paa_rep = energy_zn

                    # SAX representation of the energy distribution.
                    energy_word = ts_to_string(paa_rep, cuts)

                    # Add to current word.
                    curr_word += energy_word

            elif sax_type == 'independent':
                curr_word = ''
                for dim in range(sub_section.shape[1]):
                    # Obtain the subsequence restricted to one dimension.
                    one_dimension_sub_section = sub_section[:, dim]

                    # Z-normalized subsection.
                    zn = znorm(one_dimension_sub_section, znorm_threshold)

                    # PAA representation of subsection.
                    paa_rep = paa(zn, paa_size, 'unidim')

                    # Get the SAX word - just a unidimensional SAX.
                    one_dim_word = ts_to_string(paa_rep, cuts)

                    # Add this dimensions' representation to the overall SAX word.
                    curr_word += one_dim_word

            else:
                # Z-normalized subsection.
                zn = znorm(sub_section, znorm_threshold)

                # PAA representation of subsection.
                paa_rep = paa(zn, paa_size, sax_type)

                # SAX representation of subsection.
                curr_word = ts_to_string(paa_rep, cuts)

            if '' != prev_word:
                if 'exact' == nr_strategy and prev_word == curr_word:
                    continue
                elif 'mindist' == nr_strategy and is_mindist_zero(prev_word, curr_word):
                    continue

            prev_word = curr_word
            sax[curr_word].append(i)

    return sax
print(sys.argv[1].strip().split(','))
flag = ''
if sys.argv[1] != '' and sys.argv[1] != 'a':
    index_to_plot = np.array(sys.argv[1].strip().split(','))
    index_to_plot = index_to_plot.astype(int)
else:
    flag = 'a'

series = np.genfromtxt('open_prices', delimiter='\n',
                       missing_values='null', filling_values=0)
all_series = np.asfarray(np.split(series, 57), float)
all_series = all_series[:, :-1]  # removing the last element 'null' from all series (n=255-1=254)
for i in range(0, 57):
    all_series[i] = znorm(all_series[i])
print(all_series)

with open('open_prices', 'r') as f:
    cur_index = 0
    # print(index_to_plot)
    counter = 0  # Used to iterate through the multiple TS indices that need to be plotted, passed as arguments
    # print(all_series)
    print(all_series.shape)
    if flag == 'a':
        for series in all_series:
            plt.plot(series, label=str(cur_index))
    else:
        for series in all_series:
            if counter < len(index_to_plot) and cur_index == index_to_plot[counter]:
                print(znorm(series))
fig, ax = plt.subplots(5, 3)
plotcounter = 1
to_plot = []
to_label_legend = []
fig = plt.figure(figsize=(25, 10))
for i in range(0, len(index_to_plot)):
    if index_to_plot[i] != -100:  # -100 because after adjustment to make 0-indexed, -99 becomes -100
        to_plot.append(index_to_plot[i])
        # print(index_to_plot[i])
        to_label_legend.append(labels[index_to_plot[i]])
    else:
        fig.add_subplot(5, 3, plotcounter)
        for series in all_series:
            if counter < len(to_plot) and cur_index == to_plot[counter]:
                # print(series)
                # print(znorm(series))
                plt.plot(znorm(series[:-1]))
                counter += 1
            cur_index += 1
        cur_index = 0
        plt.legend(to_label_legend, fontsize=5)
        # plt.title('Plot '+str(plotcounter))
        counter = 0
        plotcounter += 1
        to_plot = []
        to_label_legend = []
plt.savefig('Plots/diff_znorm_comparison_nested_' + str(P) + '_' + linkage_method +
            '_w=' + str(w) + '_a=' + str(a) + '_lsh_limit=' + str(lsh_limit))
plt.show()
from keras.layers import Dense, Bidirectional, LSTM, TimeDistributed
from keras.optimizers import Adam

##########
hist = yf.download(tickers="DJI", period='max')

words = []
dow_df = ent.util_pattern_space(hist_sma, lag=1, dim=50)
dow_df = dow_df[:]
for i in range(len(dow_df)):
    dat_znorm = znorm(dow_df[i, :])
    dat_paa = paa(dat_znorm, 3)
    word = ts_to_string(dat_paa, cuts_for_asize(2))
    words.append(word)
print(words)
print(collections.Counter(words))

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
sqn = le.fit_transform(words)
nb_classes = len(np.unique(sqn))
from keras.utils import to_categorical
def discretise(data, number_of_bins):
    return ts_to_string(znorm(data), cuts_for_asize(number_of_bins))
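# Quick illustrative call to discretise() above; the input array is a made-up
# example. With 3 bins the z-normed ramp should map low values to 'a', middle
# values to 'b' and high values to 'c' (roughly 'aabcc').
import numpy as np

print(discretise(np.array([1.0, 2.0, 3.0, 4.0, 5.0]), number_of_bins=3))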
def find_best_discord_hotsax(series, win_size, a_size, paa_size,
                             znorm_threshold, globalRegistry):  # noqa: C901
    """Find the best discord with hotsax."""
    """
    [1.0] get the sax data first
    Convert the time series into a SAX dictionary
    (key: word string, value: list of window start indices)
    """
    sax_none = sax_via_window(series, win_size, a_size, paa_size, "none", 0.01)

    """
    [2.0] build the 'magic' array
    magic_array: a list of tuples (word string, number of window indices)
    """
    magic_array = list()
    for k, v in sax_none.items():
        magic_array.append((k, len(v)))

    """
    [2.1] sort it ascending by the number of occurrences
    """
    m_arr = sorted(magic_array, key=lambda tup: tup[1])

    """
    [3.0] define the key vars
    bestSoFarPosition: window start index corresponding to bestSoFarDistance;
        that window is the discord (anomalous subsequence) of the time series
    bestSoFarDistance: max(min(distance)); for each window we compute its minimum
        distance to the other windows, then take the maximum over those minima
    """
    bestSoFarPosition = -1
    bestSoFarDistance = 0.

    distanceCalls = 0

    visit_array = np.zeros(len(series), dtype=int)

    """[4.0] and we are off iterating over the magic array entries"""
    for entry in m_arr:

        """[5.0] some moar of teh vars"""
        curr_word = entry[0]
        # occurrences: list of window start indices for the current word
        occurrences = sax_none[curr_word]

        """
        [6.0] jumping around by the same word occurrences makes it easier to
        nail down the possibly small distance value
        -- so we can be efficient and all that...
        """
        # curr_pos: start index of the current window
        for curr_pos in occurrences:

            # if already in globalRegistry, skip this position
            if curr_pos in globalRegistry:
                continue

            """[7.0] we don't want an overlapping subsequence"""
            # avoid overlapping subsequences
            mark_start = curr_pos - win_size
            mark_end = curr_pos + win_size
            # we look for the window most similar to the current one (smallest
            # distance); visit_set holds window start indices already examined
            visit_set = set(range(mark_start, mark_end))

            """[8.0] here is our subsequence in question"""
            # cur_seq: z-normalized subsequence
            cur_seq = znorm(series[curr_pos:(curr_pos + win_size)],
                            znorm_threshold)

            """[9.0] let's see what is NN distance"""
            # nn_dist: minimum distance between the current window and any other
            # window (the two windows must not overlap or be adjacent?)
            nn_dist = np.inf
            # flag: whether to fall back to random search
            do_random_search = 1

            """[10.0] ordered by occurrences search first"""
            # jumping between occurrences of the same word first makes it easier
            # to find a small distance quickly
            for next_pos in occurrences:

                """[11.0] skip bad pos"""
                # skip overlapping subsequences
                if next_pos in visit_set:
                    continue
                else:
                    visit_set.add(next_pos)

                """[12.0] distance we compute"""
                dist = euclidean(cur_seq,
                                 znorm(series[next_pos:(next_pos + win_size)],
                                       znorm_threshold))
                distanceCalls += 1

                """[13.0] keep the books up-to-date"""
                # update nn_dist
                if dist < nn_dist:
                    nn_dist = dist
                if dist < bestSoFarDistance:
                    do_random_search = 0
                    break

            """[13.0] if not broken above, we shall proceed with random search"""
            # the loop above finished without an early break, so do random search
            if do_random_search:
                """[14.0] build that random visit order array"""
                curr_idx = 0
                for i in range(0, (len(series) - win_size)):
                    # why not len(series) - win_size + 1?
                    # window start index not visited above
                    if not (i in visit_set):
                        # add it to visit_array
                        visit_array[curr_idx] = i
                        curr_idx += 1
                # at this point curr_idx is the number of start indices not
                # visited above; shuffle them
                it_order = np.random.permutation(visit_array[0:curr_idx])
                curr_idx -= 1

                """[15.0] and go random"""
                while curr_idx >= 0:
                    # randomly chosen window start index it_order[curr_idx]
                    rand_pos = it_order[curr_idx]
                    curr_idx -= 1

                    dist = euclidean(cur_seq,
                                     znorm(series[rand_pos:(rand_pos + win_size)],
                                           znorm_threshold))
                    distanceCalls += 1

                    """[16.0] keep the books up-to-date again"""
                    # update nn_dist
                    if dist < nn_dist:
                        nn_dist = dist
                    if dist < bestSoFarDistance:
                        nn_dist = dist
                        break

            """[17.0] and BIGGER books"""
            # update bestSoFarDistance and bestSoFarPosition
            if (nn_dist > bestSoFarDistance) and (nn_dist < np.inf):
                bestSoFarDistance = nn_dist
                bestSoFarPosition = curr_pos

    return (bestSoFarPosition, bestSoFarDistance)
def start_splitting(self, p_value: int, max_level: int, good_leaf_nodes: list(), bad_leaf_nodes: list()):
    """
    Splitting Node Naive algorithm (k, P) Anonymity
    :param p_value:
    :param max_level:
    :param paa_value:
    :return:
    """
    # logger.info("good_leaf_nodes: {}, bad_leaf_nodes: {}".format(len(good_leaf_nodes), len(bad_leaf_nodes)))
    if self.size < p_value:
        logger.info("size:{}, p_value:{} == bad-leaf".format(self.size, p_value))
        self.label = "bad-leaf"
        bad_leaf_nodes.append(self)
        return
    if self.level == max_level:
        logger.info("size:{}, p_value:{} == good-leaf".format(self.size, p_value))
        self.label = "good-leaf"
        good_leaf_nodes.append(self)
        return
    if p_value <= self.size < 2 * p_value:
        logger.info("Maximize-level, size:{}, p_value:{} == good-leaf".format(self.size, p_value))
        self.maximize_level_node(max_level)
        self.label = "good-leaf"
        good_leaf_nodes.append(self)
        return
    """
    Otherwise, we need to check if node N has to be split. The checking relies on a tentative
    split performed on N. Suppose that, by increasing the level of N, N is tentatively split
    into a number of child nodes. If all these child nodes contain fewer than P time series,
    no real split is performed and the original node N is labeled as good-leaf and the recursion
    terminates on N. Otherwise, there must exist tentative child node(s) whose size >= P, also
    called TG-node(s) (Tentative Good Nodes). The rest of the children whose size < P are called
    TB-nodes (Tentative Bad Nodes), if any. If the total number of records in all TB-nodes under
    N is no less than P, we merge them into a single tentative node, denoted by childmerge, at
    the level of N.level. If the above tentative process produces nc tentative child nodes
    (including TB and TG) and nc >= 2, N will really be split into nc children and then the node
    splitting procedure will be recursively invoked on each of them.
    """
    tentative_child_node = dict()
    temp_level = self.level + 1
    for key, value in self.group.items():
        # to reduce dimensionality
        data = np.array(value)
        data_znorm = znorm(data)
        data_paa = paa(data_znorm, self.paa_value)
        pr = ts_to_string(data_paa, cuts_for_asize(temp_level))
        if pr in tentative_child_node.keys():
            tentative_child_node[pr].append(key)
        else:
            tentative_child_node[pr] = [key]
    length_all_tentative_child = [len(x) for x in list(tentative_child_node.values())]
    good_leaf = np.all(np.array(length_all_tentative_child) < p_value)

    if good_leaf:
        logger.info("Good-leaf, all_tentative_child are < {}".format(p_value))
        self.label = "good-leaf"
        good_leaf_nodes.append(self)
        return
    else:
        logger.info("N can be split")
        logger.info("Compute tentative good nodes and tentative bad nodes")
        # tentative good nodes:
        # indices of nodes in tentative_child_node whose size is >= p_value
        pr_keys = list(tentative_child_node.keys())
        # get indices of tentative good nodes
        pattern_representation_tg = list()
        tg_nodes_index = list(np.where(np.array(length_all_tentative_child) >= p_value)[0])
        # logger.info(pr_keys)
        tg_nodes = list()
        for index in tg_nodes_index:
            keys_elements = tentative_child_node[pr_keys[index]]
            dict_temp = dict()
            for key in keys_elements:
                dict_temp[key] = self.group[key]
            tg_nodes.append(dict_temp)
            pattern_representation_tg.append(pr_keys[index])

        # tentative bad nodes
        tb_nodes_index = list(np.where(np.array(length_all_tentative_child) < p_value)[0])
        tb_nodes = list()
        pattern_representation_tb = list()
        for index in tb_nodes_index:
            keys_elements = tentative_child_node[pr_keys[index]]
            dict_temp = dict()
            for key in keys_elements:
                dict_temp[key] = self.group[key]
            tb_nodes.append(dict_temp)
            pattern_representation_tb.append(pr_keys[index])

        total_size_tb_nodes = 0
        for tb_node in tb_nodes:
            total_size_tb_nodes += len(tb_node)

        if total_size_tb_nodes >= p_value:
            logger.info("Merge all bad nodes in a single node, and label it as good-leaf")
            child_merge_node_group = dict()
            for tb_node in tb_nodes:
                for key, value in tb_node.items():
                    child_merge_node_group[key] = value
            node_merge = Node(level=self.level, pattern_representation=self.pattern_representation,
                              label="good-leaf", group=child_merge_node_group, parent=self)
            self.child_node.append(node_merge)
            good_leaf_nodes.append(node_merge)

            nc = len(tg_nodes) + len(tb_nodes)  # tb_nodes: a bit unsure about counting tb_nodes here
            logger.info("Split only tg_nodes {0}".format(len(tg_nodes)))
            if nc >= 2:
                for index in range(0, len(tg_nodes)):
                    node = Node(level=self.level, pattern_representation=pattern_representation_tg[index],
                                label="intermediate", group=tg_nodes[index], parent=self)
                    self.child_node.append(node)
                    node.start_splitting(p_value, max_level, good_leaf_nodes, bad_leaf_nodes)
            else:
                for index in range(0, len(tg_nodes)):
                    node = Node(level=self.level, pattern_representation=pattern_representation_tg[index],
                                label="good-leaf", group=tg_nodes[index], parent=self)
                    self.child_node.append(node)
                    good_leaf_nodes.append(node)
        else:
            nc = len(tg_nodes) + len(tb_nodes)  # tb_nodes: a bit unsure about counting tb_nodes here
            logger.info("Label all tb_node {0} as bad-leaf and split only tg_nodes {1}".format(len(tb_nodes), len(tg_nodes)))
            for index in range(0, len(tb_nodes)):
                node = Node(level=self.level, pattern_representation=pattern_representation_tb[index],
                            label="bad-leaf", group=tb_nodes[index], parent=self)
                self.child_node.append(node)
                bad_leaf_nodes.append(node)
            if nc >= 2:
                for index in range(0, len(tg_nodes)):
                    node = Node(level=self.level, pattern_representation=pattern_representation_tg[index],
                                label="intermediate", group=tg_nodes[index], parent=self)
                    self.child_node.append(node)
                    node.start_splitting(p_value, max_level, good_leaf_nodes, bad_leaf_nodes)
            else:
                for index in range(0, len(tg_nodes)):
                    node = Node(level=self.level, pattern_representation=pattern_representation_tg[index],
                                label="good-leaf", group=tg_nodes[index], parent=self)
                    self.child_node.append(node)
                    good_leaf_nodes.append(node)
def sax_by_chunking(series, paa_size, alphabet_size=3, z_threshold=0.01):
    """Simple chunking conversion implementation."""
    paa_rep = paa(znorm(series, z_threshold), paa_size)
    cuts = cuts_for_asize(alphabet_size)
    return ts_to_string(paa_rep, cuts)
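# A minimal sketch of sax_by_chunking() above; the series and sizes are
# illustrative assumptions.
import numpy as np

series = np.array([0.0, 0.2, 0.1, 1.5, 1.7, 1.6, -1.2, -1.4, -1.3])
word = sax_by_chunking(series, paa_size=3, alphabet_size=3)
print(word)  # one SAX word of length paa_size for the whole series, e.g. 'bca'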
def find_best_discord_brute_force(series, win_size, global_registry,
                                  z_threshold=0.01):
    """Early-abandoned distance-based discord discovery."""
    best_so_far_distance = -1.0
    best_so_far_index = -1

    outerRegistry = global_registry.clone()

    # randomly pick an unvisited index
    outer_idx = outerRegistry.get_next_unvisited()

    # loop while outer_idx is not NaN (~ is bitwise NOT)
    while ~np.isnan(outer_idx):

        # mark outer_idx as visited
        outerRegistry.mark_visited(outer_idx)

        # z-normalize the candidate subsequence: start index outer_idx,
        # end index outer_idx + win_size - 1
        candidate_seq = znorm(series[outer_idx:(outer_idx + win_size)], z_threshold)

        # minimum distance between candidate_seq and any subsequence whose start
        # index is at least win_size away (the smaller the distance, the more
        # similar the shapes)
        nnDistance = np.inf
        # why not len(series) - win_size + 1 ???
        innerRegistry = VisitRegistry(len(series) - win_size)

        inner_idx = innerRegistry.get_next_unvisited()

        # iterate over all start indices; among subsequences at least win_size
        # away, find the nearest-neighbour distance nnDistance to candidate_seq
        while ~np.isnan(inner_idx):
            innerRegistry.mark_visited(inner_idx)

            # only if inner_idx is more than win_size away from outer_idx, i.e.
            # the two subsequences neither overlap nor are adjacent
            if abs(inner_idx - outer_idx) > win_size:
                curr_seq = znorm(series[inner_idx:(inner_idx + win_size)], z_threshold)
                # early-abandoned Euclidean distance between the two z-normalized subsequences
                dist = early_abandoned_dist(candidate_seq, curr_seq, nnDistance)
                # update nnDistance, shrinking it gradually
                if (~np.isnan(dist)) and (dist < nnDistance):
                    nnDistance = dist

            inner_idx = innerRegistry.get_next_unvisited()

        # update best_so_far_distance and best_so_far_index
        """
        best_so_far_distance: max(min(distance)); the nearest-neighbour distance
        of the least-similar subsequence
        best_so_far_index: start index of the discord, i.e. the subsequence of
        the series that is least similar to all other subsequences
        """
        if ~(np.inf == nnDistance) and (nnDistance > best_so_far_distance):
            best_so_far_distance = nnDistance
            best_so_far_index = outer_idx

        outer_idx = outerRegistry.get_next_unvisited()

    return (best_so_far_index, best_so_far_distance)