def maximize_level_node(self, max_level):
    """
    Try to maximize the level value
    :param max_level: maximum SAX alphabet level allowed
    :return:
    """
    values_group = list(self.group.values())
    original_level = self.level
    equal = True
    while equal and self.level < max_level:
        temp_level = self.level + 1
        data = np.array(values_group[0])
        data_znorm = znorm(data)
        data_paa = paa(data_znorm, self.paa_value)
        pr = ts_to_string(data_paa, cuts_for_asize(temp_level))
        for index in range(1, len(values_group)):
            data = np.array(values_group[index])
            data_znorm = znorm(data)
            data_paa = paa(data_znorm, self.paa_value)
            pr_2 = ts_to_string(data_paa, cuts_for_asize(temp_level))
            if pr_2 != pr:
                equal = False
        if equal:
            self.level = temp_level
    if original_level != self.level:
        logger.info("New level for node: {}".format(self.level))
        data = np.array(values_group[0])
        data_znorm = znorm(data)
        data_paa = paa(data_znorm, self.paa_value)
        self.pattern_representation = ts_to_string(data_paa, cuts_for_asize(self.level))
    else:
        logger.info("Can't split again, max level already reached")
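# Illustration (not from the original source): a minimal standalone sketch of
# the level-raising idea above. Re-discretizing the same PAA output with a
# larger alphabet (a higher "level") yields a finer pattern representation.
# Data and parameter values are invented.
import numpy as np
from saxpy.alphabet import cuts_for_asize
from saxpy.paa import paa
from saxpy.sax import ts_to_string
from saxpy.znorm import znorm

series = np.array([1.0, 2.0, 4.0, 8.0, 6.0, 3.0, 2.0, 1.0])
data_paa = paa(znorm(series), 4)
for level in (2, 3, 4):
    # cuts_for_asize(level) returns the Gaussian breakpoints for that alphabet size
    print(level, ts_to_string(data_paa, cuts_for_asize(level)))
    # prints: 2 abba / 3 acca / 4 adca for these values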
def sax_via_window(series, win_size, paa_size, alphabet_size=3,
                   nr_strategy='exact', z_threshold=0.01):
    """Simple via window conversion implementation."""
    cuts = cuts_for_asize(alphabet_size)
    sax = defaultdict(list)

    prev_word = ''

    for i in range(0, len(series) - win_size):
        sub_section = series[i:(i + win_size)]
        zn = znorm(sub_section, z_threshold)
        paa_rep = paa(zn, paa_size)
        curr_word = ts_to_string(paa_rep, cuts)

        if '' != prev_word:
            if 'exact' == nr_strategy and prev_word == curr_word:
                continue
            elif 'mindist' == nr_strategy and\
                    is_mindist_zero(prev_word, curr_word):
                continue

        prev_word = curr_word
        sax[curr_word].append(i)

    return sax
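# Illustration (not from the original source): hypothetical usage of
# sax_via_window above on synthetic data. Assumes the same imports the
# function relies on (znorm, paa, ts_to_string, cuts_for_asize,
# is_mindist_zero, defaultdict) are in scope.
import numpy as np

np.random.seed(0)
series = np.sin(np.linspace(0, 8 * np.pi, 256)) + np.random.normal(0, 0.2, 256)
word_index = sax_via_window(series, win_size=32, paa_size=4, alphabet_size=3)
for word in sorted(word_index)[:3]:
    print(word, word_index[word][:5])  # SAX word -> starting offsets of its windows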
def apply_adaptive_sax(ts, win_size, paa_size, alphabet_size, z_threshold):
    """
    This function applies the SAX transformation to a 1-dim time series using adaptive break-points
    :param ts: 1-dim time series
    :type ts: 1D array
    :param win_size: size of the sliding window that generates each sax word
    :type win_size: int
    :param paa_size: number of characters in a single sax word
    :type paa_size: int
    :param alphabet_size: number of unique characters to use in the sax representation
    :type alphabet_size: int
    :param z_threshold: z_threshold for the znorm method from saxpy
    :type z_threshold: float
    :return: the sax sequence, a list of strings, where each string represents a single sax word
    :rtype: list of str
    """
    sax_sequence = []
    cuts = cuts_for_asize(alphabet_size)
    for t in range(0, len(ts) - win_size + 1):
        ts_win = ts[t:(t + win_size)]
        ts_win_znormed = znorm(ts_win, z_threshold)
        paa_rep = paa(ts_win_znormed, paa_size)
        sax_word = ts_to_string(paa_rep, cuts)
        sax_sequence.append(sax_word)
    return sax_sequence
def saxrepresentation(matrix):
    result = []
    for ts in matrix.T:
        ts_znorm = znorm(ts)
        dat_paa_3 = paa(ts_znorm, 3)
        word = ts_to_string(dat_paa_3, cuts_for_asize(3))
        result.append(word)
    return result
def ppa_representation(data, seq_len):
    paa_segment = int(data.shape[0] / seq_len)
    data_reduced = np.zeros(shape=(paa_segment, data.shape[1]))
    for i in tqdm(range(data.shape[1])):
        dat_znorm = znorm(data[:, i])
        data_reduced[:, i] = paa(dat_znorm, paa_segment)
    return data_reduced
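# Illustration (not from the original source): hypothetical call of
# ppa_representation above. A (1000, 3) array with seq_len=10 comes back as
# (100, 3): each column is z-normalized, then PAA-compressed by a factor of
# 10. Assumes numpy, tqdm and the saxpy functions used by the function are in
# scope.
import numpy as np

data = np.random.randn(1000, 3)
reduced = ppa_representation(data, seq_len=10)
print(reduced.shape)  # (100, 3)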
def row_pattern_loss(row: np.ndarray, pr: Tuple[str, int]):
    pattern = []
    cuts = cuts_for_asize(pr[1] + 1)[1:]  # drop the leading -inf breakpoint
    for c in pr[0]:
        n = ord(c) - 97  # letter index: 'a' -> 0, 'b' -> 1, ...
        pattern.append(cuts[n])
    if len(pattern) != len(row):
        normalized_row = paa(znorm(row), len(pattern))
    else:
        normalized_row = znorm(row)
    return distance(normalized_row, pattern)
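# Illustration (not from the original source): a worked sketch of the
# letter-to-breakpoint lookup used by row_pattern_loss. With an alphabet of
# size pr[1] = 3, cuts_for_asize(4)[1:] leaves three finite breakpoints, and
# ord(c) - 97 maps 'a' -> 0, 'b' -> 1, 'c' -> 2 into them.
from saxpy.alphabet import cuts_for_asize

cuts = cuts_for_asize(4)[1:]  # three finite breakpoints
for c in "abba":
    print(c, cuts[ord(c) - 97])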
def update_features(self, data):
    self.hidden = self.state_manager.get_hidden_state([data])
    activation = self.get_activation()
    dat_znorm = (activation[:, self.indices] - self.mean) / self.std
    dat_znorm = [paa(item, self.seq_len) for item in dat_znorm]
    features = [
        tuple(ts_to_string(item, cuts_for_asize(self.symbols)))
        for item in dat_znorm
    ]
    for feature in features:
        if feature in self.feature:
            index = self.feature.index(feature)
            self.covered_dict[index] = True
def fitness(self, hidden, sym):
    activation = self.get_activations(hidden)
    dat_znorm = Z_ScoreNormalization(
        activation[:, self.testObjective.indices],
        self.testObjective.mean, self.testObjective.std)
    dat_znorm = [
        paa(item, self.testObjective.seq_len) for item in dat_znorm
    ]
    cuts = cuts_for_asize(self.testObjective.symbols)
    cuts = np.append(cuts, np.array([np.inf]))
    sym_size = len(sym)
    out = np.array([
        self.cal_fittness_seq(cuts, sym_size, sym, series)
        for series in dat_znorm
    ])
    return out
def SAX(sequence: np.ndarray, alphabet_size: int, length: int = 0) -> str:
    """
    Computes SAX string of a sequence of numbers with specified alphabet size.
    Length of the output string may be specified; length 0 will generate a
    string as long as the sequence.
    """
    debug("Calculating SAX of {}, with alphabet of size {}".format(
        sequence, alphabet_size))
    if alphabet_size == 1:
        if length == 0:
            return "a" * len(sequence)
        else:
            return "a" * length
    else:
        if length == 0 or length == len(sequence):
            return ts_to_string(znorm(sequence), cuts_for_asize(alphabet_size))
        else:
            return ts_to_string(paa(znorm(sequence), length),
                                cuts_for_asize(alphabet_size))
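# Illustration (not from the original source): hypothetical calls of SAX
# above. 'debug' is stubbed with print since the original module's logger is
# not shown here; assumes the saxpy helpers used by SAX are in scope.
import numpy as np

debug = print  # stub for the module's logger (assumption)
print(SAX(np.array([1.0, 2.0, 3.0, 4.0]), alphabet_size=1))            # 'aaaa'
print(SAX(np.array([1.0, 2.0, 3.0, 4.0]), alphabet_size=3, length=2))  # 'ac'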
def extract_features(song_name):
    # returns mfcc and chroma features in SAX representation
    try:
        x, fs = librosa.load(song_name)
    except Exception:
        return None
    mfccs = librosa.feature.mfcc(x, sr=fs, n_mfcc=39)
    chroma = librosa.feature.chroma_stft(x, sr=fs)
    feature_matrix = np.concatenate((mfccs, chroma))
    sax_rep = [
        ts_to_string(paa(znorm(feat), SAX_VOCAB_LENGTH),
                     cuts_for_asize(SAX_VOCAB_LENGTH))
        for feat in feature_matrix
    ]
    return sax_rep
def sax_via_window(series, win_size, paa_size, alphabet_size=3,
                   nr_strategy='exact', z_threshold=0.01):
    """Simple via window conversion implementation."""
    # generate the alphabet cuts for the specified size
    cuts = cuts_for_asize(alphabet_size)
    # initialize the sax result
    sax = defaultdict(list)

    prev_word = ''

    for i in range(0, len(series) - win_size):
        # sub-section of the series covered by the current window
        sub_section = series[i:(i + win_size)]
        # z-normalize
        zn = znorm(sub_section, z_threshold)
        # PAA piecewise aggregation: reduce the sub-section to paa_size dimensions
        paa_rep = paa(zn, paa_size)
        # convert the PAA sequence to a string
        curr_word = ts_to_string(paa_rep, cuts)

        if '' != prev_word:
            if 'exact' == nr_strategy and prev_word == curr_word:
                continue
            elif 'mindist' == nr_strategy and\
                    is_mindist_zero(prev_word, curr_word):
                continue

        prev_word = curr_word
        sax[curr_word].append(i)

    return sax
def update_features(self, hidden, test_num):
    activation = self.get_activations(hidden)
    dat_znorm = Z_ScoreNormalization(
        activation[:, self.testObjective.indices],
        self.testObjective.mean, self.testObjective.std)
    dat_znorm = [
        paa(item, self.testObjective.seq_len) for item in dat_znorm
    ]
    features = [
        tuple(ts_to_string(item, cuts_for_asize(self.testObjective.symbols)))
        for item in dat_znorm
    ]
    self.cov_count += 1
    for feature in features:
        if feature in self.testObjective.feature:
            self.cov_count = 0
            self.testObjective.feature.remove(feature)
            self.testObjective.covered_feature.append(feature)
            del self.testObjective.test_record[feature]
    self.coverage = 1 - len(self.testObjective.feature) / self.testObjective.originalNumOfFeature
    cov_fitness = np.array([
        self.fitness(hidden, listElem)
        for listElem in self.testObjective.feature
    ])
    cov_index = np.min(cov_fitness, axis=1)
    cov_fitness = np.argmin(cov_fitness, axis=1)
    for idx, feature in enumerate(self.testObjective.feature):
        test_record = self.testObjective.test_record[feature]
        if test_record is None or test_record[1] > cov_fitness[idx]:
            self.testObjective.test_record[feature] = list(
                [test_num + cov_index[idx], cov_fitness[idx]])
    self.displayCoverage()
def znorm_paa_sax(time_series, alpha, w=3, missing='z'):
    """Takes an array containing real values, z-normalizes, reduces
    dimensionality to w, and finally returns a SAX representation over an
    alphabet of size alpha.

    time_series: array holding a time series of one measurement for one patient
    w: the dimensionality to reduce to using PAA; set to len(time_series) in plain
    alpha: the number of discretized segments that the SAX rep will reflect;
        set to 2, 3 or 5 in plain using the RDS algo
    """
    # If time_series is a string, convert it to list format, e.g. 'abc' -> ['a', 'b', 'c'],
    # since that is the structure required below.
    if isinstance(time_series, str):
        time_series = list(time_series)
    if len(time_series) > 0:
        # normalize one time series, passed as a numpy array (np.array([]))
        normalized_time_series = znorm(np.array(time_series))
        # dimensionality reduction of the time series according to w
        paa_norm_time_series = paa(normalized_time_series, w)
        # turn the discretized and reduced time series into a sequence of characters
        return ts_to_string(paa_norm_time_series, cuts_for_asize(alpha))
    else:
        return missing
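# Illustration (not from the original source): hypothetical calls of
# znorm_paa_sax above with invented values -- a nine-point series reduced to
# w=3 segments over a 3-letter alphabet, plus the missing-data path for an
# empty series. Assumes numpy and the saxpy helpers are in scope.
print(znorm_paa_sax([1, 1, 2, 3, 5, 8, 13, 21, 34], alpha=3, w=3))  # -> 'abc'
print(znorm_paa_sax([], alpha=3))                                   # -> 'z'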
def process_cell(matrix, cell, progress_indicator):
    alphabet_size = cell[0]
    paa_division_integer = cell[1]
    progression = 0 if progress_indicator == -1 else progress_indicator
    print(progression)
    # Download or use downloaded weather data files
    time_avg_temp_arr = pm.get_weather_data_from_files()
    # Get ground truth for the slope computed by the lin. regression model for all weather data files
    ground_truth_arr = pm.get_ground_truth_values(time_avg_temp_arr)
    # Mean array of periods
    mean_period_arr = []
    # Get flux, time, duration and ground truth for the i'th tuple
    ground_truth_slope = ground_truth_arr[progression]
    time_avg_temp_tuple = time_avg_temp_arr[progression]
    time = time_avg_temp_tuple[0]
    norm_avg_temp = time_avg_temp_tuple[1]
    dat_size = norm_avg_temp.size

    # PAA transformation procedure
    # Determine the number of PAA points from the data size divided by the
    # paa_division_integer (number of points per segment)
    paa_points = int(dat_size / paa_division_integer)
    # PAA transformation of data
    PAA_array = paa(norm_avg_temp, paa_points)
    PAA_array = np.asarray(PAA_array, dtype=np.float32)

    # SAX conversion
    # Get breakpoints to convert segments into a SAX string
    breakPointsArray = pm.getBreakPointsArray(PAA_array, alphabet_size)
    sax_output = ts_to_string(PAA_array, breakPointsArray)

    # Convert to numeric SAX representation
    numericSaxConversionArray = pm.getNumericSaxArray(breakPointsArray)
    numeric_SAX_temp = []
    for symbol_index in range(len(sax_output)):
        letter_represented_as_int = pm.getAlfabetToNumericConverter(
            sax_output[symbol_index], numericSaxConversionArray)
        numeric_SAX_temp.append(letter_represented_as_int)
    numeric_SAX_temp = np.asarray(numeric_SAX_temp, dtype=np.float32)
    numeric_SAX_time = time

    # Repeat each element in the array x times, where x is the number of PAA points
    repeated_x_array = np.repeat(numeric_SAX_time, paa_points)
    # How many elements each list should have
    n = int(len(repeated_x_array) / paa_points)
    final_x_array = []
    lists = list(pm.divide_array_in_chunks(repeated_x_array, n))
    # Take the mean of all chunks
    for l in lists:
        final_x_array.append(np.mean(l))
    numeric_SAX_time = final_x_array

    # Compute linear regression model
    slope, intercept, r_value, p_value, std_err = stats.linregress(
        numeric_SAX_time, numeric_SAX_temp)
    # Error in percentage between the computed slope and the ground truth slope
    ground_truth_error = (abs(slope - ground_truth_slope) / ground_truth_slope) * 100

    # Update mean periods array
    if progression == 0:
        matrix[alphabet_size - MIN_SAX][paa_division_integer - MIN_PAA] = ground_truth_error
    else:
        # Update the running mean for this particular parameter combination
        current_value = matrix[alphabet_size - MIN_SAX][paa_division_integer - MIN_PAA]
        matrix[alphabet_size - MIN_SAX][paa_division_integer - MIN_PAA] = (
            current_value * progression + ground_truth_error) / (progression + 1)
def sax_via_window(series, win_size, paa_size, alphabet_size=3,
                   nr_strategy='exact', znorm_threshold=0.01, sax_type='unidim'):
    """Simple via window conversion implementation.

    # SAX-ENERGY
    >>> sax_via_window([[1, 2, 3], [4, 5, 6]], win_size=1, paa_size=3, sax_type='energy', nr_strategy=None)['abc']
    [0, 1]

    >>> sax_via_window([[1, 2, 3, 4], [4, 5, 6, 7]], win_size=1, paa_size=4, sax_type='energy', nr_strategy=None)['aacc']
    [0, 1]

    >>> sax_via_window([[1, 2, 3, 4], [4, 5, 6, 7]], win_size=2, paa_size=4, sax_type='energy', nr_strategy=None)['aaccaacc']
    [0]

    # SAX-REPEAT
    >>> sax_via_window([[1, 2, 3], [4, 5, 6], [7, 8, 9]], win_size=2, paa_size=2, sax_type='repeat', nr_strategy=None)['ab']
    [0, 1]

    >>> sax_via_window([[1, 2, 3], [4, 5, 6], [7, 8, 9]], win_size=1, paa_size=1, sax_type='repeat', nr_strategy=None)['a']
    [0, 1, 2]

    # SAX-INDEPENDENT
    >>> sax_via_window([[1, 2, 3, 4], [4, 5, 6, 7]], win_size=2, paa_size=2, sax_type='independent', nr_strategy=None)['acacacac']
    [0]

    >>> sax_via_window([[1, 2], [4, 5], [7, 8]], win_size=2, paa_size=2, sax_type='independent', nr_strategy=None)['acac']
    [0, 1]

    >>> sax_via_window([[1, 2], [4, 8], [7, 5]], win_size=2, paa_size=2, sax_type='independent', nr_strategy=None)['acac']
    [0]

    >>> sax_via_window([[1, 2], [4, 8], [7, 5]], win_size=2, paa_size=2, sax_type='independent', nr_strategy=None)['acca']
    [1]
    """
    # Convert to numpy array.
    series = np.array(series)

    # Check on dimensions.
    if len(series.shape) > 2:
        raise ValueError('Please reshape time-series to stack dimensions along the 2nd dimension, so that the array shape is a 2-tuple.')

    # PAA size is the length of the PAA sequence.
    if sax_type != 'energy' and paa_size > win_size:
        raise ValueError('PAA size cannot be greater than the window size.')

    if sax_type == 'energy' and len(series.shape) == 1:
        raise ValueError('Must pass a multidimensional time-series to SAX-ENERGY.')

    # Breakpoints.
    cuts = cuts_for_asize(alphabet_size)

    # Dictionary mapping SAX words to indices.
    sax = defaultdict(list)

    if sax_type == 'repeat':
        # Maps indices to multi-dimensional SAX words.
        multidim_sax_dict = []

        # List of all the multi-dimensional SAX words.
        multidim_sax_list = []

        # Sliding window across time dimension.
        for i in range(series.shape[0] - win_size + 1):
            # Subsection starting at this index.
            sub_section = series[i: i + win_size]

            # Z-normalized subsection.
            if win_size == 1:
                zn = sub_section
            else:
                zn = znorm(sub_section, znorm_threshold)

            # PAA representation of subsection.
            paa_rep = paa(zn, paa_size, 'repeat')

            # SAX representation of subsection, but in terms of multi-dimensional vectors.
            multidim_sax = get_sax_list(paa_rep, cuts)

            # Update data-structures.
            multidim_sax_dict.append(multidim_sax)
            multidim_sax_list.extend(multidim_sax)

        # Cluster with k-means++.
        kmeans = KMeans(n_clusters=alphabet_size, random_state=0).fit(multidim_sax_list)

        # Cluster indices in sorted order.
        order = np.lexsort(np.rot90(kmeans.cluster_centers_))

        # Sliding window across time dimension.
        prev_word = ''
        for i in range(series.shape[0] - win_size + 1):
            # Map cluster indices to new SAX letters.
            curr_word_list = map(lambda cluster_index: idx2letter(order[cluster_index]),
                                 kmeans.predict(multidim_sax_dict[i]))
            curr_word = ''.join(curr_word_list)

            if '' != prev_word:
                if 'exact' == nr_strategy and prev_word == curr_word:
                    continue
                elif 'mindist' == nr_strategy and is_mindist_zero(prev_word, curr_word):
                    continue

            prev_word = curr_word
            sax[curr_word].append(i)

    else:
        # Sliding window across time dimension.
        prev_word = ''
        for i in range(series.shape[0] - win_size + 1):
            # Subsection starting at this index.
            sub_section = series[i: i + win_size]

            if sax_type == 'energy':
                curr_word = ''
                for energy_dist in sub_section:
                    # Normalize energy distribution.
                    energy_zn = znorm(energy_dist, znorm_threshold)

                    # PAA representation of energy distribution.
                    paa_rep = paa(energy_zn, paa_size, 'unidim')
                    # paa_rep = energy_zn

                    # SAX representation of the energy distribution.
                    energy_word = ts_to_string(paa_rep, cuts)

                    # Add to current word.
                    curr_word += energy_word

            elif sax_type == 'independent':
                curr_word = ''
                for dim in range(sub_section.shape[1]):
                    # Obtain the subsequence restricted to one dimension.
                    one_dimension_sub_section = sub_section[:, dim]

                    # Z-normalized subsection.
                    zn = znorm(one_dimension_sub_section, znorm_threshold)

                    # PAA representation of subsection.
                    paa_rep = paa(zn, paa_size, 'unidim')

                    # Get the SAX word -- just a unidimensional SAX.
                    one_dim_word = ts_to_string(paa_rep, cuts)

                    # Add this dimension's representation to the overall SAX word.
                    curr_word += one_dim_word

            else:
                # Z-normalized subsection.
                zn = znorm(sub_section, znorm_threshold)

                # PAA representation of subsection.
                paa_rep = paa(zn, paa_size, sax_type)

                # SAX representation of subsection.
                curr_word = ts_to_string(paa_rep, cuts)

            if '' != prev_word:
                if 'exact' == nr_strategy and prev_word == curr_word:
                    continue
                elif 'mindist' == nr_strategy and is_mindist_zero(prev_word, curr_word):
                    continue

            prev_word = curr_word
            sax[curr_word].append(i)

    return sax
def _compute_1_dim_str_rep(self, segment_ts, dim):
    cuts = self._cuts[dim]
    paa_rep = paa(segment_ts, paa_segments=1)
    letter = ts_to_string(paa_rep, cuts)
    return letter
import collections
import numpy as np
import yfinance as yf
from keras.optimizers import Adam
from saxpy.alphabet import cuts_for_asize
from saxpy.paa import paa
from saxpy.sax import ts_to_string
from saxpy.znorm import znorm

##########

hist = yf.download(tickers="DJI", period='max')

words = []
# NOTE: 'hist_sma' and 'ent' are defined earlier in the original script;
# 'hist_sma' is presumably a smoothed (moving-average) version of 'hist'.
dow_df = ent.util_pattern_space(hist_sma, lag=1, dim=50)
dow_df = dow_df[:]
for i in range(len(dow_df)):
    dat_znorm = znorm(dow_df[i, :])
    dat_paa = paa(dat_znorm, 3)
    word = ts_to_string(dat_paa, cuts_for_asize(2))
    words.append(word)
print(words)
print(collections.Counter(words))

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
sqn = le.fit_transform(words)
nb_classes = len(np.unique(sqn))

from keras.utils import to_categorical
onehot = to_categorical(sqn)
def start_splitting(self, p_value: int, max_level: int, good_leaf_nodes: list, bad_leaf_nodes: list):
    """
    Splitting Node Naive algorithm (k, P) Anonymity
    :param p_value:
    :param max_level:
    :param good_leaf_nodes:
    :param bad_leaf_nodes:
    :return:
    """
    # logger.info("good_leaf_nodes: {}, bad_leaf_nodes: {}".format(len(good_leaf_nodes), len(bad_leaf_nodes)))
    if self.size < p_value:
        logger.info("size:{}, p_value:{} == bad-leaf".format(self.size, p_value))
        self.label = "bad-leaf"
        bad_leaf_nodes.append(self)
        return

    if self.level == max_level:
        logger.info("size:{}, p_value:{} == good-leaf".format(self.size, p_value))
        self.label = "good-leaf"
        good_leaf_nodes.append(self)
        return

    if p_value <= self.size < 2 * p_value:
        logger.info("Maximize-level, size:{}, p_value:{} == good-leaf".format(self.size, p_value))
        self.maximize_level_node(max_level)
        self.label = "good-leaf"
        good_leaf_nodes.append(self)
        return

    """
    Otherwise, we need to check if node N has to be split. The checking relies on a tentative split
    performed on N. Suppose that, by increasing the level of N, N is tentatively split into a number of
    child nodes. If all these child nodes contain fewer than P time series, no real split is performed
    and the original node N is labeled as good-leaf and the recursion terminates on N. Otherwise, there
    must exist tentative child node(s) whose size >= P, also called TG-node(s) (Tentative Good Nodes).
    The rest of the children, whose size < P, are called TB-nodes (Tentative Bad Nodes), if any. If the
    total number of records in all TB-nodes under N is no less than P, we merge them into a single
    tentative node, denoted by childmerge, at the level of N.level. If the above tentative process
    produces nc tentative child nodes (including TB and TG) and nc >= 2, N will really be split into nc
    children and then the node splitting procedure will be recursively invoked on each of them.
    """
    tentative_child_node = dict()
    temp_level = self.level + 1
    for key, value in self.group.items():
        # to reduce dimensionality
        data = np.array(value)
        data_znorm = znorm(data)
        data_paa = paa(data_znorm, self.paa_value)
        pr = ts_to_string(data_paa, cuts_for_asize(temp_level))
        if pr in tentative_child_node.keys():
            tentative_child_node[pr].append(key)
        else:
            tentative_child_node[pr] = [key]
    length_all_tentative_child = [len(x) for x in list(tentative_child_node.values())]
    good_leaf = np.all(np.array(length_all_tentative_child) < p_value)

    if good_leaf:
        logger.info("Good-leaf, all_tentative_child are < {}".format(p_value))
        self.label = "good-leaf"
        good_leaf_nodes.append(self)
        return
    else:
        logger.info("N can be split")
        logger.info("Compute tentative good nodes and tentative bad nodes")
        # tentative good nodes: entries of tentative_child_node with at least p_value records
        pr_keys = list(tentative_child_node.keys())
        # get indices of tentative good nodes
        pattern_representation_tg = list()
        tg_nodes_index = list(np.where(np.array(length_all_tentative_child) >= p_value)[0])
        # logger.info(pr_keys)
        tg_nodes = list()
        for index in tg_nodes_index:
            keys_elements = tentative_child_node[pr_keys[index]]
            dict_temp = dict()
            for key in keys_elements:
                dict_temp[key] = self.group[key]
            tg_nodes.append(dict_temp)
            pattern_representation_tg.append(pr_keys[index])
        # tentative bad nodes
        tb_nodes_index = list(np.where(np.array(length_all_tentative_child) < p_value)[0])
        tb_nodes = list()
        pattern_representation_tb = list()
        for index in tb_nodes_index:
            keys_elements = tentative_child_node[pr_keys[index]]
            dict_temp = dict()
            for key in keys_elements:
                dict_temp[key] = self.group[key]
            tb_nodes.append(dict_temp)
            pattern_representation_tb.append(pr_keys[index])

        total_size_tb_nodes = 0
        for tb_node in tb_nodes:
            total_size_tb_nodes += len(tb_node)
        if total_size_tb_nodes >= p_value:
            logger.info("Merge all bad nodes in a single node, and label it as good-leaf")
            child_merge_node_group = dict()
            for tb_node in tb_nodes:
                for key, value in tb_node.items():
                    child_merge_node_group[key] = value
            node_merge = Node(level=self.level, pattern_representation=self.pattern_representation,
                              label="good-leaf", group=child_merge_node_group, parent=self)
            self.child_node.append(node_merge)
            good_leaf_nodes.append(node_merge)
            nc = len(tg_nodes) + len(tb_nodes)  # I'm a bit unsure about counting these tb_nodes
            logger.info("Split only tg_nodes {0}".format(len(tg_nodes)))
            if nc >= 2:
                for index in range(0, len(tg_nodes)):
                    node = Node(level=self.level, pattern_representation=pattern_representation_tg[index],
                                label="intermediate", group=tg_nodes[index], parent=self)
                    self.child_node.append(node)
                    node.start_splitting(p_value, max_level, good_leaf_nodes, bad_leaf_nodes)
            else:
                for index in range(0, len(tg_nodes)):
                    node = Node(level=self.level, pattern_representation=pattern_representation_tg[index],
                                label="good-leaf", group=tg_nodes[index], parent=self)
                    self.child_node.append(node)
                    good_leaf_nodes.append(node)
        else:
            nc = len(tg_nodes) + len(tb_nodes)  # I'm a bit unsure about counting these tb_nodes
            logger.info("Label all tb_node {0} as bad-leaf and split only tg_nodes {1}".format(len(tb_nodes), len(tg_nodes)))
            for index in range(0, len(tb_nodes)):
                node = Node(level=self.level, pattern_representation=pattern_representation_tb[index],
                            label="bad-leaf", group=tb_nodes[index], parent=self)
                self.child_node.append(node)
                bad_leaf_nodes.append(node)
            if nc >= 2:
                for index in range(0, len(tg_nodes)):
                    node = Node(level=self.level, pattern_representation=pattern_representation_tg[index],
                                label="intermediate", group=tg_nodes[index], parent=self)
                    self.child_node.append(node)
                    node.start_splitting(p_value, max_level, good_leaf_nodes, bad_leaf_nodes)
            else:
                for index in range(0, len(tg_nodes)):
                    node = Node(level=self.level, pattern_representation=pattern_representation_tg[index],
                                label="good-leaf", group=tg_nodes[index], parent=self)
                    self.child_node.append(node)
                    good_leaf_nodes.append(node)
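# Illustration (not from the original source): a compact standalone sketch of
# the tentative-split test used by start_splitting above. Each series is
# re-discretized one level deeper and grouped by its pattern representation;
# the split is real only if at least one group reaches size P. Data and
# parameters are invented; the Node machinery is omitted.
import numpy as np
from saxpy.alphabet import cuts_for_asize
from saxpy.paa import paa
from saxpy.sax import ts_to_string
from saxpy.znorm import znorm

group = {  # series id -> raw values
    "s1": [1, 2, 3, 4], "s2": [1, 2, 3, 5],
    "s3": [4, 3, 2, 1], "s4": [5, 3, 2, 1],
}
p_value, paa_value, temp_level = 2, 2, 3
tentative_child_node = {}
for key, value in group.items():
    pr = ts_to_string(paa(znorm(np.array(value)), paa_value), cuts_for_asize(temp_level))
    tentative_child_node.setdefault(pr, []).append(key)
print(tentative_child_node)  # {'ac': ['s1', 's2'], 'ca': ['s3', 's4']}
# all groups below p_value -> good-leaf; otherwise the node can be split
print(all(len(v) < p_value for v in tentative_child_node.values()))  # False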
def main(argv):
    # load configuration
    parameters = load_configuration()

    # load parameters
    # dataset
    path_to_dataset = parameters['path_to_dataset']
    load_size = parameters['load_size']
    # SAX
    alphabet_size = parameters['alphabet_size']
    paa_size = parameters['paa_size']
    window_size = parameters['window_size']
    step = parameters['step']
    substring_size = parameters['substring_size']
    # smoothing
    threshold_freq = parameters['threshold_freq']
    # projections
    prj_size = parameters['prj_size']
    prj_iterations = parameters['prj_iterations']
    anomaly_threshold = parameters['anomaly_threshold']

    # loading data
    loader = DataLoader.DataLoader(path_to_dataset)
    data = DataTypes.Data()
    # loader.load_all(data, 200)
    loader.load_subset(data, load_size, 100)

    # period from which to extract anomalies
    begin_date = datetime.datetime.fromtimestamp(data.index_to_time[0])
    end_date = datetime.datetime.fromtimestamp(data.index_to_time[load_size - 1])

    if parameters['power_type'] == -1:
        tank = parameters['tank']
        sensor_type = parameters['sensor_type']
        # print(data.measures[0])
        print("Loading of %i tank %i data from %s to %s " % (sensor_type, tank, begin_date, end_date))
        s_values = [
            data.measures[i][0][tank][sensor_type]
            for i in range(0, len(data.measures))
        ]
    else:
        power_type = parameters['power_type']
        print("Loading measures of power %i from %s to %s " % (power_type, begin_date, end_date))
        s_values = [
            data.measures[i][1][power_type]
            for i in range(0, len(data.measures))
        ]

    len_serie = len(s_values)
    hash_table_substrings = {}
    # getting the first n alphabet letters
    alphabet = get_alphabet_letters(alphabet_size)
    # creating a hash table indexed by all substrings of length k
    hash_table_substrings = get_hash_table(alphabet, prj_size)
    # list containing the score for each window
    anomalies_score = []

    for index in range(0, len_serie, step):
        begin = index
        end = begin + window_size
        if end < len_serie:
            window_values = s_values[begin:end]
            # z-normalize the values inside the current window
            window_znorm = znorm(np.asarray(window_values))
            window_paa = paa(window_znorm, paa_size)
            window_string = ts_to_string(window_paa, cuts_for_asize(alphabet_size))
            # each character of the string corresponds to k values of the series
            k = window_size // paa_size
            # get smoothed string
            window_smoothed = smoothing(window_string, threshold_freq)
            # fill the hash table by applying random projection
            hash_table_substrings = put_in_bucket(hash_table_substrings, window_smoothed, begin,
                                                  prj_iterations, prj_size, substring_size, k)

            total = 0
            for key, values in hash_table_substrings.items():
                total = total + len(values)

            buckets_with_anomalies, bucket_freq = analyzed_bucket(
                hash_table_substrings, total, anomaly_threshold)
            # number of buckets with anomalies
            n_buckets_anomalies = len(buckets_with_anomalies.keys())
            # getting the score for the current window
            avg_window_score = getting_score(hash_table_substrings, buckets_with_anomalies,
                                             n_buckets_anomalies)
            anomalies_score.append(avg_window_score)
            # reset table
            hash_table_substrings = get_hash_table(alphabet, prj_size)
        else:
            break
    print(anomalies_score)
with open(sys.argv[1], 'r') as h:
    lines = h.readlines()

DATA = []
time_series = []
for line in lines:
    line = line.strip()
    if line != 'null' and line != '\n':
        time_series.append(float(line))
    else:
        DATA.append(time_series)
        time_series = []

# NOTE: 'w' (PAA size), 'a' (alphabet size) and 'sax_words' are defined
# earlier in the original script.
for data in DATA:
    data = np.asfarray(data, float)
    data = np.diff(data)
    data_znorm = znorm(data)
    data_paa = paa(data_znorm, w)
    sax_words.append(ts_to_string(data_paa, cuts_for_asize(a)))

# sax_words_ri = []
# i = 0
# for word in sax_words:
#     perms = set([''.join(p) for p in permutations(word)])
#     sax_words_ri.append(perms)
#     i += 1

# Write all SAX words to file only once instead of generating again and again
with open('sax_words_ri_norot_w=' + str(w) + '_a=' + str(a), 'w+') as sax_words_file:
    for l in sax_words:
        sax_words_file.write(l + '\n')
        # sax_words_file.write('\n')
def sax_by_chunking(series, paa_size, alphabet_size=3, z_threshold=0.01):
    """Simple chunking conversion implementation."""
    paa_rep = paa(znorm(series, z_threshold), paa_size)
    cuts = cuts_for_asize(alphabet_size)
    return ts_to_string(paa_rep, cuts)
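# Illustration (not from the original source): hypothetical contrast with the
# windowed variants above -- chunking condenses the whole series into a single
# paa_size-letter word instead of one word per window position. Assumes numpy
# and the saxpy helpers used by sax_by_chunking are in scope.
import numpy as np

series = np.array([2.0, 0.0, 1.0, -1.0, 3.0, 5.0, 4.0, 6.0])
print(sax_by_chunking(series, paa_size=4, alphabet_size=3))  # -> 'aacc'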
def visualize_p_anonymized_nodes(nodes_list: List[Node]):
    # https://stackoverflow.com/questions/50161140/how-to-plot-a-time-series-array-with-confidence-intervals-displayed-in-python
    # https://stackoverflow.com/questions/14720331/how-to-generate-random-colors-in-matplotlib
    pr_dict: Dict[str, int] = {}
    number_of_pr: int = 0
    for node in nodes_list:
        if node.pr not in pr_dict and node.pr != "a" * node.pr_len():
            pr_dict[node.pr] = number_of_pr
            number_of_pr += 1
    pr_cmap = get_cmap(len(pr_dict) + 1)

    n = len(nodes_list[0].table[0])
    if nodes_list[0].pr_len() != n:
        paa_linspace = np.linspace(0, n - 1, (2 * nodes_list[0].pr_len() + 1))
        paa_positions = paa_linspace[1::2]
        for node in nodes_list:
            if node.pr != "a" * node.pr_len():
                node_color = pr_cmap(pr_dict[node.pr])
                marker_alpha = 1
                line_alpha = 0.4
            else:
                node_color = "grey"
                marker_alpha = 0.5
                line_alpha = 0.2
            for row in node.table:
                plt.plot(range(n), row, color=node_color, label=node.pr, alpha=line_alpha)
                plt.plot(paa_positions, paa(row, node.pr_len()), color=node_color, label="",
                         linestyle='', marker="_", markeredgewidth=2, markersize=10,
                         alpha=marker_alpha)
                plt.plot(paa_positions, paa(row, node.pr_len()), color=node_color, label="",
                         linestyle=':', alpha=0.5)
    else:
        for node in nodes_list:
            if node.pr != "a" * node.pr_len():
                node_color = pr_cmap(pr_dict[node.pr])
                line_alpha = 1
            else:
                node_color = "grey"
                line_alpha = 0.5
            for row in node.table:
                plt.plot(range(n), row, color=node_color, label=node.pr, alpha=line_alpha)

    fontP = FontProperties()
    fontP.set_size('xx-small')
    # https://stackoverflow.com/questions/13588920/stop-matplotlib-repeating-labels-in-legend
    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    plt.legend(by_label.values(), by_label.keys(), loc='upper left', ncol=3, prop=fontP)
    plt.title("p-anonymization")
    plt.yscale("symlog")
    plt.show()
    return
def PAA_aggregation(v):
    dat_znorm = znorm(v)
    r = paa(dat_znorm, len(v))
    # r = znorm(v)
    print("MIN MAX", min(r), max(r))
    return r
def recycle_bad_leaves(p_value, good_leaf_nodes, bad_leaf_nodes, suppressed_nodes, paa_value):
    """
    Recycle bad-leaves phase
    :param bad_leaf_nodes: list of bad-leaf nodes to recycle
    """
    bad_leaf_nodes_dict = dict()
    for node in bad_leaf_nodes:
        if node.level in bad_leaf_nodes_dict.keys():
            bad_leaf_nodes_dict[node.level].append(node)
        else:
            bad_leaf_nodes_dict[node.level] = [node]

    bad_leaf_nodes_size = sum([node.size for node in bad_leaf_nodes])
    if bad_leaf_nodes_size >= p_value:
        # max bad level
        current_level = max(bad_leaf_nodes_dict.keys())
        while bad_leaf_nodes_size >= p_value:
            if current_level in bad_leaf_nodes_dict.keys():
                merge_dict = dict()
                keys_to_be_removed = list()
                merge = False
                for current_level_node in bad_leaf_nodes_dict[current_level]:
                    pr_node = current_level_node.pattern_representation
                    if pr_node in merge_dict.keys():
                        merge = True
                        merge_dict[pr_node].append(current_level_node)
                        if pr_node in keys_to_be_removed:
                            keys_to_be_removed.remove(pr_node)
                    else:
                        merge_dict[pr_node] = [current_level_node]
                        keys_to_be_removed.append(pr_node)
                if merge:
                    for k in keys_to_be_removed:
                        del merge_dict[k]
                    for pr, node_list in merge_dict.items():
                        group = dict()
                        for node in node_list:
                            bad_leaf_nodes_dict[current_level].remove(node)
                            group.update(node.group)
                        if current_level > 1:
                            level = current_level
                        else:
                            level = 1
                        leaf_merge = Node(level=level, pattern_representation=pr,
                                          group=group, paa_value=paa_value)
                        if leaf_merge.size >= p_value:
                            leaf_merge.label = "good-leaf"
                            good_leaf_nodes.append(leaf_merge)
                            bad_leaf_nodes_size -= leaf_merge.size
                        else:
                            leaf_merge.label = "bad-leaf"
                            bad_leaf_nodes_dict[current_level].append(leaf_merge)
                temp_level = current_level - 1
                for node in bad_leaf_nodes_dict[current_level]:
                    if temp_level > 1:
                        values_group = list(node.group.values())
                        data = np.array(values_group[0])
                        data_znorm = znorm(data)
                        data_paa = paa(data_znorm, paa_value)
                        pr = ts_to_string(data_paa, cuts_for_asize(temp_level))
                    else:
                        pr = "a" * paa_value
                    node.level = temp_level
                    node.pattern_representation = pr
                if current_level > 0:
                    if temp_level not in bad_leaf_nodes_dict.keys():
                        bad_leaf_nodes_dict[temp_level] = bad_leaf_nodes_dict.pop(current_level)
                    else:
                        bad_leaf_nodes_dict[temp_level] = (bad_leaf_nodes_dict[temp_level]
                                                           + bad_leaf_nodes_dict.pop(current_level))
                current_level -= 1
            else:
                break
        # suppress the remaining series
        remaining_bad_leaf_nodes = list(bad_leaf_nodes_dict.values())[0]
        for node in remaining_bad_leaf_nodes:
            suppressed_nodes.append(node)
def process_cell(matrix, cell, progress_indicator):
    alphabet_size = cell[0]
    paa_division_integer = cell[1]
    progression = 0 if progress_indicator == -1 else progress_indicator
    # Download or use downloaded lightcurve files
    time_flux_tuple_arr = pm.get_lightcurve_data()
    # Get ground truth values for all lightcurves with autocorrelation
    ground_truth_arr = pm.get_ground_truth_values(time_flux_tuple_arr)
    # Transform durations from the exoplanet archive from hours to days
    actual_duration_arr = [
        3.88216 / 24, 2.36386 / 24, 3.98235 / 24, 4.56904 / 24,
        3.60111 / 24, 5.16165 / 24, 3.19843 / 24
    ]  # kepler-2,3,4,5,6,7,8 https://exoplanetarchive.ipac.caltech.edu/cgi-bin/TblView/nph-tblView?app=ExoTbls&config=cumulative
    # Mean array of periods
    mean_period_arr = []

    # Calculate matrix values for all lightcurves
    # Get flux, time, duration and ground truth for the i'th tuple
    ground_truth_period = ground_truth_arr[progression]
    actual_duration = actual_duration_arr[progression]
    time_flux_tuple = time_flux_tuple_arr[progression]
    time = time_flux_tuple[0]
    norm_fluxes = time_flux_tuple[1]
    dat_size = norm_fluxes.size

    # Find the period for each parameter combination alphabet_size/paa_division_integer of SAX
    # PAA transformation procedure
    # Determine the number of PAA points from the data size divided by the
    # paa_division_integer (number of points per segment)
    paa_points = int(dat_size / paa_division_integer)
    # PAA transformation of data
    PAA_array = paa(norm_fluxes, paa_points)
    PAA_array = np.asarray(PAA_array, dtype=np.float32)

    # SAX conversion
    # Get breakpoints to convert segments into a SAX string
    breakPointsArray = pm.getBreakPointsArray(PAA_array, alphabet_size)
    sax_output = ts_to_string(PAA_array, breakPointsArray)

    # Convert to numeric SAX representation
    numericSaxConversionArray = pm.getNumericSaxArray(breakPointsArray)
    numeric_SAX_flux = []
    for symbol_index in range(len(sax_output)):
        letter_represented_as_int = pm.getAlfabetToNumericConverter(
            sax_output[symbol_index], numericSaxConversionArray)
        numeric_SAX_flux.append(letter_represented_as_int)
    numeric_SAX_flux = np.asarray(numeric_SAX_flux, dtype=np.float32)
    numeric_SAX_time = time

    # Repeat each element in the array x times, where x is the number of PAA points
    repeated_x_array = np.repeat(numeric_SAX_time, paa_points)
    # How many elements each list should have
    n = int(len(repeated_x_array) / paa_points)
    final_x_array = []
    lists = list(pm.divide_array_in_chunks(repeated_x_array, n))
    # Take the mean of all chunks
    for l in lists:
        final_x_array.append(np.mean(l))
    numeric_SAX_time = final_x_array

    # BoxLeastSquares applied to the numeric SAX representation
    BLS = BoxLeastSquares(numeric_SAX_time, numeric_SAX_flux)
    periodogram = BLS.autopower(actual_duration)
    # Find the period with the highest power in the periodogram
    best_period = np.argmax(periodogram.power)
    period = periodogram.period[best_period]
    # Add the error in percentage between the best period and the ground truth to the period array
    ground_truth_error = (abs(period - ground_truth_period) / ground_truth_period) * 100

    # Update matrix
    if progression == 0:
        matrix[alphabet_size - MIN_SAX][paa_division_integer - MIN_PAA] = ground_truth_error
    else:
        # Update the running mean for this particular parameter combination
        current_value = matrix[alphabet_size - MIN_SAX][paa_division_integer - MIN_PAA]
        matrix[alphabet_size - MIN_SAX][paa_division_integer - MIN_PAA] = (
            current_value * progression + ground_truth_error) / (progression + 1)
import numpy as np
import pandas as pd
import yfinance as yf
from collections import Counter
from tqdm import tqdm
from saxpy.alphabet import cuts_for_asize
from saxpy.paa import paa
from saxpy.sax import ts_to_string
from saxpy.znorm import znorm

df = []
for year in range(1994, 2019):
    hist = yf.download(tickers="SPY",
                       start="{}-01-01".format(year),
                       end="{}-12-31".format(year))
    close = hist["Close"]
    df.append(close)

words = []
for year in tqdm(df):
    # dat = ent.util_granulate_time_series(year, scale=3)
    dat_znorm = znorm(year)
    dat_paa = paa(dat_znorm, 10)
    word = ts_to_string(dat_paa, cuts_for_asize(5))
    words.append(word)
print(words)

Counter(words)

years = np.arange(1994, 2019)
frame = pd.DataFrame()
frame["Year"] = years
frame["Word"] = words
print(frame)

from fuzzywuzzy import fuzz