#data = load_data('chlorine')

#inserted anomaly
#data[1500:2000, 0:4] = 0.7

#data = data[:,:10]
#data = D['data']

## Miss out low-valued TS
#mask = data.mean(axis=0) > 50
#data = data[:, mask]

#data = load_ts_data('isp_routers', 'full')

# Z-score data
data = zscore(data)
#data = zscore_win(data, 250)

# Fix NaNs
whereAreNaNs = np.isnan(data)
data[whereAreNaNs] = 0

# Old rank adaptation - thresholds
e_high = 0.99
e_low = 0.90
alpha = 0.98

# New rank adaptation - EWMA
F_min = 0.92
epsilon = 0.02
EW_mean_alpha = 0.1  # for incremental mean
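# The preprocessing above relies on zscore() and (optionally) zscore_win(),
# which are defined or imported elsewhere in this codebase. The sketches
# below are assumptions about their behaviour, inferred from how they are
# called here; they are not the original implementations.
import numpy as np

def zscore(data):
    """Scale each column (stream) to zero mean and unit standard deviation."""
    mu = data.mean(axis=0)
    sigma = data.std(axis=0)
    sigma = np.where(sigma == 0, 1.0, sigma)  # guard against flat streams
    return (data - mu) / sigma

def zscore_win(data, win):
    """Sliding-window z-score: normalise each sample against the mean/std of
    the trailing `win` samples (assumed behaviour, based on the call site)."""
    out = np.zeros_like(data, dtype=float)
    for t in range(data.shape[0]):
        seg = data[max(0, t - win + 1):t + 1]
        mu = seg.mean(axis=0)
        sigma = seg.std(axis=0)
        sigma = np.where(sigma == 0, 1.0, sigma)
        out[t] = (data[t] - mu) / sigma
    return out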
import string

def SAX(data, alphabet_size, word_size, minstd=1.0, pre_normed=False):
    """Return one SAX word for each data stream.

    word_size     == number of segments the data is split into for PAA
    alphabet_size == number of symbols used
    """
    num_streams = data.shape[1]

    # Check for near-stationary (low-variance) streams: these are skipped and
    # later assigned the middle symbol of the alphabet.
    mask = data.std(axis=0) < minstd
    passed = np.invert(mask)

    if np.any(mask):
        # Scale data to have a mean of 0 and a standard deviation of 1.
        if not pre_normed:
            data[:, passed] = zscore(data[:, passed])
        symbol4skips = string.ascii_letters[int(np.ceil(alphabet_size / 2.))]
    else:
        # Scale data to have a mean of 0 and a standard deviation of 1.
        if not pre_normed:
            data = zscore(data)

    # Calculate our breakpoint locations (np.inf closes the last interval).
    breakpoints = bp_lookup(alphabet_size)
    breakpoints = np.concatenate((breakpoints, np.array([np.inf])))

    # Split the data into a list of word_size pieces.
    data = np.array_split(data, word_size, axis=0)

    # Predefine matrices: one single-character field per segment plus an i2
    # field holding the frequency of the resulting word.
    segment_means = np.zeros((word_size, num_streams))
    #segment_symbol = np.zeros((word_size, num_streams), dtype=np.str)
    p_array = np.zeros((num_streams,), dtype=('U1,' * word_size + 'i2'))
    p_dict = {}

    # Calculate the mean of each segment.
    for i in range(word_size):
        segment_means[i, passed] = data[i][:, passed].mean(axis=0)

    # Work out which breakpoint interval each segment mean falls into.
    for i in range(num_streams):
        for j in range(word_size):
            if passed[i]:
                idx = int(np.where(breakpoints > segment_means[j, i])[0][0])
                # Store in phrase array
                p_array[i][j] = string.ascii_letters[idx]
            else:
                p_array[i][j] = symbol4skips
        # Store in phrase dict
        phrase = ''.join(tuple(p_array[i])[:word_size])
        if phrase in p_dict:
            p_dict[phrase].append(i)
        else:
            p_dict[phrase] = [i]

    # Put the frequency of each pattern in the last field of p_array.
    for vals in p_dict.values():
        count = len(vals)
        for i in range(count):
            p_array[vals[i]][-1] = count

    return p_array, p_dict, segment_means
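# SAX() above calls bp_lookup(), which is not shown in this excerpt. The
# sketch below is an assumption about what it returns: the alphabet_size - 1
# breakpoints that cut the standard normal distribution into equiprobable
# regions, the standard choice for SAX. The exact implementation in the
# original code may differ. The commented usage example underneath is
# illustrative only; the stream count, alphabet and word sizes are made up.
from scipy.stats import norm

def bp_lookup(alphabet_size):
    """Breakpoints giving equiprobable regions under a standard Gaussian."""
    probs = np.arange(1, alphabet_size) / float(alphabet_size)
    return norm.ppf(probs)

# Example: 5 random-walk streams, an 8-symbol alphabet and 4-segment words.
#streams = np.cumsum(np.random.randn(400, 5), axis=0)
#p_array, p_dict, seg_means = SAX(streams, alphabet_size=8, word_size=4)
# p_dict maps each SAX word to the list of stream indices that produced it.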
#inserted anomaly
data[1500:2000, 10:20] = 0.0

#data = data[:,:20]
#data = D['data']

## Miss out low-valued TS
#mask = data.mean(axis=0) > 50
#data = data[:, mask]

#data = load_ts_data('isp_routers', 'mid')

# Z-score data
data = zscore(data)
#data = zscore_win(data, 250)

# Fix NaNs
whereAreNaNs = np.isnan(data)
data[whereAreNaNs] = 0

# Old rank adaptation - thresholds
e_high = 0.99
e_low = 0.94
alpha = 0.96

# New rank adaptation - EWMA
F_min = 0.9
epsilon = 0.05
EW_mean_alpha = 0.1  # for incremental mean