def ComputeSax(sample_data, sample_data2):
    """Compare two recordings column by column using SAX words.

    SAX (Symbolic Aggregate approXimation) is described in
    http://www.cs.ucr.edu/~eamonn/SAX.pdf

    Columns 1 to 4 of each input are encoded as SAX strings and the two
    strings for the same column are compared with ``SAX.compare_strings``.
    Column 0 is never read (presumably a time/index column -- confirm
    against the caller).

    Args:
        sample_data: table exposing ``.values`` (e.g. a pandas DataFrame)
            with at least five columns.
        sample_data2: second table with the same layout.

    Returns:
        A list of four comparison scores, one per column 1..4, in order.
    """
    # `.values` is available on every pandas version, whereas
    # `DataFrame.as_matrix()` was removed in pandas 1.0.
    sample_data = sample_data.values
    sample_data2 = sample_data2.values

    # PARAMETERS:
    # word_length: the number of PAA segments representing the time series,
    #   i.e. the length of the string standing for the series (this is the
    #   dimensionality reduction step).
    # alphabet_size: number of distinct symbols (e.g. {a, b, c} -> 3).
    downsample_ratio = 200
    # Floor division keeps the word length an integer on Python 3 as well;
    # on Python 2 the result is the same as the former `/`.
    word_length = len(sample_data[:, 1]) // downsample_ratio
    alphabet_size = 7
    s = SAX(word_length, alphabet_size)

    mic_distances = []
    for mic in range(1, 5):
        (x1String, x1Indices) = s.to_letter_rep(sample_data[:, mic])
        (x2String, x2Indices) = s.to_letter_rep(sample_data2[:, mic])
        x1x2ComparisonScore = s.compare_strings(x1String, x2String)
        mic_distances.append(x1x2ComparisonScore)
    return mic_distances
class TestSAX(object):
    """Checks for the SAX encoder: letter conversion and string comparison."""

    def setUp(self):
        # Every test uses 6-letter words over a 5-letter alphabet; the third
        # argument (1e-6) is presumably the epsilon threshold.
        self.sax = SAX(6, 5, 1e-6)

    def test_to_letter_rep(self):
        # A series exactly as long as the word: one letter per value.
        series = [7, 1, 4, 4, 4, 4]
        word, _ = self.sax.to_letter_rep(series)
        assert word == 'eacccc'

    def test_long_to_letter_rep(self):
        # A series longer than the word must still yield six letters.
        series = [
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            6, 6, 6, 6, 10, 100
        ]
        word, _ = self.sax.to_letter_rep(series)
        assert word == 'bbbbce'

    def test_compare_strings(self):
        # The closer word must score strictly lower than the distant one.
        reference = 'aaabbc'
        near_score = self.sax.compare_strings(reference, 'aabbbc')
        far_score = self.sax.compare_strings(reference, 'ccddbc')
        assert near_score < far_score
class TestSAX(object):
    """Checks for the SAX encoder variant whose ``to_letter_rep`` returns
    a (letters, indices, boundaries) triple."""

    def setUp(self):
        # Every test uses 6-letter words over a 5-letter alphabet; the third
        # argument (1e-6) is presumably the epsilon threshold.
        self.sax = SAX(6, 5, 1e-6)

    def test_to_letter_rep(self):
        # A series exactly as long as the word: one letter per value.
        series = [7, 1, 4, 4, 4, 4]
        word, _, _ = self.sax.to_letter_rep(series)
        assert word == 'eacccc'

    def test_long_to_letter_rep(self):
        # A series longer than the word must still yield six letters.
        series = [
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            6, 6, 6, 6, 10, 100
        ]
        word, _, _ = self.sax.to_letter_rep(series)
        assert word == 'bbbbce'

    def test_compare_strings(self):
        # The closer word must score strictly lower than the distant one.
        reference = 'aaabbc'
        near_score = self.sax.compare_strings(reference, 'aabbbc')
        far_score = self.sax.compare_strings(reference, 'ccddbc')
        assert near_score < far_score

    def test_from_letter_rep(self):
        # Encode, decode, and compare against the expected reconstruction.
        series = [7, 1, 4, 4, 4, 4]
        word, positions, bounds = self.sax.to_letter_rep(series)
        rebuilt = self.sax.from_letter_rep(word, positions, bounds)
        expected = [6.21, 1.78, 4.0, 4.0, 4.0, 4.0]
        assert allclose(rebuilt, expected, atol=0.01)

    def test_breakpoints(self):
        # Expected cut points for alphabets of 3, 2 and 20 symbols.
        cases = (
            (3, [-0.43, 0.43]),
            (2, [0]),
            (20, [
                -1.64, -1.28, -1.04, -0.84, -0.67, -0.52, -0.39, -0.25,
                -0.13, 0, 0.13, 0.25, 0.39, 0.52, 0.67, 0.84, 1.04, 1.28,
                1.64
            ]),
        )
        for size, expected in cases:
            assert allclose(self.sax.breakpoints(size), expected, atol=0.01)

    def test_interval_centres(self):
        # Expected interval centres for alphabets of 2, 3 and 30 symbols.
        cases = (
            (2, [-0.67, 0.67]),
            (3, [-0.96, 0.0, 0.96]),
            (30, [
                -2.12, -1.64, -1.38, -1.19, -1.03, -0.90, -0.78, -0.67,
                -0.57, -0.47, -0.38, -0.29, -0.21, -0.12, -0.04, 0.04,
                0.12, 0.21, 0.29, 0.38, 0.47, 0.57, 0.67, 0.78, 0.90,
                1.03, 1.19, 1.38, 1.64, 2.12
            ]),
        )
        for size, expected in cases:
            assert allclose(
                self.sax.interval_centres(size), expected, atol=0.01)
class TestSAX(object):
    """Checks for the SAX encoder, including its handling of NaN gaps."""

    def setUp(self):
        # Every test uses 6-letter words over a 5-letter alphabet; the third
        # argument (1e-6) is presumably the epsilon threshold.
        self.sax = SAX(6, 5, 1e-6)

    def test_to_letter_rep(self):
        # A series exactly as long as the word: one letter per value.
        series = [7, 1, 4, 4, 4, 4]
        word, _ = self.sax.to_letter_rep(series)
        assert word == 'eacccc'

    def test_to_letter_rep_missing(self):
        # A NaN value is expected to surface as '-' in the word.
        series = [7, 1, 4, 4, np.nan, 4]
        word, _ = self.sax.to_letter_rep(series)
        assert word == 'eacc-c'

    def test_long_to_letter_rep(self):
        # A series longer than the word must still yield six letters.
        series = [
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            6, 6, 6, 6, 10, 100
        ]
        word, _ = self.sax.to_letter_rep(series)
        assert word == 'bbbbce'

    def test_long_to_letter_rep_missing(self):
        # Same long series with one NaN: the affected letter becomes '-'.
        series = [
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, np.nan, 1, 1,
            6, 6, 6, 6, 10, 100
        ]
        word, _ = self.sax.to_letter_rep(series)
        assert word == 'bbb-ce'

    def test_compare_strings(self):
        # The closer word must score strictly lower than the distant one.
        reference = 'aaabbc'
        near_score = self.sax.compare_strings(reference, 'aabbbc')
        far_score = self.sax.compare_strings(reference, 'ccddbc')
        assert near_score < far_score

    def test_compare_strings_missing(self):
        # These two words, each with '-' at every other position, score 0.
        score = self.sax.compare_strings('a-b-c-', 'b-c-d-')
        assert score == 0

    def test_normalize_missing(self):
        # Two inputs that must normalize identically, except that the first
        # keeps a NaN where its input had one.
        with_gap = self.sax.normalize([1, 0, 0, 0, 0, 1, np.nan])
        without_gap = self.sax.normalize([1, 0, 0, 0, 0, 1])
        assert np.array_equal(with_gap[:-1], without_gap)
        assert np.isnan(with_gap[-1])

    def test_normalize_under_epsilon(self):
        # This near-constant input is expected to normalize to zeros.
        flattened = self.sax.normalize([1e-7, 2e-7, 1.5e-7])
        assert np.array_equal(flattened, [0, 0, 0])
def _get_SAX_spikes(cls, timeseries, timestamps, treshold):
    """
    Returns spikes by counting how often a datapoint carries the highest
    SAX letter across the sliding windows that cover it.

    A datapoint is reported when it was touched by at least one window,
    every word that touched it had the top alphabet letter at that
    position, and its raw value is above ``treshold``.

    The result maps ``timestamps[i]`` to
    ``cls._get_basic_spike_prio(timeseries[i], treshold)``.

    NOTE(review): the arithmetic below relies on the surrounding code's
    division semantics (this is Python 2 code: xrange / iteritems).
    """
    point_total = len(timeseries)

    # Average seconds between measurements
    retention = (timestamps[-1] - timestamps[0]) / len(timestamps)

    # Number of entries per window
    entries_per_word = cls.WINDOW_SECONDS_COUNT / retention
    num_windows = point_total / entries_per_word
    window_size = point_total / num_windows
    num_symbols = window_size * retention / cls.SECONDS_PER_SYMBOL

    encoder = SAX(wordSize=num_symbols, alphabetSize=cls.ALPHABET_SIZE)
    # Distance, in datapoints, between two consecutive symbols of a word
    step = int(round(cls.SECONDS_PER_SYMBOL / float(retention)))

    # Convert timeseries into SAX notation (third argument is presumably
    # the window overlap -- confirm against SAX.sliding_window)
    words, intervals = encoder.sliding_window(timeseries, num_windows, .8)

    # How many times index i carried the maximal letter
    maximum_count = dict.fromkeys(xrange(point_total), 0)
    # How many times index i was visited by a window symbol
    window_count = dict.fromkeys(xrange(point_total), 0)

    # Count in how many windows a datapoint is a local maximum
    for i, word in enumerate(words):
        window_start = intervals[i][0]
        for j in xrange(len(word)):
            index = j * step + window_start
            if word[j] == string.ascii_lowercase[cls.ALPHABET_SIZE - 1]:
                maximum_count[index] += 1
            window_count[index] += 1

    spikes = {}
    for key, hits in maximum_count.iteritems():
        if hits != window_count[key]:
            continue
        if not hits:
            continue
        if not timeseries[key] > treshold:
            continue
        val = timeseries[key]
        prio = cls._get_basic_spike_prio(val, treshold)
        spikes[timestamps[key]] = prio
    return spikes
def saxify_and_export(df, csvf, alphabet=5):
    """Encode every row of *df* as a SAX word and write the result as CSV.

    Column 0 of each row is read as an integer label; the remaining
    columns form the series, encoded with a word size equal to the
    number of series columns.

    Args:
        df: DataFrame with the label in column 0 and values after it.
        csvf: destination handed to ``DataFrame.to_csv``.
        alphabet: SAX alphabet size.

    Returns:
        Whatever ``DataFrame.to_csv(csvf, index=False)`` returns.
    """
    row_total, col_total = df.shape
    encoder = SAX(col_total - 1, alphabet, 1e-6)

    records = []
    for row in range(row_total):
        series = df.iloc[row, 1:].values.tolist()
        label = int(df.iloc[row, 0])
        word, _ = encoder.to_letter_rep(series)
        records.append({'label': label, 'sax': word})

    frame = pd.DataFrame(records, columns=['label', 'sax'])
    return frame.to_csv(csvf, index=False)
class TestSAX(object):
    """Checks for the SAX encoder, including its handling of NaN gaps."""

    def setUp(self):
        # Every test uses 6-letter words over a 5-letter alphabet; the third
        # argument (1e-6) is presumably the epsilon threshold.
        self.sax = SAX(6, 5, 1e-6)

    def test_to_letter_rep(self):
        # A series exactly as long as the word: one letter per value.
        series = [7, 1, 4, 4, 4, 4]
        word, _ = self.sax.to_letter_rep(series)
        assert word == 'eacccc'

    def test_to_letter_rep_missing(self):
        # A NaN value is expected to surface as '-' in the word.
        series = [7, 1, 4, 4, np.nan, 4]
        word, _ = self.sax.to_letter_rep(series)
        assert word == 'eacc-c'

    def test_long_to_letter_rep(self):
        # A series longer than the word must still yield six letters.
        series = [
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            6, 6, 6, 6, 10, 100
        ]
        word, _ = self.sax.to_letter_rep(series)
        assert word == 'bbbbce'

    def test_long_to_letter_rep_missing(self):
        # Same long series with one NaN: the affected letter becomes '-'.
        series = [
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, np.nan, 1, 1,
            6, 6, 6, 6, 10, 100
        ]
        word, _ = self.sax.to_letter_rep(series)
        assert word == 'bbb-ce'

    def test_compare_strings(self):
        # The closer word must score strictly lower than the distant one.
        reference = 'aaabbc'
        near_score = self.sax.compare_strings(reference, 'aabbbc')
        far_score = self.sax.compare_strings(reference, 'ccddbc')
        assert near_score < far_score

    def test_compare_strings_missing(self):
        # These two words, each with '-' at every other position, score 0.
        score = self.sax.compare_strings('a-b-c-', 'b-c-d-')
        assert score == 0

    def test_normalize_missing(self):
        # Two inputs that must normalize identically, except that the first
        # keeps a NaN where its input had one.
        with_gap = self.sax.normalize([1, 0, 0, 0, 0, 1, np.nan])
        without_gap = self.sax.normalize([1, 0, 0, 0, 0, 1])
        assert np.array_equal(with_gap[:-1], without_gap)
        assert np.isnan(with_gap[-1])

    def test_normalize_under_epsilon(self):
        # This near-constant input is expected to normalize to zeros.
        flattened = self.sax.normalize([1e-7, 2e-7, 1.5e-7])
        assert np.array_equal(flattened, [0, 0, 0])
def _get_SAX_spikes(cls, timeseries, timestamps, treshold):
    """
    Returns spikes by counting how often a datapoint carries the highest
    SAX letter across the sliding windows that cover it.

    A datapoint is reported when it was touched by at least one window,
    every word that touched it had the top alphabet letter at that
    position, and its raw value is above ``treshold``.

    The result maps ``timestamps[i]`` to
    ``cls._get_basic_spike_prio(timeseries[i], treshold)``.

    NOTE(review): the arithmetic below relies on the surrounding code's
    division semantics (this is Python 2 code: xrange / iteritems).
    """
    point_total = len(timeseries)

    # Average seconds between measurements
    retention = (timestamps[-1] - timestamps[0]) / len(timestamps)

    # Number of entries per window
    entries_per_word = cls.WINDOW_SECONDS_COUNT / retention
    num_windows = point_total / entries_per_word
    window_size = point_total / num_windows
    num_symbols = window_size * retention / cls.SECONDS_PER_SYMBOL

    encoder = SAX(wordSize=num_symbols, alphabetSize=cls.ALPHABET_SIZE)
    # Distance, in datapoints, between two consecutive symbols of a word
    step = int(round(cls.SECONDS_PER_SYMBOL / float(retention)))

    # Convert timeseries into SAX notation (third argument is presumably
    # the window overlap -- confirm against SAX.sliding_window)
    words, intervals = encoder.sliding_window(timeseries, num_windows, 0.8)

    # How many times index i carried the maximal letter
    maximum_count = dict.fromkeys(xrange(point_total), 0)
    # How many times index i was visited by a window symbol
    window_count = dict.fromkeys(xrange(point_total), 0)

    # Count in how many windows a datapoint is a local maximum
    for i, word in enumerate(words):
        window_start = intervals[i][0]
        for j in xrange(len(word)):
            index = j * step + window_start
            if word[j] == string.ascii_lowercase[cls.ALPHABET_SIZE - 1]:
                maximum_count[index] += 1
            window_count[index] += 1

    spikes = {}
    for key, hits in maximum_count.iteritems():
        if hits != window_count[key]:
            continue
        if not hits:
            continue
        if not timeseries[key] > treshold:
            continue
        val = timeseries[key]
        prio = cls._get_basic_spike_prio(val, treshold)
        spikes[timestamps[key]] = prio
    return spikes
def __init__(self,
             segmentLength=20,
             paaSize=5,
             alphabetSize=3,
             upperBound=100,
             lowerBound=-100):
    """Store the configuration and create the SAX encoder and grammar.

    Args:
        segmentLength: stored as ``self.segmentLength``.
        paaSize: stored, and passed to SAX as ``wordSize``.
        alphabetSize: stored, and passed to SAX as ``alphabetSize``.
        upperBound: stored, and passed to SAX as ``upperBound``.
        lowerBound: stored, and passed to SAX as ``lowerBound``.
    """
    # Configuration, kept exactly as given.
    self.segmentLength = segmentLength
    self.paaSize, self.alphabetSize = paaSize, alphabetSize
    self.upperBound, self.lowerBound = upperBound, lowerBound

    # Collaborators: the SAX encoder (fixed epsilon of 1e-6) and the grammar.
    self.sax = SAX(
        wordSize=paaSize,
        alphabetSize=alphabetSize,
        lowerBound=lowerBound,
        upperBound=upperBound,
        epsilon=1e-6,
    )
    self.grammar = Grammar()

    # Bookkeeping that starts out empty.
    self.segmentIndexes = []
    self.rule_set = []
    self.tsCount = 0
def sax_kmeans(X, K, wordSize, alphabetSize):
    '''Cluster by SAX k-means

    Args:
        X: 2D np array of dimension (n_households, time)
        K: Number of clusters
        wordSize: word size handed to the SAX encoder
        alphabetSize: alphabet size handed to the SAX encoder

    See https://github.com/nphoff/saxpy

    Returns:
        List of K centroids
        List of SAX k-means cluster assignments for each load in X
    '''
    # Fixed seed so the random initial centres are reproducible.
    np.random.seed(NUM)
    encoder = SAX(wordSize=wordSize, alphabetSize=alphabetSize)

    # Initialize to K random centers, each encoded as its SAX word.
    chosen = np.random.randint(X.shape[0], size=K)
    seeds = list(X[chosen, :])
    mu = [encoder.to_letter_rep(seed)[0] for seed in seeds]
    oldmu = []

    # SAX word of every row of X.
    strX = [encoder.to_letter_rep(X[row])[0] for row in range(X.shape[0])]

    while not has_converged(mu, oldmu):
        oldmu = mu
        # Assign all points in X to clusters
        clusters, mu_ind = cluster_points(X, strX, mu, encoder)
        # Reevaluate centers
        mu = reevaluate_centers(oldmu, clusters, encoder)
    return mu, mu_ind
class TestSAX(object):
    """Checks for the SAX encoder: letter conversion and string comparison."""

    def setUp(self):
        # Every test uses 6-letter words over a 5-letter alphabet; the third
        # argument (1e-6) is presumably the epsilon threshold.
        self.sax = SAX(6, 5, 1e-6)

    def test_to_letter_rep(self):
        # A series exactly as long as the word: one letter per value.
        series = [7, 1, 4, 4, 4, 4]
        word, _ = self.sax.to_letter_rep(series)
        assert word == "eacccc"

    def test_long_to_letter_rep(self):
        # A series longer than the word must still yield six letters.
        series = [
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            6, 6, 6, 6, 10, 100
        ]
        word, _ = self.sax.to_letter_rep(series)
        assert word == "bbbbce"

    def test_compare_strings(self):
        # The closer word must score strictly lower than the distant one.
        reference = "aaabbc"
        near_score = self.sax.compare_strings(reference, "aabbbc")
        far_score = self.sax.compare_strings(reference, "ccddbc")
        assert near_score < far_score
class SymbolicClustering(object):
    """Discretizes time series into SAX segments, induces a grammar over
    them and derives windows / initial clusters from rule frequencies."""

    def __init__(self,
                 segmentLength=20,
                 paaSize=5,
                 alphabetSize=3,
                 upperBound=100,
                 lowerBound=-100):
        """
        @description  : store the configuration and create the SAX encoder
                        and the grammar.
        ---------
        @param  : segmentLength -- number of points per segment
                  paaSize -- passed to SAX as wordSize
                  alphabetSize -- passed to SAX as alphabetSize
                  upperBound, lowerBound -- passed to SAX unchanged
        -------
        """
        # Configuration, kept exactly as given.
        self.segmentLength = segmentLength
        self.paaSize, self.alphabetSize = paaSize, alphabetSize
        self.upperBound, self.lowerBound = upperBound, lowerBound

        # Collaborators: SAX encoder (fixed epsilon of 1e-6) and grammar.
        self.sax = SAX(
            wordSize=paaSize,
            alphabetSize=alphabetSize,
            lowerBound=lowerBound,
            upperBound=upperBound,
            epsilon=1e-6,
        )
        self.grammar = Grammar()

        # Bookkeeping that starts out empty.
        self.segmentIndexes = []
        self.rule_set = []
        self.tsCount = 0

    def printSegments(self):
        """Print a header followed by the content of every segment index."""
        print("\nCurrent Segments:")
        for entry in self.segmentIndexes:
            entry.printContent()

    def discretize(self, s):
        """
        @description  : discretize a single series using the modified PAA
                        method.
        ---------
        @param  : s -- time series in array format; its length must be a
                  multiple of segmentLength
        -------
        @Returns  : a list of segments which are discretized from the input.
        -------
        @Raises  : SegmentsCanNotBeEquallyDivided when len(s) is not a
                   multiple of segmentLength.
        -------
        """
        total = len(s)
        produced = []
        if total % self.segmentLength != 0:
            raise SegmentsCanNotBeEquallyDivided()

        # Segment indexes are only created while the first series is read;
        # later series are attached to the existing ones.
        first_series = self.tsCount == 0
        for position in range(0, int(total / self.segmentLength)):
            start = position * self.segmentLength
            end = (position + 1) * self.segmentLength
            if first_series:
                self.segmentIndexes.append(SegmentIndex((start, end)))
            letters, indices = self.sax.to_letter_rep_ori(s[start:end])
            owner = self.segmentIndexes[position]
            segment = Segment(s[start:end], letters, indices, owner)
            owner.addSegment(segment)
            produced.append(segment)

        self.tsCount += 1
        return produced

    def grammar_induction(self, segments):
        """
        @description  : train the grammar on the segments and keep the
                        resulting rule set in self.rule_set.
        ---------
        @param  : segments -- a list of segments
        -------
        @Returns  : nothing
        -------
        """
        self.grammar.train_string(segments)
        self.rule_set = self.grammar.get_rule_set()

    def get_frequency_matrix(self):
        """
        @description  : for each segment position, count how many of the
                        series share the same grammar rule there.
        ---------
        @param  : none
        -------
        @Returns  : a two-dimensional list with one row per segment index
                    and one column per series. A cell holds the number of
                    series whose segment at that position has the same rule,
                    or 1 when the rule is the grammar's root production.
        -------
        """
        matrix = []
        for entry in self.segmentIndexes:
            # Rule of every series at this segment position.
            rules = [
                entry.getSegment(series).getRule()
                for series in range(0, self.tsCount)
            ]

            # Occurrences of each rule at this position.
            occurrences = {}
            for rule in rules:
                occurrences[rule] = occurrences.get(rule, 0) + 1

            row = []
            for rule in rules:
                if rule == self.grammar.root_production:
                    row.append(1)
                else:
                    row.append(occurrences[rule])
            matrix.append(row)
        return matrix

    def cut_window(self, frequencyMatrix):
        """
        @description  : generate windows from the frequencyMatrix. A new
                        window starts wherever a row differs from the first
                        row of the current window.
        ---------
        @param  : frequencyMatrix -- a two-dimensional matrix
        -------
        @Returns  : a list of windows
        -------
        """
        row_total = len(frequencyMatrix)
        result = []
        opened_at = 0
        for current in range(1, row_total):
            if frequencyMatrix[current] != frequencyMatrix[opened_at]:
                result.append(Window(opened_at, current, self.segmentLength))
                opened_at = current
        # The last window always runs to the end of the matrix.
        result.append(Window(opened_at, row_total, self.segmentLength))
        return result

    def generateInitialClusters(self, startIndex, windows):
        """
        @description  : generate initial clusters in each window. The
                        clusters are not overlapped by each other but the
                        sum of them covers all the segments.
        ---------
        @param  : startIndex -- the start number of p time series.
                  windows -- the cut windows used to generate clusters
        -------
        @Returns  : the same windows, each now holding its generated
                    clusters
        -------
        """
        for current in windows:
            # The steps below are run in this fixed order for every window.
            current.initSubsequences(startIndex, self)
            current.initClusters(self)
            current.clustersCombination()
            current.clustersBreakingTie()
            current.clustersProcessMiss()
            current.computeAllDistancesAndCentroids()
        return windows
def setUp(self):
    # Shared fixture for all tests: 6-letter words, 5-letter alphabet.
    # The third argument is presumably the epsilon threshold.
    word_size, alphabet_size, epsilon = 6, 5, 1e-6
    self.sax = SAX(word_size, alphabet_size, epsilon)
def sax_rep(word, letter, ary):
    """Return ``SAX(word, letter).to_letter_rep`` of *ary*.

    Args:
        word: first SAX constructor argument (word size).
        letter: second SAX constructor argument (alphabet size).
        ary: array-like series; converted with ``np.asarray`` first.

    Returns:
        The unmodified result of ``to_letter_rep``.
    """
    values = np.asarray(ary)
    return SAX(word, letter).to_letter_rep(values)
def min_dist_sax(t1String, t2String, word, alpha, eps=0.000001):
    """Compare two SAX strings with a freshly built encoder.

    Args:
        t1String: first SAX word.
        t2String: second SAX word.
        word: first SAX constructor argument (word size).
        alpha: second SAX constructor argument (alphabet size).
        eps: third SAX constructor argument.

    Returns:
        The result of ``SAX.compare_strings(t1String, t2String)``.
    """
    encoder = SAX(word, alpha, eps)
    distance = encoder.compare_strings(t1String, t2String)
    return distance
def convert_sax(ts, word, alpha, eps=0.000001):
    """Encode *ts* as a SAX word and return only the letters.

    Args:
        ts: the series to encode.
        word: first SAX constructor argument (word size).
        alpha: second SAX constructor argument (alphabet size).
        eps: third SAX constructor argument.

    Returns:
        The first element of the pair returned by ``to_letter_rep``.
    """
    encoder = SAX(word, alpha, eps)
    # to_letter_rep must return exactly two items; the second is dropped.
    letters, _ = encoder.to_letter_rep(ts)
    return letters
def min_dist_sax(t1String, t2String, word, alpha, eps=0.000001):
    """Compare two SAX strings with a freshly built encoder.

    Args:
        t1String: first SAX word.
        t2String: second SAX word.
        word: first SAX constructor argument (word size).
        alpha: second SAX constructor argument (alphabet size).
        eps: third SAX constructor argument.

    Returns:
        The result of ``SAX.compare_strings(t1String, t2String)``.
    """
    encoder = SAX(word, alpha, eps)
    distance = encoder.compare_strings(t1String, t2String)
    return distance
import os

from saxpy import SAX

# Print the SAX word of every line of data.txt; each line holds
# whitespace-separated numbers.

# Read every line up front so the file is closed before encoding starts.
with open("data.txt") as f:
    data = f.readlines()

# Encoder built with 32 and 10 as its first two arguments
# (word size and alphabet size).
s = SAX(32, 10)
for line in data:
    x = [float(p) for p in line.split()]
    # Single-argument print(...) produces the same output on Python 2 and
    # is valid syntax on Python 3, unlike the former print statement.
    print(s.to_letter_rep(x)[0])
from saxpy import SAX
import numpy as np
import matplotlib.pyplot as plt

# Two sine curves sampled at 100 points each; the second one is evaluated
# on an interval shifted by one unit.
t = np.linspace(0, 15, num=100)
data = np.sin(t)
t2 = np.linspace(1, 16, num=100)
data2 = np.sin(t2)

# Encode both curves with the same 20-letter-word encoder and print the
# raw return value of to_letter_rep for each.
sax = SAX(wordSize=20)
rep = sax.to_letter_rep(data)
print(rep)
rep2 = sax.to_letter_rep(data2)
print(rep2)

# First curve in blue, second in red.
for series, colour in ((data, 'b'), (data2, 'r')):
    plt.plot(series, colour)
plt.show()
def convert_sax(ts, word, alpha, eps=0.000001):
    """Encode *ts* as a SAX word and return only the letters.

    Args:
        ts: the series to encode.
        word: first SAX constructor argument (word size).
        alpha: second SAX constructor argument (alphabet size).
        eps: third SAX constructor argument.

    Returns:
        The first element of the pair returned by ``to_letter_rep``.
    """
    encoder = SAX(word, alpha, eps)
    # to_letter_rep must return exactly two items; the second is dropped.
    letters, _ = encoder.to_letter_rep(ts)
    return letters
def DrawLines(lines):
    """Add one line segment per entry of *lines* to the current axes.

    Each entry is indexed as ``(x0, y0, x1, y1)``: elements 0 and 2 are
    handed to ``Line2D`` as its first argument and elements 1 and 3 as
    its second.
    """
    axes = gca()
    for segment in lines:
        first_pair = (segment[0], segment[2])
        second_pair = (segment[1], segment[3])
        axes.add_line(Line2D(first_pair, second_pair))


# Command-line parameters: n, w and a are used below as the series length,
# SAX word size and SAX alphabet size respectively.
n, w, a = read_para(sys.argv[1:])

# 1. Represent the series with SAX and calculate the symbol frequency
x = Time_series.Time_series_CAR(n)
data = x.tolist()
sax = SAX(w, a, 1e-6)
(letters, indices) = sax.to_letter_rep(data)
frq = sax.symbol_frequency(data)

# 2. Dimensionality reduction with linear interpolation
# NOTE(review): `a` is rebound here from the alphabet size to the data array.
a = np.asarray(data, dtype=np.float64)
newdata = (a + np.random.normal(0, 3, n)).tolist()
nordata = normalize(a[:, np.newaxis], axis=0).ravel()

figure()
lines = WindowSliding.WindowSliding(
    nordata, Fitting.Fitting, Fitting.SumofSquaredError)
DrawPlot(nordata, 'Pecewise linear approximation with Sliding Window')
DrawLines(lines)
show()
def sax_rep(word, letter, ary):
    """Return ``SAX(word, letter).to_letter_rep`` of *ary*.

    Args:
        word: first SAX constructor argument (word size).
        letter: second SAX constructor argument (alphabet size).
        ary: array-like series; converted with ``np.asarray`` first.

    Returns:
        The unmodified result of ``to_letter_rep``.
    """
    values = np.asarray(ary)
    return SAX(word, letter).to_letter_rep(values)