def fit(X, y, max_depth=None, min_samples_split=2, min_len=10, max_len=10):
    """Recursively grow a shapelet tree for the series ``X`` with labels ``y``.

    A node is split only while depth budget remains, it holds strictly more
    than ``min_samples_split`` series, and its labels are not all identical.
    Otherwise a leaf (``shapelet=None``, ``distance=None``) is returned.

    Args:
        X: Sequence of time series.
        y: Labels aligned with ``X``.
        max_depth: Remaining depth budget; ``None`` means unlimited. It is
            decremented by one for each level of recursion.
        min_samples_split: A node must contain more than this many series
            to be split.
        min_len: Forwarded to ``fast_shapelet_discovery``.
        max_len: Forwarded to ``fast_shapelet_discovery``.

    Returns:
        A ``ShapeletTree`` node whose ``class_probabilities`` is a
        ``Counter`` of the labels that reached it.
    """
    print('New node')

    has_depth_budget = max_depth is None or max_depth > 0
    splittable = (
        has_depth_budget
        and len(X) > min_samples_split
        and len(np.unique(y)) > 1
    )
    if not splittable:
        return ShapeletTree(right=None, left=None, shapelet=None,
                            distance=None, class_probabilities=Counter(y))

    # TODO: pass the distance along with this shapelet so we don't need to
    # recalculate this!
    shapelet = fast_shapelet_discovery(X, y, min_len=min_len, max_len=max_len)
    distance = check_candidate(X, y, shapelet)[1]
    node = ShapeletTree(right=None, left=None, shapelet=shapelet,
                        distance=distance, class_probabilities=Counter(y))

    # Series whose distance to the shapelet is at most the threshold go left.
    left_series, left_labels = [], []
    right_series, right_labels = [], []
    for series, target in zip(X, y):
        if subsequence_dist(series, shapelet)[0] <= distance:
            bucket_series, bucket_labels = left_series, left_labels
        else:
            bucket_series, bucket_labels = right_series, right_labels
        bucket_series.append(series)
        bucket_labels.append(target)

    child_kwargs = dict(
        max_depth=None if max_depth is None else max_depth - 1,
        min_samples_split=min_samples_split,
        min_len=min_len,
        max_len=max_len,
    )
    node.left = fit(left_series, left_labels, **child_kwargs)
    node.right = fit(right_series, right_labels, **child_kwargs)
    return node
def increment_class_probs(self, ts, label):
    """Add one observation of ``label`` to every node ``ts`` passes through.

    The count on this node is bumped first. On an internal node (one whose
    ``distance`` is set) the call then continues into the left child when
    the series lies within ``self.distance`` of the shapelet, and into the
    right child otherwise.

    Args:
        ts: The time series being routed down the tree.
        label: The class label to count.
    """
    counts = self.class_probabilities
    counts[label] = counts.get(label, 0) + 1

    if self.distance is None:
        # Leaf: nothing further to update.
        return

    dist, _ = subsequence_dist(ts, self.shapelet)
    child = self.left if dist <= self.distance else self.right
    child.increment_class_probs(ts, label)
def evaluate(self, time_serie, proba=True):
    """Route ``time_serie`` down the tree and return the leaf's verdict.

    Args:
        time_serie: The time series to classify.
        proba: When true, return the leaf's ``class_probabilities`` object
            itself; when false, return the label with the highest count
            (the first such label in iteration order on ties).

    Returns:
        The leaf's ``class_probabilities`` mapping or its most frequent label.
    """
    if self.distance is not None:
        # Internal node: descend left when within the threshold, else right.
        dist, _ = subsequence_dist(time_serie, self.shapelet)
        branch = self.left if dist <= self.distance else self.right
        return branch.evaluate(time_serie, proba=proba)

    if proba:
        return self.class_probabilities

    best_label, _ = max(self.class_probabilities.items(),
                        key=lambda item: item[1])
    return best_label
def check_candidate(timeseries, labels, shapelet, min_prune_length=20, best_ig=None):
    """Score ``shapelet`` as a split candidate over a labelled set of series.

    Each series is paired with its ``util.subsequence_dist`` distance to the
    shapelet; the pairs are ordered by distance and handed to
    ``find_best_split_point``.

    Args:
        timeseries: Sequence of time series.
        labels: Labels aligned with ``timeseries``.
        shapelet: Candidate shapelet to evaluate.
        min_prune_length: Currently unused (see note below).
        best_ig: Currently unused (see note below).

    Returns:
        Whatever ``find_best_split_point`` returns; callers in this file
        unpack it as ``(ig, distance)``.
    """
    # NOTE: an entropy-based early-abandon step (entropy_pre_prune) that
    # consumed ``min_prune_length`` and ``best_ig`` used to run inside this
    # loop; it is disabled, so every series is always measured.
    dist_label_pairs = [
        (util.subsequence_dist(series, shapelet)[0], series_label)
        for series, series_label in zip(timeseries, labels)
    ]
    # Stable sort on the distance only, so ties keep their input order.
    dist_label_pairs.sort(key=lambda pair: pair[0])
    return find_best_split_point(dist_label_pairs)
def recalculate_distances(self, timeseries, labels):
    """Re-fit the split threshold of every internal node on new data.

    For an internal node the threshold is recomputed with
    ``check_candidate`` using this node's existing shapelet, stored in
    ``self.distance``, and the data is partitioned with the new threshold
    before recursing into both children. Leaves only print their labels.

    Args:
        timeseries: Sequence of time series that reach this node.
        labels: Labels aligned with ``timeseries``.
    """
    if self.distance is None:
        print('leaf:', labels)
        return

    _, threshold = check_candidate(timeseries, labels, self.shapelet)
    print(threshold, self.distance)
    self.distance = threshold

    ts_left, labels_left = [], []
    ts_right, labels_right = [], []
    for ts, label in zip(timeseries, labels):
        dist, _ = subsequence_dist(ts, self.shapelet)
        # Use `<=` so a series exactly on the threshold goes left, matching
        # how fit(), evaluate() and increment_class_probs() route series.
        # The previous strict `<` sent such series to the right child here
        # while prediction would send them to the left one.
        if dist <= threshold:
            ts_left.append(ts)
            labels_left.append(label)
        else:
            ts_right.append(ts)
            labels_right.append(label)

    print(labels, 'are split into', labels_left, 'and', labels_right)
    self.left.recalculate_distances(ts_left, labels_left)
    self.right.recalculate_distances(ts_right, labels_right)
# assert np.array_equal(m_uv, m_uv_old)

print(labels)
print(timeseries)

if __name__ == "__main__":
    # Fit a single shapelet split and show what was learned.
    print('Fitting tree')
    tree = extract_shapelet(timeseries, labels)
    print(tree.shapelet)
    print(tree.distance)

    # Distances to the learned shapelet via subsequence_dist, printed in
    # ascending order alongside their labels.
    distances = []
    for ts, label in zip(timeseries, labels):
        d, idx = subsequence_dist(ts, tree.shapelet)
        distances.append((d, label))
    print(sorted(distances, key=lambda pair: pair[0]))

    # The same listing computed with sdist_new and its precomputed stats,
    # so the two outputs can be compared by eye.
    distances = []
    for ts, label in zip(timeseries, labels):
        stats = calculate_stats(tree.shapelet, ts)
        d = sdist_new(tree.shapelet, ts, 0, stats)
        distances.append((d, label))
    print(sorted(distances, key=lambda pair: pair[0]))

    distances = []
stats[tuple(ts2)] = util.calculate_stats(ts, ts2) sdist_new_overhead.append(time.time() - start_time) for l in range(1, ts_length + 1): for start in range(len(ts) - l): # Possible start positions new_dists = [] old_dists = [] for k, (ts2, label2) in enumerate(zip(timeseries, labels)): start_time = time.time() dist_new = util.sdist_new(ts[start:start + l], ts2, start, stats[tuple(ts2)]) sdist_new_times.append(time.time() - start_time) new_dists.append((k, dist_new)) start_time = time.time() dist_old, idx_old = util.subsequence_dist( ts2, ts[start:start + l]) sdist_old_times.append(time.time() - start_time) old_dists.append((k, dist_old)) new_dists = sorted(new_dists, key=lambda x: x[1]) old_dists = sorted(old_dists, key=lambda x: x[1]) print(new_dists) print(old_dists) np.testing.assert_equal([x[0] for x in new_dists][0], [x[0] for x in old_dists][0]) print('New distance calculation took:', np.sum(sdist_new_times) + np.sum(sdist_new_overhead)) print('New distance (overhead):', np.sum(sdist_new_overhead)) print('Old distance calculation took:', np.sum(sdist_old_times))