def test_sdist_new():
    """Sanity checks for util.sdist_new: a series' distance to itself is ~0."""
    # Trivial case: two identical small integer series.
    series = list(range(5))
    duplicate = series.copy()
    pair_stats = util.calculate_stats(series, duplicate)
    np.testing.assert_almost_equal(
        util.sdist_new(series, duplicate, 0, pair_stats), 0)

    # A random series compared against its own suffix, anchored at offset 50.
    np.random.seed(1337)
    random_series = np.random.random(100)
    pair_stats = util.calculate_stats(random_series, random_series)
    np.testing.assert_almost_equal(
        util.sdist_new(random_series[50:], random_series, 50, pair_stats), 0)
def fast_shapelet_discovery(timeseries, labels, m=None, min_len=1, max_len=None):
    """Search all subsequences of the labelled timeseries for the best shapelet.

    Candidate subsequences are scored by the split quality returned by
    find_best_split_point; previously evaluated candidates are kept in a
    history H and used with util.upperIG to prune candidates whose
    information-gain upper bound cannot beat the current best.

    Returns a tuple (best_shapelet, best_distance, best_L, max_gain).
    """
    if m is None:
        m = np.min([len(x) for x in timeseries])  # Maximum length of a timeserie
    if max_len is None:
        max_len = m
    # NOTE(review): max_gap is assigned here but never used below — dead state?
    max_gain, max_gap = 0, 0
    best_shapelet, best_distance, best_L = None, None, None
    cntr = 0  # progress counter over the outer timeseries loop
    for ts, label in zip(timeseries, labels):
        print(cntr, '/', len(timeseries))
        cntr += 1
        # NOTE(review): x is assigned but never read — leftover from a refactor?
        x = (ts, label)
        # Precompute pairwise stats of ts against every series once; reused by
        # every sdist_new call for all candidate subsequences of ts.
        stats = {}
        for i, (ts2, label2) in enumerate(zip(timeseries, labels)):
            stats[i] = util.calculate_stats(ts, ts2)
        for l in range(min_len, max_len + 1):  # Possible shapelet lengths
            H = []  # Cache/history
            for i in range(len(ts) - l):  # Possible start positions
                broken = False
                # Pruning: if the IG upper bound derived from any cached
                # candidate's ordering L and the distance R between the two
                # candidates cannot beat max_gain, skip this start position.
                for (L, S) in H:
                    R = util.sdist(ts[i:i + l], S)
                    if util.upperIG(L, R, timeseries, labels) < max_gain:
                        broken = True
                        break  # Continue with next i
                if not broken:
                    # Orderline: (distance of candidate to each series, label),
                    # sorted ascending by distance.
                    L = []
                    for k, (ts2, label2) in enumerate(zip(timeseries, labels)):
                        L.append((util.sdist_new(ts[i:i + l], ts2, i, stats[k]), label2))
                    L = sorted(L, key=lambda x: x[0])
                    #print(L)
                    best_ig, tau = find_best_split_point(L)
                    # NOTE(review): comparing with '<' while tracking a "gain"
                    # looks inverted relative to the pruning test above
                    # (upperIG < max_gain prunes) — confirm the sign convention
                    # of find_best_split_point before changing anything.
                    if best_ig < max_gain:
                        best_shapelet = ts[i:i + l]
                        max_gain = best_ig
                        best_L = L
                        best_distance = tau
                        # (1.4578175811448, 1), (nan, 0), (nan, 1), (nan, 0), (nan, 1), ...
                        print('---->', max_gain, best_distance)
                    H.append((L, ts[i:i + l]))
    return best_shapelet, best_distance, best_L, max_gain
def evaluate_z_norm_space(self, time_serie, proba=True):
    """Classify `time_serie` by descending the tree in z-normalised space.

    Internal nodes (self.distance set) route the series left or right by
    comparing its shapelet distance against the node's threshold; leaves
    return the stored class probabilities, or the single most likely class
    when proba is False.
    """
    if self.distance is not None:
        # Internal node: compute the z-normalised distance to this node's
        # shapelet and recurse into the matching child.
        stats = calculate_stats(self.shapelet, time_serie)
        dist = sdist_new(self.shapelet, time_serie, 0, stats)
        child = self.left if dist <= self.distance else self.right
        return child.evaluate_z_norm_space(time_serie, proba=proba)
    # Leaf node.
    if proba:
        return self.class_probabilities
    return max(self.class_probabilities.items(), key=operator.itemgetter(1))[0]
# Inspect the learned tree node: dump its shapelet and split threshold, then
# print the (distance, label) orderings produced by three distance routines
# so they can be compared by eye.
print(tree.shapelet)
print(tree.distance)

pairs = []
for series, series_label in zip(timeseries, labels):
    d, idx = subsequence_dist(series, tree.shapelet)
    pairs.append((d, series_label))
print([p for p in sorted(pairs, key=lambda p: p[0])])

pairs = []
for series, series_label in zip(timeseries, labels):
    series_stats = calculate_stats(tree.shapelet, series)
    pairs.append((sdist_new(tree.shapelet, series, 0, series_stats), series_label))
print([p for p in sorted(pairs, key=lambda p: p[0])])

pairs = []
for series, series_label in zip(timeseries, labels):
    series_stats = calculate_stats(tree.right.shapelet, series)
    pairs.append((sdist_new(tree.right.shapelet, series, 0, series_stats), series_label))
print([p for p in sorted(pairs, key=lambda p: p[0])])

# tree.populate_class_probs(timeseries[:-1], labels[:-1])
# tree.recalculate_distances(timeseries[:-1], labels[:-1])
# Benchmark sdist_new (with precomputed stats) against subsequence_dist and
# assert both metrics agree on the nearest series for every candidate.
timeseries, labels = generate_binary_classification_data(
    typical_characteristic, ts_length, nr_timeserie)

for ts, label in tqdm(zip(timeseries, labels)):
    # One-off overhead of sdist_new: precompute stats of ts vs. every series.
    stats = {}
    t0 = time.time()
    for other, _ in zip(timeseries, labels):
        stats[tuple(other)] = util.calculate_stats(ts, other)
    sdist_new_overhead.append(time.time() - t0)

    for length in range(1, ts_length + 1):
        for offset in range(len(ts) - length):  # Possible start positions
            timed_new = []
            timed_old = []
            for idx, (other, _) in enumerate(zip(timeseries, labels)):
                t0 = time.time()
                d_new = util.sdist_new(
                    ts[offset:offset + length], other, offset, stats[tuple(other)])
                sdist_new_times.append(time.time() - t0)
                timed_new.append((idx, d_new))

                t0 = time.time()
                d_old, _pos = util.subsequence_dist(
                    other, ts[offset:offset + length])
                sdist_old_times.append(time.time() - t0)
                timed_old.append((idx, d_old))

            timed_new.sort(key=lambda pair: pair[1])
            timed_old.sort(key=lambda pair: pair[1])
            print(timed_new)
            print(timed_old)
            # Both routines must rank the same series as the closest one.
            np.testing.assert_equal(timed_new[0][0], timed_old[0][0])