예제 #1
0
def test_sdist_new():
    """sdist_new must report (almost) zero distance for identical data.

    Two cases: a sequence against an exact copy of itself, and a
    subsequence matched against its own source series at the correct
    offset.
    """
    # Identical short sequences -> distance ~0.
    seq = list(range(5))
    seq_copy = seq.copy()
    cached_stats = util.calculate_stats(seq, seq_copy)
    np.testing.assert_almost_equal(
        util.sdist_new(seq, seq_copy, 0, cached_stats), 0)

    # A suffix compared against its source at start offset 50 -> distance ~0.
    np.random.seed(1337)
    series = np.random.random(100)
    cached_stats = util.calculate_stats(series, series)
    np.testing.assert_almost_equal(
        util.sdist_new(series[50:], series, 50, cached_stats), 0)
예제 #2
0
def fast_shapelet_discovery(timeseries,
                            labels,
                            m=None,
                            min_len=1,
                            max_len=None):
    """Exhaustively search subsequences of every series for the best shapelet.

    Parameters
    ----------
    timeseries : sequence of 1-D arrays
        The dataset to mine.
    labels : sequence
        Class label per series, parallel to ``timeseries``.
    m : int, optional
        Length cap; defaults to the length of the shortest series.
    min_len, max_len : int, optional
        Inclusive range of candidate shapelet lengths (``max_len``
        defaults to ``m``).

    Returns
    -------
    tuple
        ``(best_shapelet, best_distance, best_L, max_gain)`` where
        ``best_L`` is the sorted (distance, label) list for the winner.
    """
    if m is None:
        # The longest usable shapelet is bounded by the shortest series.
        m = np.min([len(x) for x in timeseries])
    if max_len is None:
        max_len = m
    max_gain = 0
    best_shapelet, best_distance, best_L = None, None, None
    for cntr, (ts, label) in enumerate(zip(timeseries, labels)):
        print(cntr, '/', len(timeseries))

        # Pre-compute the sufficient statistics sdist_new needs, once per
        # (ts, other series) pair, instead of once per candidate start.
        stats = {}
        for i, (ts2, label2) in enumerate(zip(timeseries, labels)):
            stats[i] = util.calculate_stats(ts, ts2)

        for l in range(min_len, max_len + 1):  # Possible shapelet lengths
            H = []  # Cache/history of (distance list, shapelet) pairs
            for i in range(len(ts) - l):  # Possible start positions
                # Pruning: if an upper bound on this candidate's gain
                # (derived from an already-evaluated shapelet) cannot beat
                # the current best, skip the expensive full scan.
                broken = False
                for (prev_L, prev_shapelet) in H:
                    R = util.sdist(ts[i:i + l], prev_shapelet)
                    if util.upperIG(prev_L, R, timeseries, labels) < max_gain:
                        broken = True
                        break  # Continue with next i

                if not broken:
                    L = []
                    for k, (ts2, label2) in enumerate(zip(timeseries, labels)):
                        L.append((util.sdist_new(ts[i:i + l], ts2, i,
                                                 stats[k]), label2))
                    L = sorted(L, key=lambda x: x[0])
                    best_ig, tau = find_best_split_point(L)
                    # NOTE(review): `best_ig < max_gain` with max_gain
                    # initialised to 0 only fires for negative scores, yet
                    # the upperIG prune above treats larger as better --
                    # the comparison direction looks suspect; confirm the
                    # convention used by find_best_split_point.
                    if best_ig < max_gain:
                        best_shapelet = ts[i:i + l]
                        max_gain = best_ig
                        best_L = L
                        best_distance = tau
                        print('---->', max_gain, best_distance)
                    H.append((L, ts[i:i + l]))

    return best_shapelet, best_distance, best_L, max_gain
예제 #3
0
 def evaluate_z_norm_space(self, time_serie, proba=True):
     """Classify ``time_serie`` by walking the shapelet tree (z-norm space).

     Internal nodes route on the cached-statistics shapelet distance
     against the stored threshold; leaf nodes (``self.distance is None``)
     return either the class-probability mapping or, when ``proba`` is
     False, the single most probable label.
     """
     if self.distance is not None:
         # Internal node: compare the z-normalised distance between the
         # node's shapelet and the series against the split threshold.
         stats = calculate_stats(self.shapelet, time_serie)
         dist = sdist_new(self.shapelet, time_serie, 0, stats)
         branch = self.left if dist <= self.distance else self.right
         return branch.evaluate_z_norm_space(time_serie, proba=proba)

     # Leaf node.
     if proba:
         return self.class_probabilities
     return max(self.class_probabilities.items(),
                key=operator.itemgetter(1))[0]
예제 #4
0
파일: test.py 프로젝트: jingzbu/pyShapelets
    # NOTE(review): fragment of a larger test script -- `tree`, `timeseries`,
    # `labels`, `subsequence_dist`, `calculate_stats` and `sdist_new` are
    # defined outside this excerpt.
    print(tree.shapelet)
    print(tree.distance)

    distances = []

    # Baseline: brute-force sliding-window distance of the root shapelet
    # against every series.
    for ts, label in zip(timeseries, labels):
        d, idx = subsequence_dist(ts, tree.shapelet)
        distances.append((d, label))

    print([x for x in sorted(distances, key=lambda x: x[0])])

    distances = []

    # Same distances via the cached-statistics implementation (sdist_new);
    # the sorted output should match the brute-force listing above.
    for ts, label in zip(timeseries, labels):
        stats = calculate_stats(tree.shapelet, ts)
        d = sdist_new(tree.shapelet, ts, 0, stats)
        distances.append((d, label))

    print([x for x in sorted(distances, key=lambda x: x[0])])

    distances = []

    # Repeat the cached-statistics computation for the right child's shapelet.
    for ts, label in zip(timeseries, labels):
        stats = calculate_stats(tree.right.shapelet, ts)
        d = sdist_new(tree.right.shapelet, ts, 0, stats)
        distances.append((d, label))

    print([x for x in sorted(distances, key=lambda x: x[0])])

    # tree.populate_class_probs(timeseries[:-1], labels[:-1])
    # tree.recalculate_distances(timeseries[:-1], labels[:-1])
        # NOTE(review): deeper-indented fragment from a separate benchmark
        # loop -- `typical_characteristic`, `ts_length`, `nr_timeserie`,
        # the timing accumulators and `util` come from outside this excerpt.
        timeseries, labels = generate_binary_classification_data(
            typical_characteristic, ts_length, nr_timeserie)
        for ts, label in tqdm(zip(timeseries, labels)):
            # Time the one-off statistics pre-computation that sdist_new
            # amortises over all candidate subsequences of `ts`.
            stats = {}
            start_time = time.time()
            for i, (ts2, label2) in enumerate(zip(timeseries, labels)):
                stats[tuple(ts2)] = util.calculate_stats(ts, ts2)
            sdist_new_overhead.append(time.time() - start_time)

            for l in range(1, ts_length + 1):
                for start in range(len(ts) - l):  # Possible start positions
                    new_dists = []
                    old_dists = []
                    for k, (ts2, label2) in enumerate(zip(timeseries, labels)):
                        # New implementation: reuses the cached statistics.
                        start_time = time.time()
                        dist_new = util.sdist_new(ts[start:start + l], ts2,
                                                  start, stats[tuple(ts2)])
                        sdist_new_times.append(time.time() - start_time)
                        new_dists.append((k, dist_new))

                        # Old implementation: brute-force sliding window.
                        start_time = time.time()
                        dist_old, idx_old = util.subsequence_dist(
                            ts2, ts[start:start + l])
                        sdist_old_times.append(time.time() - start_time)
                        old_dists.append((k, dist_old))

                    new_dists = sorted(new_dists, key=lambda x: x[1])
                    old_dists = sorted(old_dists, key=lambda x: x[1])
                    print(new_dists)
                    print(old_dists)
                    # Both implementations must agree on which series is
                    # nearest to the candidate subsequence.
                    np.testing.assert_equal([x[0] for x in new_dists][0],
                                            [x[0] for x in old_dists][0])