Пример #1
0
 def test_label_consistency_w_different_seeds(self):
     """Check that predicted labels do not depend on ``random_state``.

     For every scenario one synthetic series is generated and clustered
     by two TICC models that differ only in their seed; the two label
     arrays must be element-wise equal.
     """
     # Each scenario: (first seed, second seed, n_features, label order)
     scenarios = [
         (0, 1, 5, [0, 1, 2]),
         (3, 2, 5, [0, 1, 2, 1, 0]),
         (0, 1, 5, [0, 1, 2, 3, 0, 1, 2, 3]),
         (0, 9, 5, [0, 1, 1, 0, 2, 2, 2, 0]),
     ]
     for seed_a, seed_b, n_features, segment_labels in scenarios:
         generator = RandomData(0, n_features, window_size=5)
         n_segments = len(segment_labels)
         n_clusters = len(set(segment_labels))
         series_length = 200 * n_clusters * n_segments
         # Evenly spaced break points; the last one equals series_length
         breaks = [
             pos * series_length // n_segments
             for pos in range(1, n_segments + 1)
         ]
         X, _ = generator.generate_points(segment_labels, breaks)
         # Build both models first, then fit them in the same order
         models = [
             TICC(n_clusters=n_clusters,
                  window_size=5,
                  n_jobs=4,
                  random_state=seed)
             for seed in (seed_a, seed_b)
         ]
         y_a, y_b = (model.fit_predict(X) for model in models)
         np.testing.assert_array_equal(y_a, y_b)
Пример #2
0
 def test_pass_if_consistent_on_similar_random_data(self):
     """Check that predictions agree across repeated draws of similar data.

     Several datasets are generated from one ``RandomData`` instance after
     a single ``generate_cluster_params`` call, one TICC model is refit on
     each of them, and every pair of predictions may disagree on fewer
     than 2% of the samples.
     """
     # Each scenario: (n_features, label order, number of datasets)
     scenarios = [
         (5, [0, 1, 0], 5),
         (5, [0, 1, 2, 1, 0], 5),
     ]
     for n_features, segment_labels, n_datasets in scenarios:
         generator = RandomData(seed=0,
                                n_features=n_features,
                                window_size=5)
         n_segments = len(segment_labels)
         n_clusters = len(set(segment_labels))
         series_length = 200 * n_clusters * n_segments
         breaks = [
             pos * series_length // n_segments
             for pos in range(1, n_segments + 1)
         ]
         generator.generate_cluster_params(n_clusters)
         # Third argument True: reuse the cluster parameters generated above
         datasets = []
         for _ in range(n_datasets):
             points, _labels = generator.generate_points(
                 segment_labels, breaks, True)
             datasets.append(points)
         model = TICC(n_clusters=n_clusters,
                      window_size=5,
                      beta=300,
                      n_jobs=4,
                      random_state=0,
                      cluster_reassignment=0.3,
                      verbose=True)
         predictions = []
         for X in datasets:
             predictions.append(model.fit_predict(X))
         for y_a, y_b in combinations(predictions, 2):
             n_mismatched = np.sum(np.not_equal(y_a, y_b))
             mismatch_rate = n_mismatched / series_length
             assert mismatch_rate < 0.02
Пример #3
0
 def test_pass_if_y_size_correct(self):
     """The label array from ``generate_points`` has shape ``(breaks[-1],)``."""
     # Each scenario: (segment label order, break points)
     scenarios = [
         ([0, 1, 0], [20, 40, 60]),
         ([0, 1, 0, 1], [20, 40, 500, 600]),
     ]
     for segment_labels, breaks in scenarios:
         generator = RandomData(window_size=5, n_features=5)
         _points, y = generator.generate_points(segment_labels, breaks)
         expected_shape = (breaks[-1], )
         assert y.shape == expected_shape
Пример #4
0
 def test_pass_if_y_labels_correct(self):
     """The label array repeats each segment label up to its break point."""
     # Each scenario: (segment label order, break points, expected labels)
     scenarios = [
         ([0, 1, 0], [3, 7, 10], [0] * 3 + [1] * 4 + [0] * 3),
         ([1, 0, 1, 0], [5, 10, 15, 20],
          [1] * 5 + [0] * 5 + [1] * 5 + [0] * 5),
     ]
     for segment_labels, breaks, expected_labels in scenarios:
         generator = RandomData(window_size=5, n_features=5)
         _points, y = generator.generate_points(segment_labels, breaks)
         produced_labels = y.tolist()
         assert produced_labels == expected_labels
Пример #5
0
    def test_pass_if_consistent_with_same_seed(self):
        """Two fresh generators with the same seed produce identical output."""
        # Each scenario: (segment label order, break points, seed)
        scenarios = [
            ([0, 1, 0], [20, 40, 60], 0),
            ([0, 1, 0], [20, 40, 60], 1),
            ([0, 1, 0], [20, 40, 60], 10),
        ]
        for segment_labels, breaks, seed in scenarios:
            first_gen = RandomData(seed)
            X_a, y_a = first_gen.generate_points(segment_labels, breaks)
            second_gen = RandomData(seed)
            X_b, y_b = second_gen.generate_points(segment_labels, breaks)

            # Both the points and the labels must match exactly
            np.testing.assert_array_equal(X_a, X_b)
            np.testing.assert_array_equal(y_a, y_b)
Пример #6
0
 def test_split_theta(self):
     """``split_theta(w)`` returns the ``w`` blocks of MRF_'s first block column.

     Each returned block must have shape ``(n, n)`` and equal the
     corresponding ``n``-row slice of the first ``n`` columns of ``MRF_``.
     """
     # Each scenario: (n_features, window_size)
     scenarios = [(5, 8)]
     for n_features, window in scenarios:
         generator = RandomData(0, n_features=n_features, window_size=window)
         cluster = _TICCluster(1)
         cluster.MRF_ = generator.block_toeplitz()
         blocks = cluster.split_theta(window)
         assert len(blocks) == window
         for position, block in enumerate(blocks):
             assert block.shape == (n_features, n_features)
             row_start = position * n_features
             row_stop = row_start + n_features
             expected = cluster.MRF_[row_start:row_stop, :n_features]
             assert_array_equal(block, expected)
Пример #7
0
    def test_pass_if_block_dims_correct(self):
        """``block_toeplitz`` returns a square matrix of side ``n * w``."""
        # Each scenario: (n_features, window_size)
        scenarios = [
            (5, 10),
            (10, 10),
            (10, 1),
            (1, 1),
            (1, 10),
        ]

        for n_features, window in scenarios:
            generator = RandomData(n_features=n_features, window_size=window)
            side = n_features * window
            theta = generator.block_toeplitz()
            assert theta.shape == (side, side)
Пример #8
0
 def test_pass_if_generates_diff_points_each_call(self):
     """Repeated calls on one generator give new points but the same labels."""
     # Each scenario: (segment label order, break points, seed)
     scenarios = [
         ([0, 1, 0], [20, 40, 60], 0),
         ([0, 1, 0], [20, 40, 60], 10),
         ([0, 1, 0], [20, 40, 60], 100),
     ]
     for segment_labels, breaks, seed in scenarios:
         generator = RandomData(seed)
         X_first, y_first = generator.generate_points(segment_labels, breaks)
         X_second, y_second = generator.generate_points(segment_labels, breaks)
         # The points must differ, so the equality check itself has to fail
         with np.testing.assert_raises(AssertionError):
             np.testing.assert_array_equal(X_first, X_second)
         # The labels are fixed by the segments and breaks, so they must match
         np.testing.assert_array_equal(y_first, y_second)
Пример #9
0
 def test_fit_predict(self):
     """``fit(X).predict(X)`` and ``fit_predict(X)`` return the same labels."""
     generator = RandomData(0, 5, 5)
     segment_labels = [0, 1, 2, 3, 4, 5]
     n_segments = len(segment_labels)
     # Evenly spaced break points over a series of 1200 samples
     breaks = [pos * 1200 // n_segments for pos in range(1, n_segments + 1)]
     X, _ = generator.generate_points(segment_labels, breaks)
     model = TICC(n_clusters=6,
                  window_size=5,
                  beta=0,
                  n_jobs=4,
                  verbose=True,
                  random_state=0)
     fitted = model.fit(X)
     two_step_labels = fitted.predict(X)
     one_step_labels = model.fit_predict(X)
     np.testing.assert_array_equal(two_step_labels, one_step_labels)
Пример #10
0
 def test_score_increases_with_f1(self):
     """The grid point with the highest ``score`` also has the highest F1.

     A TICC model is fitted in parallel for every combination of ``beta``
     and ``window_size``; the index of the best model by ``score(X)`` must
     equal the index of the best model by ``best_f1`` against the true
     labels.
     """
     generator = RandomData(0, 5, 5)
     samples_per_segment = 300
     segment_labels = [0, 1, 2, 3]
     n_segments = len(segment_labels)
     n_clusters = len(set(segment_labels))
     series_length = samples_per_segment * n_clusters * n_segments
     breaks = [
         pos * series_length // n_segments
         for pos in range(1, n_segments + 1)
     ]
     X, y_true = generator.generate_points(segment_labels, breaks)
     grid = ParameterGrid({
         'beta': [0, 500, 1000],
         'window_size': [1, 5, 10],
     })
     unfitted = (TICC(**settings, random_state=0) for settings in grid)
     fit_jobs = (delayed(lambda est: est.fit(X))(est) for est in unfitted)
     fitted = Parallel(n_jobs=-1)(fit_jobs)
     scores = [est.score(X) for est in fitted]
     f1_scores = [best_f1(y_true, est.predict(X)) for est in fitted]
     best_by_score = scores.index(max(scores))
     best_by_f1 = f1_scores.index(max(f1_scores))
     assert best_by_score == best_by_f1
Пример #11
0
 def test_fail_if_input_error(self):
     """``generate_points`` must raise ``ValueError`` on mismatched input.

     Every case pairs a segment label list with a break list of a
     different length. The test fails explicitly when the call returns
     normally; the earlier ``except ValueError: assert True`` form passed
     even when nothing was raised.
     """
     # Each case: (segment label order, break points) with unequal lengths.
     # The original list held the first case twice; one copy is enough.
     test_cases = [
         ([0, 1], [100, 300, 900]),
         ([0, 1, 3, 5], [100, 300, 450]),
     ]
     for seg, b in test_cases:
         try:
             RandomData().generate_points(seg, b)
         except ValueError:
             # Expected outcome: the invalid input was rejected
             continue
         raise AssertionError(
             f"generate_points({seg!r}, {b!r}) did not raise ValueError")
0
 def test_pass_if_positive_definite(self):
     """The block Toeplitz matrix admits a Cholesky factorisation.

     ``np.linalg.cholesky`` raises ``LinAlgError`` when the decomposition
     fails, so reaching the assertion already means it succeeded.
     """
     # Each scenario: (n_features, window_size)
     scenarios = [
         (5, 10),
         (10, 10),
         (3, 4),
         (5, 1),
     ]
     for n_features, window in scenarios:
         generator = RandomData(None, n_features, window)
         theta = generator.block_toeplitz()
         factor = np.linalg.cholesky(theta)
         assert bool(factor.shape)
Пример #13
0
 def test_matches_original_paper_macro_F1(self):
     """TICC must exceed a macro-F1 threshold on each synthetic scenario."""
     # Each scenario: (n_features, label order, macro F1 to beat)
     scenarios = [
         (5, [0, 1, 0], 0.9),
         (5, [0, 1, 2, 1, 0], 0.9),
         (5, [0, 1, 2, 3, 0, 1, 2, 3], 0.9),
         (5, [0, 1, 1, 0, 2, 2, 2, 0], 0.9),
     ]
     # Original paper code performs at 100p/cluster!
     samples_per_segment = 120
     for n_features, segment_labels, f1_to_beat in scenarios:
         generator = RandomData(0, n_features, window_size=5)
         n_segments = len(segment_labels)
         n_clusters = len(set(segment_labels))
         series_length = samples_per_segment * n_clusters * n_segments
         breaks = [
             pos * series_length // n_segments
             for pos in range(1, n_segments + 1)
         ]
         X, y_true = generator.generate_points(segment_labels, breaks)
         model = TICC(n_clusters=n_clusters,
                      window_size=5,
                      n_jobs=4,
                      random_state=0)
         y_pred = model.fit_predict(X)
         # best_f1 is used because label:segment assignments are arbitrary
         macro_f1 = best_f1(y_true, y_pred, average='macro')
         assert macro_f1 > f1_to_beat
Пример #14
0
 def test_pass_if_diags_symmetric(self):
     """Every diagonal block of the block Toeplitz matrix is symmetric."""
     # Each scenario: (n_features, window_size)
     scenarios = [
         (5, 10),
         (10, 10),
         (3, 4),
         (5, 1),
     ]
     for n_features, window in scenarios:
         theta = RandomData(None, n_features, window).block_toeplitz()
         # Cut theta into `window` column strips, then each strip into
         # `window` blocks along the rows.
         column_strips = np.split(theta, window, axis=1)
         for position, strip in enumerate(column_strips):
             diagonal_block = np.split(strip, window)[position]
             np.testing.assert_array_equal(diagonal_block.T, diagonal_block)
Пример #15
0
 def test_pass_if_block_toeplitz(self):
     """Each block equals its neighbour one step down the block diagonal."""
     # Each scenario: (n_features, window_size)
     scenarios = [
         (5, 10),
         (10, 10),
         (3, 4),
         (5, 1),
     ]
     for n_features, window in scenarios:
         theta = RandomData(None, n_features, window).block_toeplitz()
         # grid[c][r] is the block in column strip c, row strip r
         grid = [
             np.split(strip, window)
             for strip in np.split(theta, window, axis=1)
         ]
         last = window - 1
         for col in range(last):
             for row in range(last):
                 block = grid[col][row]
                 diagonal_neighbour = grid[col + 1][row + 1]
                 np.testing.assert_array_equal(block, diagonal_neighbour)
Пример #16
0
 def test_recycling_clusters_between_calls(self):
     """``clusters`` is unchanged between two recycling ``generate_points`` calls."""
     # Each scenario: (segment label order, break points, seed)
     scenarios = [
         ([0, 1, 0], [20, 40, 60], 0),
         ([0, 1, 0], [20, 40, 60], 10),
         ([0, 1, 0], [20, 40, 60], 100),
     ]
     for segment_labels, breaks, seed in scenarios:
         generator = RandomData(seed)
         n_clusters = len(set(segment_labels))
         generator.generate_cluster_params(n_clusters)
         # Only the generator's `clusters` attribute matters here; the
         # generated points and labels are discarded.
         _X, _y = generator.generate_points(segment_labels, breaks, True)
         clusters_after_first = generator.clusters
         _X, _y = generator.generate_points(segment_labels, breaks, True)
         clusters_after_second = generator.clusters
         # NOTE(review): if `clusters` is one object mutated in place, this
         # comparison is trivially true -- confirm against RandomData.
         assert clusters_after_first == clusters_after_second
Пример #17
0
    fig.show()


# %% Generate data
# Settings for the synthetic series: feature count, the order in which the
# cluster labels appear, samples per segment, and the window size shared by
# RandomData and TICC below.
n_features = 5
label_seq = [0, 1, 2, 0, 2, 1]
samples_per_segment = 250
window_size = 8

# Derived from above params
k = len(set(label_seq))  # Num clusters
t = samples_per_segment * len(label_seq)  # total ts length
# Evenly spaced break points, one per segment; the last one equals t
breaks = [i * t // len(label_seq) for i in range(1, len(label_seq) + 1)]
# Map each feature index to a colour; `colors` is defined outside this section
palette = {n: c['color'] for n, c in zip(range(n_features), colors)}
randomdata = RandomData(seed=1234,
                        n_features=n_features,
                        window_size=window_size)
X, y_true = randomdata.generate_points(label_seq, breaks)

# Plot Synthetic Data
plot_synthetic_data(X, breaks)

# %% Fit TICC and GMM to data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
ticc = TICC(n_clusters=k, window_size=window_size, random_state=1234, beta=200)
gmm = GaussianMixture(n_components=k, random_state=1234)
# The GMM is fitted on the output of ticc.stack_data applied to the scaled data
X_stacked = ticc.stack_data(X_scaled)

# NOTE(review): TICC is fitted on the unscaled X while the GMM input is built
# from X_scaled -- confirm this asymmetry is intended.
y_ticc = ticc.fit_predict(X)
y_gmm = gmm.fit_predict(X_stacked)