def test_label_consistency_w_different_seeds(self):
    """Check that two TICC models differing only in ``random_state`` agree.

    For every case a synthetic series is generated once, two models are
    built with different seeds, and their ``fit_predict`` outputs must be
    element-wise identical.
    """
    # Each case: (seed1, seed2, n_features, label order)
    cases = [
        (0, 1, 5, [0, 1, 2]),
        (3, 2, 5, [0, 1, 2, 1, 0]),
        (0, 1, 5, [0, 1, 2, 3, 0, 1, 2, 3]),
        (0, 9, 5, [0, 1, 1, 0, 2, 2, 2, 0]),
    ]
    for seed_a, seed_b, n_features, label_seq in cases:
        generator = RandomData(0, n_features, window_size=5)
        n_segments = len(label_seq)
        n_clusters = len(set(label_seq))
        total_len = 200 * n_clusters * n_segments  # total ts length
        # Equally spaced segment end points; the last one equals total_len.
        breaks = [
            idx * total_len // n_segments
            for idx in range(1, n_segments + 1)
        ]
        X, _ = generator.generate_points(label_seq, breaks)

        model_a = TICC(n_clusters=n_clusters, window_size=5, n_jobs=4,
                       random_state=seed_a)
        model_b = TICC(n_clusters=n_clusters, window_size=5, n_jobs=4,
                       random_state=seed_b)
        predictions = [model.fit_predict(X) for model in (model_a, model_b)]
        np.testing.assert_array_equal(*predictions)
def test_pass_if_consistent_on_similar_random_data(self):
    """Check that predictions on repeated draws disagree on < 2% of points.

    Several datasets are drawn from one ``RandomData`` instance after a
    single ``generate_cluster_params`` call. One TICC model is re-fit on
    each dataset, and every pair of predictions is compared.
    """
    # Each case: (n_features, label order, number of datasets)
    cases = [
        (5, [0, 1, 0], 5),
        (5, [0, 1, 2, 1, 0], 5),
    ]
    for n_features, label_seq, n_repeats in cases:
        generator = RandomData(seed=0, n_features=n_features, window_size=5)
        n_segments = len(label_seq)
        n_clusters = len(set(label_seq))
        total_len = 200 * n_clusters * n_segments
        breaks = [
            idx * total_len // n_segments
            for idx in range(1, n_segments + 1)
        ]

        generator.generate_cluster_params(n_clusters)
        # Reuse same cluster parameters for each dataset (third argument
        # True -- presumably a "recycle clusters" flag; confirm against
        # RandomData.generate_points).
        datasets = []
        for _ in range(n_repeats):
            points = generator.generate_points(label_seq, breaks, True)[0]
            datasets.append(points)

        model = TICC(n_clusters=n_clusters, window_size=5, beta=300,
                     n_jobs=4, random_state=0, cluster_reassignment=0.3,
                     verbose=True)
        predictions = [model.fit_predict(points) for points in datasets]

        for left, right in combinations(predictions, 2):
            mismatch_ratio = np.not_equal(left, right).sum() / total_len
            assert mismatch_ratio < 0.02
def test_pass_if_y_size_correct(self):
    """Check the label vector is 1-D with length equal to the last break."""
    # Each case: (segment labels, break points)
    cases = (
        ([0, 1, 0], [20, 40, 60]),
        ([0, 1, 0, 1], [20, 40, 500, 600]),
    )
    for segments, break_points in cases:
        generator = RandomData(window_size=5, n_features=5)
        _, labels = generator.generate_points(segments, break_points)
        expected_shape = (break_points[-1],)
        assert labels.shape == expected_shape
def test_pass_if_y_labels_correct(self):
    """Check each segment's label is repeated up to its break point."""
    # Each case: (segment labels, break points, expected per-sample labels)
    cases = (
        ([0, 1, 0], [3, 7, 10], [0, 0, 0, 1, 1, 1, 1, 0, 0, 0]),
        (
            [1, 0, 1, 0],
            [5, 10, 15, 20],
            [1] * 5 + [0] * 5 + [1] * 5 + [0] * 5,
        ),
    )
    for segments, break_points, expected_labels in cases:
        generator = RandomData(window_size=5, n_features=5)
        _, labels = generator.generate_points(segments, break_points)
        assert labels.tolist() == expected_labels
def test_pass_if_consistent_with_same_seed(self):
    """Check two fresh generators with the same seed give identical output."""
    for seed in (0, 1, 10):
        segments, break_points = [0, 1, 0], [20, 40, 60]
        first = RandomData(seed).generate_points(segments, break_points)
        second = RandomData(seed).generate_points(segments, break_points)
        X_first, y_first = first
        X_second, y_second = second
        # Points first, then labels.
        np.testing.assert_array_equal(X_first, X_second)
        np.testing.assert_array_equal(y_first, y_second)
def test_split_theta(self):
    """Check ``split_theta(w)`` yields the ``w`` blocks of the first block column.

    Block ``i`` must be ``n x n`` and equal rows ``i*n:(i+1)*n`` of the
    first ``n`` columns of ``MRF_``.
    """
    # Each case: (n_features, window_size)
    for n_features, window in [(5, 8)]:
        generator = RandomData(0, n_features=n_features, window_size=window)
        cluster = _TICCluster(1)
        cluster.MRF_ = generator.block_toeplitz()

        blocks = cluster.split_theta(window)
        assert len(blocks) == window

        row_starts = range(0, n_features * window, n_features)
        for row_start, block in zip(row_starts, blocks):
            assert block.shape == (n_features, n_features)
            row_stop = row_start + n_features
            reference = cluster.MRF_[row_start:row_stop, :n_features]
            assert_array_equal(block, reference)
def test_pass_if_block_dims_correct(self):
    """Check ``block_toeplitz()`` is square with side ``n_features * window_size``."""
    # Each case: (n_features, window_size)
    cases = ((5, 10), (10, 10), (10, 1), (1, 1), (1, 10))
    for n_features, window in cases:
        generator = RandomData(n_features=n_features, window_size=window)
        side = n_features * window
        assert generator.block_toeplitz().shape == (side, side)
def test_pass_if_generates_diff_points_each_call(self):
    """Check repeated calls on one generator give new points, same labels."""
    for seed in (0, 10, 100):
        segments, break_points = [0, 1, 0], [20, 40, 60]
        generator = RandomData(seed)
        X_first, y_first = generator.generate_points(segments, break_points)
        X_second, y_second = generator.generate_points(segments, break_points)

        # The equality check itself must fail: the points have to differ.
        with np.testing.assert_raises(AssertionError):
            np.testing.assert_array_equal(X_first, X_second)

        # We want different points but same labels!
        np.testing.assert_array_equal(y_first, y_second)
def test_fit_predict(self):
    """Check ``fit(X).predict(X)`` and ``fit_predict(X)`` give the same labels."""
    generator = RandomData(0, 5, 5)
    label_seq = [0, 1, 2, 3, 4, 5]
    n_segments = len(label_seq)
    # Six equally spaced break points over a series of length 1200.
    breaks = [idx * 1200 // n_segments for idx in range(1, n_segments + 1)]
    X, _ = generator.generate_points(label_seq, breaks)

    model = TICC(n_clusters=6, window_size=5, beta=0, n_jobs=4,
                 verbose=True, random_state=0)
    two_step = model.fit(X).predict(X)
    one_step = model.fit_predict(X)
    np.testing.assert_array_equal(two_step, one_step)
def test_score_increases_with_f1(self):
    """Check the best-scoring model is also the one with the best F1.

    A 3x3 grid over ``beta`` and ``window_size`` is fit in parallel; the
    index of the maximum ``score(X)`` must equal the index of the maximum
    ``best_f1`` against the true labels.
    """
    generator = RandomData(0, 5, 5)
    samples_per_segment = 300
    label_seq = [0, 1, 2, 3]
    n_segments = len(label_seq)
    n_clusters = len(set(label_seq))
    total_len = samples_per_segment * n_clusters * n_segments
    breaks = [
        idx * total_len // n_segments
        for idx in range(1, n_segments + 1)
    ]
    X, y_true = generator.generate_points(label_seq, breaks)

    grid = ParameterGrid({
        'beta': [0, 500, 1000],
        'window_size': [1, 5, 10],
    })
    unfitted = (TICC(**settings, random_state=0) for settings in grid)
    fitted = Parallel(n_jobs=-1)(
        delayed(lambda x: x.fit(X))(candidate) for candidate in unfitted
    )

    scores = [candidate.score(X) for candidate in fitted]
    f1_scores = [
        best_f1(y_true, candidate.predict(X)) for candidate in fitted
    ]

    best_by_score = scores.index(max(scores))
    best_by_f1 = f1_scores.index(max(f1_scores))
    assert best_by_score == best_by_f1
def test_fail_if_input_error(self):
    """Check ``generate_points`` rejects mismatched label and break lists.

    Every case pairs a label sequence with a break list of a different
    length, and the call must raise ``ValueError``.

    The previous version wrapped the call in ``try/except ValueError:
    assert True``, which also passed when no exception was raised, so it
    verified nothing. The test now fails explicitly in that situation.
    An exact duplicate case was removed.
    """
    # Each case: (segment labels, break points) with unequal lengths
    test_cases = [
        ([0, 1], [100, 300, 900]),
        ([0, 1, 3, 5], [100, 300, 450]),
    ]
    for seg, b in test_cases:
        try:
            RandomData().generate_points(seg, b)
        except ValueError:
            pass  # expected: the invalid input was rejected
        else:
            raise AssertionError(
                f"generate_points({seg!r}, {b!r}) did not raise ValueError"
            )
def test_pass_if_positive_definite(self):
    """Check the block Toeplitz matrix admits a Cholesky factorization.

    ``np.linalg.cholesky`` raises when the factorization fails, so
    reaching the assertion already means it succeeded.
    """
    # Each case: (n_features, window_size)
    cases = ((5, 10), (10, 10), (3, 4), (5, 1))
    for n_features, window in cases:
        # Block Toeplitz Matrix Theta
        theta = RandomData(None, n_features, window).block_toeplitz()
        factor = np.linalg.cholesky(theta)
        assert bool(factor.shape)
def test_matches_original_paper_macro_F1(self):
    """Check TICC's macro F1 on synthetic data beats the per-case threshold."""
    # Original paper code performs at 100p/cluster!
    samples_per_segment = 120
    # Each case: (n_features, label order, macro f1 to beat)
    cases = [
        (5, [0, 1, 0], 0.9),
        (5, [0, 1, 2, 1, 0], 0.9),
        (5, [0, 1, 2, 3, 0, 1, 2, 3], 0.9),
        (5, [0, 1, 1, 0, 2, 2, 2, 0], 0.9),
    ]
    for n_features, label_seq, threshold in cases:
        generator = RandomData(0, n_features, window_size=5)
        n_segments = len(label_seq)
        n_clusters = len(set(label_seq))
        total_len = samples_per_segment * n_clusters * n_segments
        breaks = [
            idx * total_len // n_segments
            for idx in range(1, n_segments + 1)
        ]
        X, y_true = generator.generate_points(label_seq, breaks)

        model = TICC(n_clusters=n_clusters, window_size=5, n_jobs=4,
                     random_state=0)
        y_pred = model.fit_predict(X)

        # We use best_f1 because label:segment assignments are arbitrary
        macro_f1 = best_f1(y_true, y_pred, average='macro')
        assert macro_f1 > threshold
def test_pass_if_diags_symmetric(self):
    """Check every diagonal block of the block Toeplitz matrix is symmetric."""
    # Each case: (n_features, window_size)
    cases = ((5, 10), (10, 10), (3, 4), (5, 1))
    for n_features, window in cases:
        # Block Toeplitz Matrix Theta
        theta = RandomData(None, n_features, window).block_toeplitz()
        # Cut into `window` block columns, then each column into `window`
        # row blocks; entry [i][i] is the i-th diagonal block.
        block_columns = np.split(theta, window, axis=1)
        for position, column in enumerate(block_columns):
            diagonal_block = np.split(column, window)[position]
            np.testing.assert_array_equal(diagonal_block.T, diagonal_block)
def test_pass_if_block_toeplitz(self):
    """Check each block equals its lower-right diagonal neighbour.

    This is the block Toeplitz property: block ``[i][j]`` must match block
    ``[i + 1][j + 1]`` for every pair of indices where both exist.
    """
    # Each case: (n_features, window_size)
    cases = ((5, 10), (10, 10), (3, 4), (5, 1))
    for n_features, window in cases:
        # Block Toeplitz Matrix Theta
        theta = RandomData(None, n_features, window).block_toeplitz()
        # grid[i][j]: j-th row block of the i-th block column.
        grid = []
        for column in np.split(theta, window, axis=1):
            grid.append(np.split(column, window))

        index_pairs = (
            (i, j) for i in range(window - 1) for j in range(window - 1)
        )
        for i, j in index_pairs:
            np.testing.assert_array_equal(grid[i][j], grid[i + 1][j + 1])
def test_recycling_clusters_between_calls(self):
    """Check ``rdata.clusters`` compares equal after two recycled draws.

    Cluster parameters are generated once, then ``generate_points`` is
    called twice with ``True`` as its third argument (presumably a
    "recycle clusters" flag; confirm against RandomData.generate_points).
    """
    for seed in (0, 10, 100):
        segments, break_points = [0, 1, 0], [20, 40, 60]
        rdata = RandomData(seed)
        rdata.generate_cluster_params(len(set(segments)))

        # NOTE(review): if `clusters` is one object mutated in place, both
        # snapshots alias it and the comparison is trivially true -- verify.
        snapshots = []
        for _ in range(2):
            _points, _labels = rdata.generate_points(
                segments, break_points, True
            )
            snapshots.append(rdata.clusters)

        clusters_first, clusters_second = snapshots
        assert clusters_first == clusters_second
fig.show() # %% Generate data n_features = 5 label_seq = [0, 1, 2, 0, 2, 1] samples_per_segment = 250 window_size = 8 # Derived from above params k = len(set(label_seq)) # Num clusters t = samples_per_segment * len(label_seq) # total ts length breaks = [i * t // len(label_seq) for i in range(1, len(label_seq) + 1)] palette = {n: c['color'] for n, c in zip(range(n_features), colors)} randomdata = RandomData(seed=1234, n_features=n_features, window_size=window_size) X, y_true = randomdata.generate_points(label_seq, breaks) # Plot Synthetic Data plot_synthetic_data(X, breaks) # %% Fit TICC and GMM to data scaler = StandardScaler() X_scaled = scaler.fit_transform(X) ticc = TICC(n_clusters=k, window_size=window_size, random_state=1234, beta=200) gmm = GaussianMixture(n_components=k, random_state=1234) X_stacked = ticc.stack_data(X_scaled) y_ticc = ticc.fit_predict(X) y_gmm = gmm.fit_predict(X_stacked)