def test_cnn_core_samples_toy_2(algorithm):
    # Cluster a fixed set of twelve 2D points with eps=1.5 and compare the
    # labels against a hand-written expectation for min_samples = 0..4.
    # The trailing numbers are the sample indices.
    points = [
        [0, 0],  # 0
        [1, 1],  # 1
        [1, 0],  # 2
        [0, -1],  # 3
        [0.5, -0.5],  # 4
        [2, 1.5],  # 5
        [2.5, -0.5],  # 6
        [4, 2],  # 7
        [4.5, 2.5],  # 8
        [5, -1],  # 9
        [5.5, -0.5],  # 10
        [5.5, -1.5],  # 11
    ]

    # (min_samples, expected labels), checked in increasing order.
    expectations = (
        (0, [0, 0, 0, 0, 0, 0, -1, 1, 1, 2, 2, 2]),
        (1, [0, 0, 0, 0, 0, -1, -1, -1, -1, 1, 1, 1]),
        (2, [0, -1, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1]),
        (3, [0, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1]),
        (4, [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]),
    )
    for threshold, expected in expectations:
        labels = commonnn(
            points, algorithm=algorithm, eps=1.5, min_samples=threshold
        )
        assert_array_equal(labels, expected)
def test_boundaries():
    # Boundary checks expressed through the indices of samples that received
    # a non-negative (i.e. non-noise) label.

    # min_samples must be inclusive of the core point
    labels = commonnn([[0], [1]], eps=2, min_samples=0)
    clustered = np.nonzero(labels >= 0)[0]
    assert 0 in clustered

    # eps must be inclusive of the circumference: at distance exactly 1
    # sample 0 is still clustered ...
    labels = commonnn([[0], [1], [1]], eps=1, min_samples=0)
    clustered = np.nonzero(labels >= 0)[0]
    assert 0 in clustered

    # ... while a slightly smaller radius leaves it out.
    labels = commonnn([[0], [1], [1]], eps=0.99, min_samples=0)
    clustered = np.nonzero(labels >= 0)[0]
    assert 0 not in clustered
def test_cnn_input_not_modified(use_sparse, metric):
    # Clustering must leave the caller's data untouched, for both dense
    # arrays and CSR matrices.
    rng = np.random.RandomState(0)
    data = rng.rand(10, 10)
    if use_sparse:
        data = sparse.csr_matrix(data)
    snapshot = data.copy()

    commonnn(data, metric=metric)

    if not use_sparse:
        assert_array_equal(data, snapshot)
        return

    # Sparse matrices are compared through their dense representation.
    assert_array_equal(data.toarray(), snapshot.toarray())
def test_cnn_sparse_precomputed(include_self):
    # A sparse radius-neighbours distance graph and the full dense distance
    # matrix must produce the same labels when passed as precomputed input.
    dense_distances = pairwise_distances(X)
    neighbors = NearestNeighbors(radius=0.9).fit(X)
    query = X if include_self else None
    sparse_distances = neighbors.radius_neighbors_graph(X=query, mode="distance")

    # Ensure it is sparse not merely on diagonals:
    n_samples = dense_distances.shape[0]
    assert sparse_distances.nnz < n_samples * (n_samples - 1)

    shared = {"eps": 0.8, "min_samples": 5, "metric": "precomputed"}
    labels_sparse = commonnn(sparse_distances, **shared)
    labels_dense = commonnn(dense_distances, **shared)
    assert_array_equal(labels_dense, labels_sparse)
def test_cnn_sparse_precomputed_different_eps():
    # A precomputed neighbours graph built with a radius larger than eps
    # must be filtered down, so both graphs give the same labels when
    # clustered with the smaller eps.
    lower_eps = 0.2
    higher_eps = lower_eps + 0.7

    results = []
    for radius in (lower_eps, higher_eps):
        neighbors = NearestNeighbors(radius=radius).fit(X)
        graph = neighbors.radius_neighbors_graph(X, mode="distance")
        # eps stays at the lower value regardless of the graph radius.
        results.append(commonnn(graph, eps=lower_eps, metric="precomputed"))

    cnn_lower, cnn_higher = results
    assert_array_equal(cnn_lower, cnn_higher)
def test_cnn_core_samples_toy_1(algorithm):
    # One-dimensional toy set clustered with eps=1.
    points = [[0], [2], [3], [4], [6], [8], [10]]

    cases = (
        # min_samples=0: only the points at 2, 3 and 4 are neighbours within
        # eps. A valid cluster needs more than one member, so every other
        # point is isolated and labelled as noise.
        (0, [-1, 0, 0, 0, -1, -1, -1]),
        # min_samples=1: the three samples of the denser area stop being
        # core samples (2 and 4 share 3 as a common neighbour but are not
        # neighbours of each other), so everything is noise.
        (1, [-1, -1, -1, -1, -1, -1, -1]),
    )
    for threshold, expected in cases:
        labels = commonnn(points, algorithm=algorithm, eps=1, min_samples=threshold)
        assert_array_equal(labels, expected)
def test_commonnn_similarity():
    # Cluster from a precomputed pairwise distance matrix scaled by its
    # maximum; function and estimator must both find n_clusters clusters.
    # Parameters chosen specifically for this task.
    params = {"metric": "precomputed", "eps": 0.15, "min_samples": 5}

    def count_clusters(labels):
        # number of distinct labels, ignoring noise (-1) if present
        return len(set(labels)) - int(-1 in labels)

    # Compute the normalised distances
    pairwise = distance.squareform(distance.pdist(X))
    pairwise /= np.max(pairwise)

    # Functional interface
    labels = commonnn(pairwise, **params)
    n_clusters_1 = count_clusters(labels)
    assert n_clusters_1 == n_clusters

    # Estimator interface
    model = CommonNNClustering(**params)
    labels = model.fit(pairwise).labels_
    n_clusters_2 = count_clusters(labels)
    assert n_clusters_2 == n_clusters
def test_cnn_callable():
    # Cluster with a callable metric through both interfaces.
    # Parameters chosen specifically for this task; eps differs from the
    # precomputed test because the distance is not normalised here.
    settings = {
        # the function reference itself, not the string key
        "metric": distance.euclidean,
        "eps": 0.8,
        "min_samples": 5,
        "algorithm": "ball_tree",
    }

    # Functional interface
    labels = commonnn(X, **settings)
    # number of clusters, ignoring noise if present
    noise_present = -1 in labels
    n_clusters_1 = len(set(labels)) - int(noise_present)
    assert n_clusters_1 == n_clusters

    # Estimator interface
    model = CommonNNClustering(**settings)
    labels = model.fit(X).labels_
    noise_present = -1 in labels
    n_clusters_2 = len(set(labels)) - int(noise_present)
    assert n_clusters_2 == n_clusters
def test_cnn_balltree():
    # Check the cluster count for a precomputed run and for several
    # tree-based estimator configurations.
    eps = 0.8
    min_samples = 5

    def count_clusters(labels):
        # number of clusters, ignoring noise if present
        return len(set(labels)) - int(-1 in labels)

    # Reference run on the full pairwise distance matrix.
    pairwise = pairwise_distances(X)
    labels = commonnn(pairwise, metric="precomputed", eps=eps, min_samples=min_samples)
    assert count_clusters(labels) == n_clusters

    # Estimator variants, exercised in this order.
    variants = (
        {"p": 2.0, "algorithm": "ball_tree"},
        {"p": 2.0, "algorithm": "kd_tree"},
        {"p": 1.0, "algorithm": "ball_tree"},
        {"leaf_size": 20, "algorithm": "ball_tree"},
    )
    for extra in variants:
        model = CommonNNClustering(eps=eps, min_samples=min_samples, **extra)
        labels = model.fit(X).labels_
        assert count_clusters(labels) == n_clusters
def test_cnn_feature():
    # Cluster directly from the feature array with the euclidean metric.
    # Parameters chosen specifically for this task; eps differs from the
    # precomputed test because the distance is not normalised here.
    params = {"metric": "euclidean", "eps": 0.8, "min_samples": 5}

    # Functional interface
    labels = commonnn(X, **params)
    # number of clusters, ignoring noise if present
    distinct = set(labels)
    n_clusters_1 = len(distinct) - int(-1 in labels)
    assert n_clusters_1 == n_clusters

    # Estimator interface
    model = CommonNNClustering(**params)
    labels = model.fit(X).labels_
    distinct = set(labels)
    n_clusters_2 = len(distinct) - int(-1 in labels)
    assert n_clusters_2 == n_clusters
def test_cnn_sparse():
    # The LIL sparse version of X must yield the same labels as X itself.
    params = {"eps": 0.8, "min_samples": 5}
    as_lil = sparse.lil_matrix(X)
    labels_sparse = commonnn(as_lil, **params)
    labels_dense = commonnn(X, **params)
    assert_array_equal(labels_dense, labels_sparse)
def test_weighted_cnn():
    # Exercise the sample_weight argument: validation of its length, its
    # effect on tiny two-point inputs, equivalence with repeating samples,
    # and support through precomputed distances and the estimator API.

    # ensure sample_weight is validated (length must match the samples)
    with pytest.raises(ValueError):
        commonnn([[0], [1]], sample_weight=[2])
    with pytest.raises(ValueError):
        commonnn([[0], [1]], sample_weight=[2, 3, 4])

    # NOTE(review): further down in this test the return value of commonnn
    # is used as a single label array (label1), so the `[0]` below selects
    # the first label rather than a list of core indices - confirm that the
    # expected values in the following assertions are what is intended.

    # ensure sample_weight has an effect
    assert_array_equal(
        [], commonnn([[0], [1]], sample_weight=None, min_samples=6)[0]
    )
    assert_array_equal(
        [], commonnn([[0], [1]], sample_weight=[5, 5], min_samples=6)[0]
    )
    assert_array_equal(
        [0], commonnn([[0], [1]], sample_weight=[6, 5], min_samples=6)[0]
    )
    assert_array_equal(
        [0, 1], commonnn([[0], [1]], sample_weight=[6, 6], min_samples=6)[0]
    )

    # points within eps of each other:
    assert_array_equal(
        [0, 1],
        commonnn([[0], [1]], eps=1.5, sample_weight=[5, 1], min_samples=6)[0],
    )
    # and effect of non-positive and non-integer sample_weight:
    assert_array_equal(
        [],
        commonnn([[0], [1]], sample_weight=[5, 0], eps=1.5, min_samples=6)[0],
    )
    assert_array_equal(
        [0, 1],
        commonnn([[0], [1]], sample_weight=[5.9, 0.1], eps=1.5, min_samples=6)[0],
    )
    assert_array_equal(
        [0, 1],
        commonnn([[0], [1]], sample_weight=[6, 0], eps=1.5, min_samples=6)[0],
    )
    assert_array_equal(
        [],
        commonnn([[0], [1]], sample_weight=[6, -1], eps=1.5, min_samples=6)[0],
    )

    # for non-negative sample_weight, cores should be identical to repetition
    rng = np.random.RandomState(42)
    sample_weight = rng.randint(0, 5, X.shape[0])
    label1 = commonnn(X, sample_weight=sample_weight)
    assert len(label1) == len(X)

    X_repeated = np.repeat(X, sample_weight, axis=0)
    label_repeated = commonnn(X_repeated)
    # Boolean masks of the samples that received a non-noise label.
    core_repeated_mask = np.zeros(X_repeated.shape[0], dtype=bool)
    core_repeated_mask[np.where(label_repeated >= 0)[0]] = True
    core_mask = np.zeros(X.shape[0], dtype=bool)
    core_mask[np.where(label1 >= 0)[0]] = True
    assert_array_equal(np.repeat(core_mask, sample_weight), core_repeated_mask)

    # sample_weight should work with precomputed distance matrix
    D = pairwise_distances(X)
    # The result is one label array (same as label1 above), so it is bound
    # to a single name instead of being unpacked into two values.
    label3 = commonnn(D, sample_weight=sample_weight, metric="precomputed")
    assert_array_equal(label1, label3)

    # sample_weight should work with estimator
    est = CommonNNClustering().fit(X, sample_weight=sample_weight)
    label4 = est.labels_
    assert_array_equal(label1, label4)

    est = CommonNNClustering()
    label5 = est.fit_predict(X, sample_weight=sample_weight)
    assert_array_equal(label1, label5)
    assert_array_equal(label1, est.labels_)
def test_cnn_badargs(args):
    # Bad argument values: expanding ``args`` as keyword arguments into
    # commonnn must raise a ValueError.
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        commonnn(X, **args)