def test_barnes_hut_angle():
    """Check that Barnes-Hut with ``angle=0`` agrees with the exact method.

    For 2 and 3 embedding dimensions, the joint probabilities and the KL
    divergence computed through the dense (exact) code path are compared
    with those computed through the sparse Barnes-Hut code path. The
    gradients returned by both divergence functions are not compared.
    """
    # With angle=0 Barnes-Hut corresponds to the exact method.
    exact_angle = 0.0
    perplexity = 10
    n_samples = 100
    n_features = 5
    # Every other sample is used as a neighbor of each point.
    n_neighbors = n_samples - 1

    for n_components in (2, 3):
        degrees_of_freedom = float(n_components - 1.0)

        rng = check_random_state(0)
        data = rng.randn(n_samples, n_features)
        dense_distances = pairwise_distances(data)
        params = rng.randn(n_samples, n_components)

        # Dense code path.
        P = _joint_probabilities(dense_distances, perplexity, verbose=0)
        kl_exact, _ = _kl_divergence(
            params, P, degrees_of_freedom, n_samples, n_components)

        # Sparse (nearest-neighbors) code path.
        knn = NearestNeighbors().fit(data)
        distances_csr = knn.kneighbors_graph(
            n_neighbors=n_neighbors, mode='distance')
        P_bh = _joint_probabilities_nn(distances_csr, perplexity, verbose=0)
        kl_bh, _ = _kl_divergence_bh(
            params, P_bh, degrees_of_freedom, n_samples, n_components,
            angle=exact_angle, skip_num_points=0, verbose=0)

        # Bring both probability representations to a dense square matrix.
        assert_array_almost_equal(P_bh.toarray(), squareform(P), decimal=5)
        assert_almost_equal(kl_exact, kl_bh, decimal=3)
def test_trustworthiness_not_euclidean_metric():
    """Trustworthiness with a metric other than 'euclidean'/'precomputed'.

    The score obtained by naming the metric ('cosine') must equal the score
    obtained by passing the matching precomputed distance matrix.
    """
    rng = check_random_state(0)
    X = rng.randn(100, 2)

    score_named_metric = trustworthiness(X, X, metric='cosine')

    cosine_distances = pairwise_distances(X, metric='cosine')
    score_precomputed = trustworthiness(
        cosine_distances, X, metric='precomputed')

    assert score_named_metric == score_precomputed
def _run_answer_test(pos_input, pos_output, neighbors, grad_output,
                     verbose=False, perplexity=0.1, skip_num_points=0):
    """Compare the Barnes-Hut gradient against a reference gradient.

    Parameters
    ----------
    pos_input : ndarray
        Input points; their pairwise distances are used to build the joint
        probabilities.
    pos_output : ndarray
        Embedding positions at which the gradient is evaluated. Converted to
        float32 before use.
    neighbors : ndarray
        Kept for interface compatibility. The neighbor indices actually
        passed to the gradient routine are taken from the sparse joint
        probability matrix built below, so this value is not used.
    grad_output : ndarray
        Expected gradient; compared to 4 decimals.
    verbose : bool, default=False
        Forwarded to ``_joint_probabilities``.
    perplexity : float, default=0.1
        Forwarded to ``_joint_probabilities``.
    skip_num_points : int, default=0
        Forwarded to ``_barnes_hut_tsne.gradient``.

    Raises
    ------
    AssertionError
        If the computed gradient differs from ``grad_output``.
    """
    # Local import kept from the original block: the file header is not
    # guaranteed to import csr_matrix.
    from scipy.sparse import csr_matrix

    distances = pairwise_distances(pos_input).astype(np.float32)
    pos_output = pos_output.astype(np.float32)

    pij_input = _joint_probabilities(distances, perplexity, verbose)
    pij_input = squareform(pij_input).astype(np.float32)
    # Output buffer filled in place by the gradient routine.
    grad_bh = np.zeros(pos_output.shape, dtype=np.float32)

    P = csr_matrix(pij_input)
    neighbors = P.indices.astype(np.int64)
    indptr = P.indptr.astype(np.int64)

    # NOTE(review): the positional 0.5, 2, 1 are presumably the angle, the
    # number of dimensions and a verbosity flag -- confirm against the
    # _barnes_hut_tsne.gradient signature.
    # skip_num_points used to be hard-coded to 0 here, which silently
    # ignored the caller's argument.
    _barnes_hut_tsne.gradient(P.data, pos_output, neighbors, indptr,
                              grad_bh, 0.5, 2, 1,
                              skip_num_points=skip_num_points)
    assert_array_almost_equal(grad_bh, grad_output, decimal=4)
def test_knn_imputer_distance_weighted_not_enough_neighbors(
        na, working_memory):
    """Distance-weighted KNN imputation when ``n_neighbors`` exceeds donors.

    The expected values are built from rows 3 and 4 only (the rows without
    missing entries), while the imputer is asked for 3 and then 4 neighbors.
    Both settings must produce the same imputed matrix.
    """
    X = np.array([[3, na],
                  [2, na],
                  [na, 4],
                  [5, 6],
                  [6, 8],
                  [na, 5]])

    dist = pairwise_distances(X, metric="nan_euclidean",
                              squared=False, missing_values=na)

    # Rows used as donors in the expected values.
    donors = slice(3, 5)

    def _expected(row, col):
        # Inverse-distance weighted mean of the donor values in ``col``.
        return np.average(X[donors, col], weights=1 / dist[row, donors])

    X_expected = np.array([[3, _expected(0, 1)],
                           [2, _expected(1, 1)],
                           [_expected(2, 0), 4],
                           [5, 6],
                           [6, 8],
                           [_expected(5, 0), 5]])

    with config_context(working_memory=working_memory):
        for n_neighbors in (3, 4):
            imputer = KNNImputer(missing_values=na, n_neighbors=n_neighbors,
                                 weights='distance')
            assert_allclose(imputer.fit_transform(X), X_expected)
def test_precomputed_dists():
    """OPTICS gives the same result on raw data and on precomputed distances.

    Uses every second row of the module-level ``X``.
    """
    subsample = X[::2]
    euclidean_dists = pairwise_distances(subsample, metric='euclidean')

    # Settings shared by both estimators; only the metric differs.
    shared = dict(min_samples=10, algorithm='brute')
    from_precomputed = OPTICS(metric='precomputed', **shared)
    from_precomputed = from_precomputed.fit(euclidean_dists)
    from_raw = OPTICS(metric='euclidean', **shared).fit(subsample)

    assert_allclose(from_precomputed.reachability_, from_raw.reachability_)
    assert_array_equal(from_precomputed.labels_, from_raw.labels_)
def test_binary_search():
    """The binary search finds Gaussians with the desired perplexity.

    The perplexity of each row of conditional probabilities is computed as
    ``exp(-sum(p * log(p)))`` and the mean over rows is compared with the
    requested perplexity to 3 decimals.
    """
    target_perplexity = 25.0

    rng = check_random_state(0)
    data = rng.randn(50, 5)
    distances = pairwise_distances(data).astype(np.float32)

    P = _binary_search_perplexity(distances, target_perplexity, verbose=0)
    # Clip from below so that log(P) stays finite.
    P = np.maximum(P, np.finfo(np.double).eps)

    row_perplexities = []
    for i in range(P.shape[0]):
        row = P[i]
        row_perplexities.append(np.exp(-np.sum(row * np.log(row))))
    mean_perplexity = np.mean(row_perplexities)

    assert_almost_equal(mean_perplexity, target_perplexity, decimal=3)
def test_small_distance_threshold():
    """Single linkage with a small threshold leaves every sample on its own.

    The precondition check and the clustering share one
    ``distance_threshold`` value. Previously the check used ``.1`` while the
    clustering used ``1.``, so the check did not guard the value under test.
    """
    rng = np.random.RandomState(0)
    n_samples = 10
    distance_threshold = 1.
    X = rng.randint(-300, 300, size=(n_samples, 3))

    # This should result in all data in their own clusters, given that
    # their pairwise distances are at least ``distance_threshold`` (which
    # may not be the case with a different random seed).
    clustering = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=distance_threshold,
        linkage="single").fit(X)

    # Check that the pairwise distances are indeed all at least the
    # threshold. The diagonal (self-distances of 0) is excluded.
    all_distances = pairwise_distances(X, metric='minkowski', p=2)
    np.fill_diagonal(all_distances, np.inf)
    assert np.all(all_distances >= distance_threshold)

    assert clustering.n_clusters_ == n_samples
def test_dbscan_sparse_precomputed(include_self):
    """DBSCAN on a sparse precomputed graph matches the dense result.

    The sparse graph is a radius-neighbors graph (radius 0.9) of the
    module-level ``X``; DBSCAN is then run with ``eps=0.8`` on both the
    sparse graph and the full dense distance matrix.

    Parameters
    ----------
    include_self : bool
        When true, ``X`` itself is passed as the query set of
        ``radius_neighbors_graph``; otherwise ``None`` is passed.
    """
    D = pairwise_distances(X)
    nn = NearestNeighbors(radius=.9).fit(X)

    if include_self:
        query = X
    else:
        query = None
    D_sparse = nn.radius_neighbors_graph(X=query, mode='distance')

    # Ensure it is sparse not merely on diagonals:
    n_points = D.shape[0]
    assert D_sparse.nnz < n_points * (n_points - 1)

    dbscan_kwargs = dict(eps=.8, min_samples=10, metric='precomputed')
    core_sparse, labels_sparse = dbscan(D_sparse, **dbscan_kwargs)
    core_dense, labels_dense = dbscan(D, **dbscan_kwargs)

    assert_array_equal(core_dense, core_sparse)
    assert_array_equal(labels_dense, labels_sparse)
def test_sparse_precomputed_distance():
    """Make sure that TSNE works identically for sparse and dense matrix"""
    random_state = check_random_state(0)
    X = random_state.randn(100, 2)

    # With n_neighbors equal to the number of samples and include_self=True
    # the neighbors graph is expected to hold every pairwise distance; this
    # is verified against the dense matrix below.
    D_sparse = kneighbors_graph(X, n_neighbors=100, mode='distance',
                                include_self=True)
    D = pairwise_distances(X)
    assert sp.issparse(D_sparse)
    # Use .toarray() rather than the deprecated `.A` shortcut, which newer
    # SciPy releases no longer provide.
    assert_almost_equal(D_sparse.toarray(), D)

    tsne = TSNE(metric="precomputed", random_state=0)
    Xt_dense = tsne.fit_transform(D)

    for fmt in ['csr', 'lil']:
        Xt_sparse = tsne.fit_transform(D_sparse.asformat(fmt))
        assert_almost_equal(Xt_dense, Xt_sparse)
def test_binary_search_neighbors():
    """Binary perplexity search on nearest neighbors vs. on all distances.

    Should be approximately equal to the slow method when we use all points
    as neighbors, and the largest probabilities should still agree when
    fewer neighbors are used.
    """
    n_samples = 200
    desired_perplexity = 25.0
    random_state = check_random_state(0)
    data = random_state.randn(n_samples, 2).astype(np.float32, copy=False)
    distances = pairwise_distances(data)
    P1 = _binary_search_perplexity(distances, desired_perplexity, verbose=0)

    # Test that when we use all the neighbors the results are identical
    n_neighbors = n_samples - 1
    nn = NearestNeighbors().fit(data)
    distance_graph = nn.kneighbors_graph(n_neighbors=n_neighbors,
                                         mode='distance')
    distances_nn = distance_graph.data.astype(np.float32, copy=False)
    distances_nn = distances_nn.reshape(n_samples, n_neighbors)
    P2 = _binary_search_perplexity(distances_nn, desired_perplexity,
                                   verbose=0)

    # Reorder each row of the dense result to follow the column order of
    # the neighbors graph so it can be compared entry by entry with P2.
    indptr = distance_graph.indptr
    P1_nn = np.array([
        P1[k, distance_graph.indices[indptr[k]:indptr[k + 1]]]
        for k in range(n_samples)
    ])
    assert_array_almost_equal(P1_nn, P2, decimal=4)

    # Test that the highest P_ij are the same when fewer neighbors are used.
    # The descending sort of P1 does not depend on k, so it is done once.
    # (The loop used to re-assert P1_nn vs. P2 at decimal=2 on every
    # iteration; that check is loop-invariant and already covered above at
    # decimal=4, so it was dropped.)
    P1_flat = P1.ravel()
    P1_descending = P1_flat[np.argsort(P1_flat)[::-1]]
    for k in np.linspace(150, n_samples - 1, 5):
        k = int(k)
        topn = k * 10  # check the top 10 * k entries out of k * k entries
        distance_graph = nn.kneighbors_graph(n_neighbors=k, mode='distance')
        distances_nn = distance_graph.data.astype(np.float32, copy=False)
        distances_nn = distances_nn.reshape(n_samples, k)
        P2k = _binary_search_perplexity(distances_nn, desired_perplexity,
                                        verbose=0)

        P2k_flat = P2k.ravel()
        P2k_descending = P2k_flat[np.argsort(P2k_flat)[::-1]]
        assert_array_almost_equal(P1_descending[:topn],
                                  P2k_descending[:topn], decimal=2)
def test_dbscan_balltree():
    """DBSCAN with tree-based neighbor search finds the expected clusters.

    The cluster count from a precomputed distance matrix and from several
    ``DBSCAN`` configurations (ball tree / kd tree, different ``p`` and
    ``leaf_size``) is compared with the module-level ``n_clusters``.
    """
    eps = 0.8
    min_samples = 10

    def _count_clusters(labels):
        # number of clusters, ignoring noise if present
        return len(set(labels)) - int(-1 in labels)

    D = pairwise_distances(X)
    _, labels = dbscan(D, metric="precomputed", eps=eps,
                       min_samples=min_samples)
    assert _count_clusters(labels) == n_clusters

    estimator_configs = (
        dict(p=2.0, algorithm='ball_tree'),
        dict(p=2.0, algorithm='kd_tree'),
        dict(p=1.0, algorithm='ball_tree'),
        dict(leaf_size=20, algorithm='ball_tree'),
    )
    for config in estimator_configs:
        db = DBSCAN(eps=eps, min_samples=min_samples, **config)
        labels = db.fit(X).labels_
        assert _count_clusters(labels) == n_clusters
def test_cluster_distances_with_distance_threshold():
    """Check distances inside and between clusters against the threshold.

    For single linkage with ``distance_threshold=4``: within a cluster of
    more than one point, every point's nearest in-cluster neighbor must be
    closer than the threshold; every point's nearest neighbor outside its
    cluster must be at least the threshold away.
    """
    rng = np.random.RandomState(0)
    n_samples = 100
    distance_threshold = 4
    X = rng.randint(-10, 10, size=(n_samples, 3))

    model = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=distance_threshold,
        linkage="single")
    labels = model.fit(X).labels_

    D = pairwise_distances(X, metric="minkowski", p=2)
    # to avoid taking the 0 diagonal in min()
    np.fill_diagonal(D, np.inf)

    for label in np.unique(labels):
        members = labels == label
        member_rows = D[members]

        within = member_rows[:, members]
        max_in_cluster_distance = within.min(axis=0).max()

        across = member_rows[:, ~members]
        min_out_cluster_distance = across.min(axis=0).min()

        # single data point clusters only have that inf diagonal here
        if members.sum() > 1:
            assert max_in_cluster_distance < distance_threshold
        assert min_out_cluster_distance >= distance_threshold
def test_agglomerative_clustering():
    """Check AgglomerativeClustering across linkages, caching and metrics.

    Covers: the number of clusters found for every linkage, caching through
    ``memory``, early stopping of the tree building, fitting without
    connectivity, rejection of invalid configurations, agreement between
    structured and unstructured fits for every paired distance, and
    agreement between raw data and a precomputed distance matrix.
    """
    rng = np.random.RandomState(0)
    # Builtin ``bool`` instead of the removed ``np.bool`` alias.
    mask = np.ones([10, 10], dtype=bool)
    n_samples = 100
    X = rng.randn(n_samples, 50)
    connectivity = grid_to_graph(*mask.shape)
    for linkage in ("ward", "complete", "average", "single"):
        clustering = AgglomerativeClustering(n_clusters=10,
                                             connectivity=connectivity,
                                             linkage=linkage)
        clustering.fit(X)

        # Test caching. The directory is created before entering ``try`` so
        # that ``finally`` never refers to an unbound name if creation fails.
        tempdir = mkdtemp()
        try:
            clustering = AgglomerativeClustering(
                n_clusters=10, connectivity=connectivity,
                memory=tempdir,
                linkage=linkage)
            clustering.fit(X)
            labels = clustering.labels_
            assert np.size(np.unique(labels)) == 10
        finally:
            shutil.rmtree(tempdir)

        # Turn caching off now
        clustering = AgglomerativeClustering(
            n_clusters=10, connectivity=connectivity, linkage=linkage)
        # Check that we obtain the same solution with early-stopping of the
        # tree building
        clustering.compute_full_tree = False
        clustering.fit(X)
        assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
                                                         labels), 1)
        clustering.connectivity = None
        clustering.fit(X)
        assert np.size(np.unique(clustering.labels_)) == 10

        # Check that we raise a ValueError when the connectivity matrix is
        # truncated to 10 x 10 while X has 100 samples.
        clustering = AgglomerativeClustering(
            n_clusters=10,
            connectivity=sparse.lil_matrix(
                connectivity.toarray()[:10, :10]),
            linkage=linkage)
        with pytest.raises(ValueError):
            clustering.fit(X)

    # Test that using ward with another metric than euclidean raises an
    # exception
    clustering = AgglomerativeClustering(
        n_clusters=10,
        connectivity=connectivity.toarray(),
        affinity="manhattan",
        linkage="ward")
    with pytest.raises(ValueError):
        clustering.fit(X)

    # Test using another metric than euclidean works with linkage complete
    for affinity in PAIRED_DISTANCES:
        # Compare our (structured) implementation to scipy
        clustering = AgglomerativeClustering(
            n_clusters=10,
            connectivity=np.ones((n_samples, n_samples)),
            affinity=affinity,
            linkage="complete")
        clustering.fit(X)
        clustering2 = AgglomerativeClustering(
            n_clusters=10,
            connectivity=None,
            affinity=affinity,
            linkage="complete")
        clustering2.fit(X)
        assert_almost_equal(normalized_mutual_info_score(clustering2.labels_,
                                                         clustering.labels_),
                            1)

    # Test that using a distance matrix (affinity = 'precomputed') has same
    # results (with connectivity constraints)
    clustering = AgglomerativeClustering(n_clusters=10,
                                         connectivity=connectivity,
                                         linkage="complete")
    clustering.fit(X)
    X_dist = pairwise_distances(X)
    clustering2 = AgglomerativeClustering(n_clusters=10,
                                          connectivity=connectivity,
                                          affinity='precomputed',
                                          linkage="complete")
    clustering2.fit(X_dist)
    assert_array_equal(clustering.labels_, clustering2.labels_)
def test_weighted_dbscan():
    """Exercise ``sample_weight`` handling in ``dbscan`` and ``DBSCAN``.

    Checks validation of the weight length, the effect of weights on which
    points become core samples, equivalence with repeating samples, and
    consistency between raw data, a precomputed distance matrix and the
    estimator API. The later sections use the module-level ``X``.
    """
    two_points = [[0], [1]]

    # ensure sample_weight is validated
    for wrong_length_weight in ([2], [2, 3, 4]):
        with pytest.raises(ValueError):
            dbscan(two_points, sample_weight=wrong_length_weight)

    # ensure sample_weight has an effect
    default_eps_cases = (
        ([], None),
        ([], [5, 5]),
        ([0], [6, 5]),
        ([0, 1], [6, 6]),
    )
    for expected_core, weight in default_eps_cases:
        found_core = dbscan(two_points, sample_weight=weight,
                            min_samples=6)[0]
        assert_array_equal(expected_core, found_core)

    # points within eps of each other:
    assert_array_equal(
        [0, 1],
        dbscan(two_points, eps=1.5, sample_weight=[5, 1], min_samples=6)[0])

    # and effect of non-positive and non-integer sample_weight:
    wide_eps_cases = (
        ([], [5, 0]),
        ([0, 1], [5.9, 0.1]),
        ([0, 1], [6, 0]),
        ([], [6, -1]),
    )
    for expected_core, weight in wide_eps_cases:
        found_core = dbscan(two_points, sample_weight=weight, eps=1.5,
                            min_samples=6)[0]
        assert_array_equal(expected_core, found_core)

    # for non-negative sample_weight, cores should be identical to repetition
    rng = np.random.RandomState(42)
    sample_weight = rng.randint(0, 5, X.shape[0])
    core1, label1 = dbscan(X, sample_weight=sample_weight)
    assert len(label1) == len(X)

    X_repeated = np.repeat(X, sample_weight, axis=0)
    core_repeated, label_repeated = dbscan(X_repeated)

    # Turn both sets of core indices into boolean masks so that the
    # weighted result can be expanded with the same repetition counts.
    repeated_is_core = np.zeros(X_repeated.shape[0], dtype=bool)
    repeated_is_core[core_repeated] = True
    weighted_is_core = np.zeros(X.shape[0], dtype=bool)
    weighted_is_core[core1] = True
    assert_array_equal(np.repeat(weighted_is_core, sample_weight),
                       repeated_is_core)

    # sample_weight should work with precomputed distance matrix
    D = pairwise_distances(X)
    core3, label3 = dbscan(D, sample_weight=sample_weight,
                           metric='precomputed')
    assert_array_equal(core1, core3)
    assert_array_equal(label1, label3)

    # sample_weight should work with estimator
    fitted = DBSCAN().fit(X, sample_weight=sample_weight)
    assert_array_equal(core1, fitted.core_sample_indices_)
    assert_array_equal(label1, fitted.labels_)

    predictor = DBSCAN()
    label5 = predictor.fit_predict(X, sample_weight=sample_weight)
    assert_array_equal(core1, predictor.core_sample_indices_)
    assert_array_equal(label1, label5)
    assert_array_equal(label1, predictor.labels_)
def test_knn_imputer_weight_distance(na):
    """KNNImputer with ``weights="distance"`` against hand-built references.

    Four scenarios are checked: a single missing entry (compared with both a
    manual inverse-distance average and a ``KNeighborsRegressor``), two
    neighbors with explicitly written distances, varying missingness
    patterns, and a larger matrix with missing values in several columns.

    Parameters
    ----------
    na : scalar
        The placeholder used for missing values.
    """
    X = np.array([
        [0, 0],
        [na, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 10],
    ])

    def _with_imputed(value):
        # Expected output: the matrix above with its missing entry filled.
        return np.array([
            [0, 0],
            [value, 2],
            [4, 3],
            [5, 6],
            [7, 7],
            [9, 8],
            [11, 10],
        ])

    # Test with "distance" weight
    complete_rows = [0, 2, 3, 4, 5, 6]
    regressor = KNeighborsRegressor(metric="euclidean", weights="distance")
    regressor.fit(X[complete_rows, 1:], X[complete_rows, 0])
    knn_imputed_value = regressor.predict(X[1:2, 1:])[0]

    # Manual calculation
    donor_rows = [0, 2, 3, 4, 5]
    dist = nan_euclidean_distances(X[1:2, :], X, missing_values=na)
    donor_weights = 1 / dist[:, donor_rows].ravel()
    manual_imputed_value = np.average(X[donor_rows, 0],
                                      weights=donor_weights)

    imputer = KNNImputer(weights="distance", missing_values=na)
    assert_allclose(imputer.fit_transform(X),
                    _with_imputed(manual_imputed_value))
    # NearestNeighbor calculation
    assert_allclose(imputer.fit_transform(X),
                    _with_imputed(knn_imputed_value))

    # Test with weights = "distance" and n_neighbors=2
    X = np.array([
        [na, 0, 0],
        [2, 1, 2],
        [3, 2, 3],
        [4, 5, 5],
    ])

    # neighbors are rows 1, 2, the nan_euclidean_distances are:
    dist_0_1 = np.sqrt((3 / 2) * ((1 - 0)**2 + (2 - 0)**2))
    dist_0_2 = np.sqrt((3 / 2) * ((2 - 0)**2 + (3 - 0)**2))
    imputed_value = np.average([2, 3], weights=[1 / dist_0_1, 1 / dist_0_2])

    X_imputed = np.array([
        [imputed_value, 0, 0],
        [2, 1, 2],
        [3, 2, 3],
        [4, 5, 5],
    ])

    imputer = KNNImputer(n_neighbors=2, weights="distance",
                         missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed)

    # Test with varying missingness patterns
    X = np.array([
        [1, 0, 0, 1],
        [0, na, 1, na],
        [1, 1, 1, na],
        [0, 1, 0, 0],
        [0, 0, 0, 0],
        [1, 0, 1, 1],
        [10, 10, 10, 10],
    ])

    # Donor rows used for each column that has missing entries.
    col1_donors = [0, 2, 3, 4, 5]
    col3_donors = [0, 3, 4, 5, 6]

    # Get weights of donor neighbors
    dist = nan_euclidean_distances(X, missing_values=na)
    r1c1_nbor_wt = 1 / dist[1, col1_donors]
    r1c3_nbor_wt = 1 / dist[1, col3_donors]
    r2c3_nbor_wt = 1 / dist[2, col3_donors]

    # Collect donor values
    col1_donor_values = np.ma.masked_invalid(X[col1_donors, 1]).copy()
    col3_donor_values = np.ma.masked_invalid(X[col3_donors, 3]).copy()

    # Final imputed values
    r1c1_imp = np.ma.average(col1_donor_values, weights=r1c1_nbor_wt)
    r1c3_imp = np.ma.average(col3_donor_values, weights=r1c3_nbor_wt)
    r2c3_imp = np.ma.average(col3_donor_values, weights=r2c3_nbor_wt)

    X_imputed = np.array([
        [1, 0, 0, 1],
        [0, r1c1_imp, 1, r1c3_imp],
        [1, 1, 1, r2c3_imp],
        [0, 1, 0, 0],
        [0, 0, 0, 0],
        [1, 0, 1, 1],
        [10, 10, 10, 10],
    ])

    imputer = KNNImputer(weights="distance", missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed)

    X = np.array([
        [0, 0, 0, na],
        [1, 1, 1, na],
        [2, 2, na, 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6],
        [na, 7, 7, 7],
    ])

    dist = pairwise_distances(X, metric="nan_euclidean", squared=False,
                              missing_values=na)

    # Row selections providing donors for each missing entry.
    last_col_donors = slice(2, -1)
    col2_donors = (0, 1, 3, 4, 5)
    col0_donors = slice(2, 7)

    # Calculate weights
    r0c3_w = 1.0 / dist[0, last_col_donors]
    r1c3_w = 1.0 / dist[1, last_col_donors]
    r2c2_w = 1.0 / dist[2, col2_donors]
    r7c0_w = 1.0 / dist[7, col0_donors]

    # Calculate weighted averages
    r0c3 = np.average(X[last_col_donors, -1], weights=r0c3_w)
    r1c3 = np.average(X[last_col_donors, -1], weights=r1c3_w)
    r2c2 = np.average(X[col2_donors, 2], weights=r2c2_w)
    r7c0 = np.average(X[col0_donors, 0], weights=r7c0_w)

    X_imputed = np.array([
        [0, 0, 0, r0c3],
        [1, 1, 1, r1c3],
        [2, 2, r2c2, 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6],
        [r7c0, 7, 7, 7],
    ])

    imputer_comp_wt = KNNImputer(missing_values=na, weights="distance")
    assert_allclose(imputer_comp_wt.fit_transform(X), X_imputed)