def test_agglomerative_clustering(): """ Check that we obtain the correct number of clusters with agglomerative clustering. """ rng = np.random.RandomState(0) mask = np.ones([10, 10], dtype=np.bool) n_samples = 100 X = rng.randn(n_samples, 50) connectivity = grid_to_graph(*mask.shape) for linkage in ("ward", "complete", "average"): clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, linkage=linkage) clustering.fit(X) # test caching try: tempdir = mkdtemp() clustering = AgglomerativeClustering( n_clusters=10, connectivity=connectivity, memory=tempdir, linkage=linkage) clustering.fit(X) labels = clustering.labels_ assert_true(np.size(np.unique(labels)) == 10) finally: shutil.rmtree(tempdir) # Turn caching off now clustering = AgglomerativeClustering( n_clusters=10, connectivity=connectivity, linkage=linkage) # Check that we obtain the same solution with early-stopping of the # tree building clustering.compute_full_tree = False clustering.fit(X) assert_almost_equal(normalized_mutual_info_score(clustering.labels_, labels), 1) clustering.connectivity = None clustering.fit(X) assert_true(np.size(np.unique(clustering.labels_)) == 10) # Check that we raise a TypeError on dense matrices clustering = AgglomerativeClustering( n_clusters=10, connectivity=sparse.lil_matrix( connectivity.toarray()[:10, :10]), linkage=linkage) assert_raises(ValueError, clustering.fit, X) # Test that using ward with another metric than euclidean raises an # exception clustering = AgglomerativeClustering( n_clusters=10, connectivity=connectivity.toarray(), affinity="manhattan", linkage="ward") assert_raises(ValueError, clustering.fit, X) # Test using another metric than euclidean works with linkage complete for affinity in PAIRED_DISTANCES.keys(): # Compare our (structured) implementation to scipy clustering = AgglomerativeClustering( n_clusters=10, connectivity=np.ones((n_samples, n_samples)), affinity=affinity, linkage="complete") clustering.fit(X) clustering2 = AgglomerativeClustering( n_clusters=10, connectivity=None, affinity=affinity, linkage="complete") clustering2.fit(X) assert_almost_equal(normalized_mutual_info_score(clustering2.labels_, clustering.labels_), 1) # Test that using a distance matrix (affinity = 'precomputed') has same # results (with connectivity constraints) clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, linkage="complete") clustering.fit(X) X_dist = pairwise_distances(X) clustering2 = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, affinity='precomputed', linkage="complete") clustering2.fit(X_dist) assert_array_equal(clustering.labels_, clustering2.labels_)
def test_agglomerative_clustering(): """ Check that we obtain the correct number of clusters with agglomerative clustering. """ rng = np.random.RandomState(0) mask = np.ones([10, 10], dtype=np.bool) n_samples = 100 X = rng.randn(n_samples, 50) connectivity = grid_to_graph(*mask.shape) for linkage in ("ward", "complete", "average"): clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, linkage=linkage) clustering.fit(X) # test caching clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, memory=mkdtemp(), linkage=linkage) clustering.fit(X) labels = clustering.labels_ assert_true(np.size(np.unique(labels)) == 10) # Turn caching off now clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, linkage=linkage) # Check that we obtain the same solution with early-stopping of the # tree building clustering.compute_full_tree = False clustering.fit(X) np.testing.assert_array_equal(clustering.labels_, labels) clustering.connectivity = None clustering.fit(X) assert_true(np.size(np.unique(clustering.labels_)) == 10) # Check that we raise a TypeError on dense matrices clustering = AgglomerativeClustering( n_clusters=10, connectivity=sparse.lil_matrix(connectivity.toarray()[:10, :10]), linkage=linkage) assert_raises(ValueError, clustering.fit, X) # Test that using ward with another metric than euclidean raises an # exception clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity.toarray(), affinity="manhattan", linkage="ward") assert_raises(ValueError, clustering.fit, X) # Test using another metric than euclidean works with linkage complete for affinity in PAIRED_DISTANCES.keys(): # Compare our (structured) implementation to scipy clustering = AgglomerativeClustering(n_clusters=10, connectivity=np.ones( (n_samples, n_samples)), affinity=affinity, linkage="complete") clustering.fit(X) clustering2 = AgglomerativeClustering(n_clusters=10, connectivity=None, affinity=affinity, linkage="complete") clustering2.fit(X) assert_almost_equal( normalized_mutual_info_score(clustering2.labels_, clustering.labels_), 1)
def test_agglomerative_clustering(): # Check that we obtain the correct number of clusters with # agglomerative clustering. rng = np.random.RandomState(0) mask = np.ones([10, 10], dtype=bool) n_samples = 100 X = rng.randn(n_samples, 50) connectivity = grid_to_graph(*mask.shape) for linkage in ("ward", "complete", "average", "single"): clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, linkage=linkage) clustering.fit(X) # test caching try: tempdir = mkdtemp() clustering = AgglomerativeClustering( n_clusters=10, connectivity=connectivity, memory=tempdir, linkage=linkage, ) clustering.fit(X) labels = clustering.labels_ assert np.size(np.unique(labels)) == 10 finally: shutil.rmtree(tempdir) # Turn caching off now clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, linkage=linkage) # Check that we obtain the same solution with early-stopping of the # tree building clustering.compute_full_tree = False clustering.fit(X) assert_almost_equal( normalized_mutual_info_score(clustering.labels_, labels), 1) clustering.connectivity = None clustering.fit(X) assert np.size(np.unique(clustering.labels_)) == 10 # Check that we raise a TypeError on dense matrices clustering = AgglomerativeClustering( n_clusters=10, connectivity=sparse.lil_matrix(connectivity.toarray()[:10, :10]), linkage=linkage, ) with pytest.raises(ValueError): clustering.fit(X) # Test that using ward with another metric than euclidean raises an # exception clustering = AgglomerativeClustering( n_clusters=10, connectivity=connectivity.toarray(), metric="manhattan", linkage="ward", ) with pytest.raises(ValueError): clustering.fit(X) # Test using another metric than euclidean works with linkage complete for metric in PAIRED_DISTANCES.keys(): # Compare our (structured) implementation to scipy clustering = AgglomerativeClustering( n_clusters=10, connectivity=np.ones((n_samples, n_samples)), metric=metric, linkage="complete", ) clustering.fit(X) clustering2 = AgglomerativeClustering(n_clusters=10, connectivity=None, metric=metric, linkage="complete") clustering2.fit(X) assert_almost_equal( normalized_mutual_info_score(clustering2.labels_, clustering.labels_), 1) # Test that using a distance matrix (affinity = 'precomputed') has same # results (with connectivity constraints) clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, linkage="complete") clustering.fit(X) X_dist = pairwise_distances(X) clustering2 = AgglomerativeClustering( n_clusters=10, connectivity=connectivity, metric="precomputed", linkage="complete", ) clustering2.fit(X_dist) assert_array_equal(clustering.labels_, clustering2.labels_)