예제 #1
0
def test_three_clusters(bisecting_strategy):
    """Fit three clusters on a small hand-built data set.

    Verifies the resulting centers, the prediction for two unseen points
    and the labels stored on the estimator after fitting.
    """
    # Nine points in three groups: x == 1 with y in {0, 2, 4}, x == 10 with
    # y in {0, 2, 4} and x == 10 with y in {6, 8, 10}.  Compared with the
    # naturally grouped ordering, rows 1 and 4 are exchanged on purpose so
    # that labeling is checked on interleaved samples.
    points = [
        [1, 2], [10, 4], [1, 0],
        [10, 2], [1, 4], [10, 0],
        [10, 6], [10, 8], [10, 10],
    ]
    X = np.array(points)

    model = BisectingKMeans(
        n_clusters=3,
        random_state=0,
        bisecting_strategy=bisecting_strategy,
    )
    model.fit(X)

    expected_centers = [[10, 2], [10, 8], [1, 2]]
    expected_predict = [2, 0]
    expected_labels = [2, 0, 2, 0, 2, 0, 1, 1, 1]

    unseen = [[0, 0], [12, 3]]

    assert_allclose(expected_centers, model.cluster_centers_)
    assert_array_equal(expected_predict, model.predict(unseen))
    assert_array_equal(expected_labels, model.labels_)
예제 #2
0
def test_wrong_params(param, match):
    """Check that fitting with invalid parameters raises ``ValueError``.

    Parameters
    ----------
    param : dict
        Extra keyword arguments forwarded to ``BisectingKMeans``.
    match : str
        Pattern the error message has to match (given to ``pytest.raises``).
    """
    rng = np.random.RandomState(0)
    X = rng.rand(5, 2)

    # Build the estimator outside of the ``raises`` block so that only the
    # ``fit`` call is allowed to produce the expected error; an exception
    # raised while constructing the estimator must not satisfy the test.
    bisect_means = BisectingKMeans(n_clusters=3, **param)

    with pytest.raises(ValueError, match=match):
        bisect_means.fit(X)
def test_bisecting_kmeans_update_centroids():
    """Check that ``_update_centroids`` replaces the target centroid.

    Starting from three centroids, the one at the target label is expected
    to be replaced by the two sub-centroids, giving four centroids.
    """
    # NOTE(review): ``max_n_clusters`` is not defined inside this function;
    # presumably it is a module-level name -- confirm.
    model = BisectingKMeans(max_n_clusters)
    model.centroids = np.array([[2], [5], [8.33]])

    split_label = 2
    replacement = np.array([[7.5], [10]])
    model._update_centroids(replacement, split_label)

    expected = np.array([[2], [5], [7.5], [10]])
    assert_array_equal(model.centroids, expected)
def test_bisecting_kmeans_update_labels(sub_labels, expected_labels):
    """Check ``_update_labels`` for the samples at indices 6, 7 and 8.

    Parameters
    ----------
    sub_labels
        Labels handed to ``_update_labels`` for the selected samples.
    expected_labels
        Value ``labels_`` is expected to hold after the update.
    """
    # NOTE(review): ``max_n_clusters`` is not defined inside this function;
    # presumably it is a module-level name -- confirm.
    model = BisectingKMeans(max_n_clusters)
    model.labels_ = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])

    split_indices = np.array([6, 7, 8])
    fresh_label = 3
    model._update_labels(sub_labels, split_indices, fresh_label)

    assert_array_equal(model.labels_, expected_labels)
예제 #5
0
def test_n_clusters(n_clusters):
    """Ensure the fitted labels are exactly ``0 .. n_clusters - 1``."""
    X = np.random.RandomState(0).rand(10, 2)

    model = BisectingKMeans(n_clusters=n_clusters, random_state=0)
    model.fit(X)

    found = np.unique(model.labels_)
    wanted = np.arange(n_clusters)
    assert_array_equal(found, wanted)
예제 #6
0
def test_dtype_preserved(is_sparse, global_dtype):
    """Ensure the cluster centers keep the dtype of the training data."""
    random_gen = np.random.RandomState(0)
    X = random_gen.rand(10, 2).astype(global_dtype, copy=False)

    if is_sparse:
        # Zero out every entry below 0.8 before converting to CSR.
        X[X < 0.8] = 0
        X = sp.csr_matrix(X)

    model = BisectingKMeans(n_clusters=3, random_state=0)
    model.fit(X)

    centers_dtype = model.cluster_centers_.dtype
    assert centers_dtype == global_dtype
예제 #7
0
def test_one_cluster():
    """Fit a single cluster and check labels, predictions and the center."""
    X = np.array([[1, 2], [10, 2], [10, 8]])

    model = BisectingKMeans(n_clusters=1, random_state=0).fit(X)

    # With one cluster every sample must get label 0, both for the labels
    # stored by ``fit`` and for those returned by ``predict``.
    fitted_labels = model.labels_
    assert all(fitted_labels == 0)
    predicted_labels = model.predict(X)
    assert all(predicted_labels == 0)

    # The only center is compared against the mean of all samples.
    overall_mean = X.mean(axis=0).reshape(1, -1)
    assert_allclose(model.cluster_centers_, overall_mean)
예제 #8
0
def test_fit_predict(is_sparse):
    """Ensure ``labels_`` after ``fit(X)`` equals ``predict(X)``."""
    X = np.random.RandomState(0).rand(10, 2)

    if is_sparse:
        # Zero out every entry below 0.8 before converting to CSR.
        X[X < 0.8] = 0
        X = sp.csr_matrix(X)

    model = BisectingKMeans(n_clusters=3, random_state=0)
    model.fit(X)

    fitted_labels = model.labels_
    assert_array_equal(fitted_labels, model.predict(X))
예제 #9
0
def test_float32_float64_equivalence(is_sparse):
    """Ensure float32 and float64 inputs give matching centers and labels."""
    X = np.random.RandomState(0).rand(10, 2)

    if is_sparse:
        # Zero out every entry below 0.8 before converting to CSR.
        X[X < 0.8] = 0
        X = sp.csr_matrix(X)

    # Same settings for both estimators; only the input precision differs.
    settings = {"n_clusters": 3, "random_state": 0}
    km64 = BisectingKMeans(**settings).fit(X)
    X_single = X.astype(np.float32)
    km32 = BisectingKMeans(**settings).fit(X_single)

    assert_allclose(km32.cluster_centers_, km64.cluster_centers_)
    assert_array_equal(km32.labels_, km64.labels_)
def test_predict_diff_dimention_data():
    """Expect ``ValueError`` when predicting with a different feature count.

    The estimator is fitted on single-feature samples and then asked to
    predict a sample that has two features.
    """
    model = BisectingKMeans(max_n_clusters=3)

    training = np.array([[1], [2], [3], [4], [5], [6], [7], [8], [10]])
    model.fit(training)

    two_feature_sample = np.array([[1, 2]])
    with pytest.raises(ValueError):
        model.predict(two_feature_sample)
def test_predict(max_n_clusters):
    """Check ``predict`` on the training data and on the fitted centroids."""
    X, _ = make_blobs(
        n_samples=500,
        n_features=10,
        centers=max_n_clusters,
        random_state=0,
    )
    model = BisectingKMeans(max_n_clusters)
    model.fit(X)
    training_labels = model.labels_

    # Predicting the training samples must reproduce the stored labels.
    assert_array_equal(model.predict(X), training_labels)

    # Each centroid is expected to be assigned its own index.
    centroid_labels = model.predict(model.centroids)
    assert_array_equal(centroid_labels, np.arange(model.max_n_clusters))
예제 #12
0
def test_sparse():
    """Compare Bisecting K-Means centers for sparse and dense input.

    The same estimator is fitted on a CSR matrix and then on the equivalent
    dense array; the cluster centers of both fits must agree.
    """
    X = np.random.RandomState(0).rand(20, 2)
    # Zero out every entry below 0.8 so the CSR version is actually sparse.
    X[X < 0.8] = 0
    X_csr = sp.csr_matrix(X)

    model = BisectingKMeans(n_clusters=3, random_state=0)

    model.fit(X_csr)
    centers_from_sparse = model.cluster_centers_

    model.fit(X)
    centers_from_dense = model.cluster_centers_

    # Dense and sparse fits must yield the same centers.
    assert_allclose(centers_from_dense, centers_from_sparse, atol=1e-8)
def test_euclidean_distance(max_n_clusters):
    """Check ``_euclidean_distance`` on two points that differ in two axes.

    The points ``[1, 0, 1]`` and ``[0, 1, 1]`` differ by one unit along two
    coordinates, so the expected distance is the square root of two.
    """
    import math

    clf = BisectingKMeans(max_n_clusters)
    distance = clf._euclidean_distance([1, 0, 1], [0, 1, 1])

    # Compare with a tolerance: exact ``==`` on floating point results
    # depends on how the implementation happens to round the square root.
    assert math.isclose(distance, 2 ** 0.5)
def test_next_cluster_to_split(scores):
    """Check that the cluster picked for splitting has the highest score.

    The expected index is the first position of the maximum in ``scores``.
    """
    model = BisectingKMeans(max_n_clusters=2)
    model.scores = np.array(scores)

    index_of_max = scores.index(max(scores))
    assert_equal(model._next_cluster_to_split(), index_of_max)
def test_predict_not_fitted():
    """Expect ``NotFittedError`` when ``predict`` is called before ``fit``."""
    unfitted = BisectingKMeans(max_n_clusters=2)
    samples = np.zeros([3])

    with pytest.raises(NotFittedError):
        unfitted.predict(samples)
def test_max_n_clusters_greater_than_input():
    """Expect ``ValueError`` when fitting 3 values with ``max_n_clusters=10``."""
    model = BisectingKMeans(max_n_clusters=10)
    # NOTE(review): this array is 1-D (shape ``(3,)``); confirm the error
    # comes from the cluster count and not from the input shape.
    samples = np.zeros([3])

    with pytest.raises(ValueError):
        model.fit(samples)
# %%
# BisectingKMeans: divide and cluster
# -----------------------------------
# The new class :class:`cluster.BisectingKMeans` is a variant of :class:`KMeans`, using
# divisive hierarchical clustering. Instead of creating all centroids at once, centroids
# are picked progressively based on a previous clustering: a cluster is split into two
# new clusters repeatedly until the target number of clusters is reached, giving a
# hierarchical structure to the clustering.
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans, BisectingKMeans
import matplotlib.pyplot as plt

# Toy data set generated around two centers, shared by both estimators.
X, _ = make_blobs(n_samples=1000, centers=2, random_state=0)

km = KMeans(n_clusters=5, random_state=0).fit(X)
bisect_km = BisectingKMeans(n_clusters=5, random_state=0).fit(X)

fig, ax = plt.subplots(1, 2, figsize=(10, 5))

# One panel per estimator: samples colored by their label, with the cluster
# centers drawn on top in red.
for panel, model, title in zip(
    ax, (km, bisect_km), ("KMeans", "BisectingKMeans")
):
    panel.scatter(X[:, 0], X[:, 1], s=10, c=model.labels_)
    centers = model.cluster_centers_
    panel.scatter(centers[:, 0], centers[:, 1], s=20, c="r")
    _ = panel.set_title(title)