Пример #1
0
    def testKMeansFitPredict(self):
        """Check that ``fit(X).predict(X)`` and ``fit_predict(X)`` agree.

        Each algorithm variant is exercised with every (seed, max_iter, tol)
        scenario listed below; the scenarios are labelled as converging or
        non-converging runs with a strict or loose tolerance.
        """
        # One (seed, max_iter, tol) triple per scenario.
        scenarios = [
            (0, 2, 1e-7),  # strict non-convergence
            (1, 2, 1e-1),  # loose non-convergence
            (3, 300, 1e-7),  # strict convergence
            (4, 300, 1e-1),  # loose convergence
        ]
        # Flatten algorithm x scenario (algorithm-major) so one loop drives
        # every combination.
        cases = [
            (algorithm, seed, max_iter, tol)
            for algorithm in ('full', 'elkan')
            for seed, max_iter, tol in scenarios
        ]

        for algorithm, seed, max_iter, tol in cases:
            blob_rng = np.random.RandomState(seed)
            blobs = make_blobs(n_samples=1000,
                               n_features=10,
                               centers=10,
                               random_state=blob_rng)
            X = blobs[0]

            estimator_params = {
                'algorithm': algorithm,
                'n_clusters': 10,
                'random_state': seed,
                'tol': tol,
                'max_iter': max_iter,
                'init': 'k-means++',
            }
            kmeans = KMeans(**estimator_params)

            # Strategy 1: fit, then predict on the training data.
            fitted = kmeans.fit(X)
            labels_from_predict = fitted.predict(X)
            # Strategy 2: the combined fit_predict call on the same estimator.
            labels_from_fit_predict = kmeans.fit_predict(X)

            # With more than one thread, chunks of data may be processed in a
            # random order, so the raw label values can differ between the two
            # strategies. Compare the clusterings themselves rather than the
            # label ids: identical partitions give a V-measure of 1.
            agreement = v_measure_score(labels_from_predict,
                                        labels_from_fit_predict)
            self.assertAlmostEqual(agreement, 1)
Пример #2
0
def test_k_means_fit_predict(setup, algo, seed, max_iter, tol):
    """Check that ``fit(X).predict(X)`` and ``fit_predict(X)`` agree.

    ``algo``, ``seed``, ``max_iter`` and ``tol`` are supplied by the caller
    (presumably via pytest parametrization -- confirm against the decorators
    outside this snippet); ``setup`` is requested but not used in the body.
    """
    blob_rng = np.random.RandomState(seed)
    blobs = make_blobs(n_samples=1000, n_features=10, centers=10,
                       random_state=blob_rng)
    X = blobs[0]

    estimator_params = {
        'algorithm': algo,
        'n_clusters': 10,
        'random_state': seed,
        'tol': tol,
        'max_iter': max_iter,
        'init': 'k-means++',
    }
    kmeans = KMeans(**estimator_params)

    # Strategy 1: fit, then predict on the training data.
    fitted = kmeans.fit(X)
    labels_from_predict = fitted.predict(X)
    # Strategy 2: the combined fit_predict call on the same estimator.
    labels_from_fit_predict = kmeans.fit_predict(X)

    # With more than one thread, chunks of data may be processed in a random
    # order, so the raw label values can differ between the two strategies.
    # Compare the clusterings themselves rather than the label ids: identical
    # partitions give a V-measure of 1.
    agreement = v_measure_score(labels_from_predict, labels_from_fit_predict)
    assert pytest.approx(agreement) == 1