示例#1
0
    def test_fit_iris_backupsklearn(self):
        X = load_iris().data
        clusters = 4

        print("Running model")
        model = KMeans(n_gpus=1, n_clusters=clusters, random_state=123).fit(X)

        assert model.cluster_centers_.shape == (X.shape[1], clusters)
        print("passed 1")

        print("Running model_rerun")
        model_rerun = KMeans(n_gpus=1, n_clusters=clusters, random_state=123, init=model.cluster_centers_, n_init=1).fit(X)
        import sys
        print(model_rerun.cluster_centers_)
        sys.stdout.flush()

        # Choosing initial clusters for sklearn should yield similar result
        assert np.allclose(
           model.cluster_centers_, model_rerun.cluster_centers_
        )

        # sklearn directly or our indirect should be same (and is)
        from sklearn.cluster import KMeans as KMeans_test

        print("Running model_rerun2")
        model_rerun2 = KMeans_test(n_clusters=clusters, random_state=123, init=model.cluster_centers_, n_init=1).fit(X)
        print(model_rerun2.cluster_centers_)
        sys.stdout.flush()

        assert np.allclose(
            model_rerun.cluster_centers_, model_rerun2.cluster_centers_
        )
        print("passed 2")
示例#2
0
    def test_fit_iris(self):
        X = load_iris().data
        clusters = 4
        model = KMeans(n_gpus=1, n_clusters=clusters, random_state=123).fit(X)

        assert model.cluster_centers_.shape == (X.shape[1], clusters)

        model_rerun = KMeans(n_gpus=1, n_clusters=clusters, random_state=123).fit(X)

        # Same random_state should yield same results
        assert np.allclose(
            model.cluster_centers_, model_rerun.cluster_centers_
        )

        model_rerun2 = model_rerun.fit(X)

        # Multiple invocations of fit with the same random_state
        # also should produce the same result
        assert np.allclose(
            model_rerun.cluster_centers_, model_rerun2.cluster_centers_
        )

        model_all = KMeans(n_clusters=clusters, random_state=123).fit(X)

        # Multi GPU should yield same result as single GPU
        assert np.allclose(
            model.cluster_centers_, model_all.cluster_centers_
        )
示例#3
0
    def test_fit_iris(self):
        X = load_iris().data
        clusters = 4
        model = KMeans(n_gpus=1, n_clusters=clusters, random_state=123).fit(X)

        assert model.cluster_centers_.shape == (X.shape[1], clusters)

        model_rerun = KMeans(n_gpus=1, n_clusters=clusters,
                             random_state=123).fit(X)

        # Same random_state should yield same results
        assert np.allclose(model.cluster_centers_,
                           model_rerun.cluster_centers_)

        model_rerun2 = model_rerun.fit(X)

        # Multiple invocations of fit with the same random_state
        # also should produce the same result
        assert np.allclose(model_rerun.cluster_centers_,
                           model_rerun2.cluster_centers_)

        model_all = KMeans(n_clusters=clusters, random_state=123).fit(X)

        # Multi GPU should yield same result as single GPU
        assert np.allclose(model.cluster_centers_, model_all.cluster_centers_)
示例#4
0
    def _test_accuracy(self,
                       order,
                       n_samples=500000,
                       centers=10,
                       n_features=2):
        from h2o4gpu.cluster import KMeansSklearn as skKMeans
        X, true_labels = make_blobs(n_samples=n_samples,
                                    centers=centers,
                                    n_features=n_features,
                                    cluster_std=2.,
                                    random_state=42)

        X = np.asanyarray(X, order=order)

        kmeans_h2o = KMeans(n_gpus=1,
                            n_clusters=centers,
                            random_state=42,
                            verbose=1000)
        kmeans_h2o.fit(X)
        kmeans_sk = skKMeans(n_init=1, n_clusters=centers, random_state=42)
        kmeans_sk.fit(X)

        accuracy_h2o = v_measure_score(kmeans_h2o.labels_, true_labels)
        accuracy_sk = v_measure_score(kmeans_sk.labels_, true_labels)
        # We also want to be either better or at most 10% worse than SKLearn
        # Everything else is horrible and we probably should fix something
        # TODO: it's failing with lower rtol, find out why it's so inaccurate
        assert np.allclose(accuracy_h2o, accuracy_sk, rtol=0.1, atol=0.05), \
            'Accuracy error {0} {1} n_samples={2}, centers={3}, n_features={4}, order={5}'.format(
            accuracy_h2o, accuracy_sk, n_samples, centers, n_features, order)
示例#5
0
    def test_transform_iris(self):
        X = load_iris().data
        model = KMeans(n_gpus=1, n_clusters=4, random_state=1234567).fit(X)

        labels_from_trans = list(
            map(lambda x: np.argmin(x), model.transform(X)))

        assert all(labels_from_trans == model.predict(X))
示例#6
0
    def test_fit_iris_precision(self):
        X_f64 = load_iris().data
        X_f32 = X_f64.astype(np.float32)
        kmeans = KMeans(n_gpus=1, n_clusters=4, random_state=12345)
        model_f64_labels = kmeans.fit(X_f64).predict(X_f64)
        model_f32_labels = kmeans.fit(X_f32).predict(X_f32)

        assert all(model_f64_labels == model_f32_labels)
示例#7
0
    def test_fit_iris_precision(self):
        X_f64 = load_iris().data
        X_f32 = X_f64.astype(np.float32)
        kmeans = KMeans(n_gpus=1, n_clusters=4, random_state=12345)
        model_f64_labels = kmeans.fit(X_f64).predict(X_f64)
        model_f32_labels = kmeans.fit(X_f32).predict(X_f32)

        assert all(model_f64_labels == model_f32_labels)
示例#8
0
    def test_fit_vs_sk_iris(self):
        X = load_iris().data
        model = KMeans(n_gpus=1, n_clusters=4, random_state=1234).fit(X)

        h2o_labels = model.predict(X)
        sk_lables = model.sklearn_predict(X)

        assert all(h2o_labels == sk_lables)
示例#9
0
    def test_fit_vs_sk_iris(self):
        X = load_iris().data
        model = KMeans(n_gpus=1, n_clusters=4, random_state=1234).fit(X)

        h2o_labels = model.predict(X)
        sk_lables = model.sklearn_predict(X)

        assert all(h2o_labels == sk_lables)
示例#10
0
    def test_transform_iris(self):
        X = load_iris().data
        model = KMeans(n_gpus=1, n_clusters=4, random_state=1234567).fit(X)

        labels_from_trans = list(
            map(lambda x: np.argmin(x), model.transform(X))
        )

        assert all(labels_from_trans == model.predict(X))
示例#11
0
    def test_fit_i32_vs_f32(self):
        X_f64 = np.array([[1., 2.], [1., 4.], [1., 0.], [4., 2.], [4., 4.], [4., 0.]])
        X_f32 = X_f64.astype(np.float32)
        X_i32 = X_f64.astype(np.int32)
        kmeans = KMeans(n_gpus=1, n_clusters=2, random_state=123)

        model_f64_labels = kmeans.fit(X_f64).predict(X_f64)
        model_f32_labels = kmeans.fit(X_f32).predict(X_f32)
        model_i32_labels = kmeans.fit(X_i32).predict(X_i32)

        assert all(model_f64_labels == model_f32_labels)
        assert all(model_f32_labels == model_i32_labels)
示例#12
0
    def test_accuracy(self):
        from sklearn.cluster import KMeans as skKMeans
        n_samples = 500000
        centers = 10
        X, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                                    cluster_std=1., random_state=42)

        kmeans_h2o = KMeans(n_gpus=1, n_clusters=centers, random_state=42)
        kmeans_h2o.fit(X)
        kmeans_sk = skKMeans(n_init=1, n_clusters=centers, random_state=42)
        kmeans_sk.fit(X)

        accuracy_h2o = v_measure_score(kmeans_h2o.labels_, true_labels)
        accuracy_sk = v_measure_score(kmeans_sk.labels_, true_labels)
        # We also want to be either better or at most 10% worse than SKLearn
        # Everything else is horrible and we probably should fix something
        assert accuracy_h2o - accuracy_sk >= -0.1
示例#13
0
    def test_accuracy(self):
        from sklearn.cluster import KMeans as skKMeans
        n_samples = 100000
        centers = 10
        X, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                                    cluster_std=1., random_state=42)

        kmeans_h2o = KMeans(n_gpus=1, n_clusters=centers, random_state=42)
        kmeans_h2o.fit(X)
        kmeans_sk = skKMeans(n_init=1, n_clusters=centers, init='random',
                             random_state=42)
        kmeans_sk.fit(X)

        accuracy_h2o = v_measure_score(kmeans_h2o.labels_, true_labels)
        accuracy_sk = v_measure_score(kmeans_sk.labels_, true_labels)
        # We also want to be either better or at most 10% worse than SKLearn
        # Everything else is horrible and we probably should fix something
        assert accuracy_h2o - accuracy_sk >= -0.1
示例#14
0
    def test_speed_vs_sk(self):
        from sklearn.cluster import KMeans as skKMeans
        n_samples = 100000
        centers = 10
        X, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                                    cluster_std=1., random_state=42)

        kmeans_h2o = KMeans(n_gpus=1, n_clusters=centers)
        start_h2o = time.time()
        kmeans_h2o.fit(X)
        end_h2o = time.time()

        kmeans_sk = skKMeans(n_init=1, n_clusters=centers, init='random')
        start_sk = time.time()
        kmeans_sk.fit(X)
        end_sk = time.time()

        print(end_h2o - start_h2o)
        print(end_sk - start_sk)
示例#15
0
    def test_speed_vs_sk(self):
        from sklearn.cluster import KMeans as skKMeans
        n_samples = 100000
        centers = 10
        X, true_labels = make_blobs(n_samples=n_samples,
                                    centers=centers,
                                    cluster_std=1.,
                                    random_state=42)

        kmeans_h2o = KMeans(n_gpus=1, n_clusters=centers, random_state=42)
        # Warmup - during first call CUDA kernels take ~2sec to load
        kmeans_h2o.fit(X)
        start_h2o = time.time()
        kmeans_h2o.fit(X)
        end_h2o = time.time()

        kmeans_sk = skKMeans(n_init=1,
                             n_clusters=centers,
                             init='random',
                             algorithm='full',
                             n_jobs=-1)
        start_sk = time.time()
        kmeans_sk.fit(X)
        end_sk = time.time()

        assert end_h2o - start_h2o <= end_sk - start_sk
示例#16
0
    def test_speed_vs_sk(self, order):
        from h2o4gpu.cluster import KMeansSklearn as skKMeans
        n_samples = 100000
        centers = 10
        X, true_labels = make_blobs(n_samples=n_samples,
                                    centers=centers,
                                    cluster_std=1.,
                                    random_state=42)

        kmeans_h2o = KMeans(n_gpus=1, n_clusters=centers, random_state=42)
        # Warmup - during first call CUDA kernels take ~2sec to load
        kmeans_h2o.fit(X)
        start_h2o = time.time()
        kmeans_h2o.fit(X)
        end_h2o = time.time()

        if os.getenv("CHECKPERFORMANCE") is not None:
            kmeans_sk = skKMeans(n_init=1,
                                 n_clusters=centers,
                                 algorithm='full',
                                 n_jobs=-1)
            start_sk = time.time()
            kmeans_sk.fit(X)
            end_sk = time.time()
            assert end_h2o - start_h2o <= end_sk - start_sk
示例#17
0
    def test_predict_iris(self):
        X = load_iris().data
        model = KMeans(n_gpus=1, n_clusters=4, random_state=123456).fit(X)

        assert all(model.labels_ == model.predict(X))
示例#18
0
    def test_predict_iris(self, order):
        X = np.asanyarray(load_iris().data, order=order)
        model = KMeans(n_gpus=1, n_clusters=4, random_state=123456).fit(X)

        assert all(model.labels_ == model.predict(X))
示例#19
0
    def test_predict_iris(self):
        X = load_iris().data
        model = KMeans(n_gpus=1, n_clusters=4, random_state=123456).fit(X)

        assert all(model.labels_ == model.predict(X))