Пример #1
0
    def test_speed_vs_sk(self):
        """Check that a warmed-up GPU KMeans fit is not slower than KMeansSklearn.

        The GPU model is always fitted twice (one warm-up run plus one timed
        run), so the test also acts as a smoke test. The timing comparison
        against ``KMeansSklearn`` is only performed when the
        ``CHECKPERFORMANCE`` environment variable is set.
        """
        from h2o4gpu.cluster import KMeansSklearn as skKMeans
        n_samples = 100000
        centers = 10
        # The generated labels are irrelevant for a pure timing comparison.
        X, _ = make_blobs(n_samples=n_samples,
                          centers=centers,
                          cluster_std=1.,
                          random_state=42)

        kmeans_h2o = KMeans(n_gpus=1, n_clusters=centers, random_state=42)
        # Warmup - during first call CUDA kernels take ~2sec to load
        kmeans_h2o.fit(X)
        # perf_counter() is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments the way time.time() can.
        start_h2o = time.perf_counter()
        kmeans_h2o.fit(X)
        elapsed_h2o = time.perf_counter() - start_h2o

        if os.getenv("CHECKPERFORMANCE") is not None:
            kmeans_sk = skKMeans(n_init=1,
                                 n_clusters=centers,
                                 algorithm='full',
                                 n_jobs=-1)
            start_sk = time.perf_counter()
            kmeans_sk.fit(X)
            elapsed_sk = time.perf_counter() - start_sk
            assert elapsed_h2o <= elapsed_sk, \
                'KMeans fit took {0:.3f}s, KMeansSklearn fit took {1:.3f}s'.format(
                    elapsed_h2o, elapsed_sk)
Пример #2
0
    def _test_accuracy(self,
                       order,
                       n_samples=500000,
                       centers=10,
                       n_features=2):
        """Compare GPU KMeans clustering quality against KMeansSklearn.

        Both models are fitted on the same synthetic blobs and scored with
        ``v_measure_score`` against the generated labels. The GPU score may
        exceed the reference score by any amount, but may fall below it by
        at most ``0.05 + 0.1 * |reference score|``.

        :param order: memory layout passed to ``np.asanyarray`` for the
            input matrix.
        :param n_samples: number of samples to generate.
        :param centers: number of blob centers, also used as ``n_clusters``.
        :param n_features: number of features per sample.
        """
        from h2o4gpu.cluster import KMeansSklearn as skKMeans
        X, true_labels = make_blobs(n_samples=n_samples,
                                    centers=centers,
                                    n_features=n_features,
                                    cluster_std=2.,
                                    random_state=42)

        X = np.asanyarray(X, order=order)

        kmeans_h2o = KMeans(n_gpus=1,
                            n_clusters=centers,
                            random_state=42,
                            verbose=1000)
        kmeans_h2o.fit(X)
        kmeans_sk = skKMeans(n_init=1, n_clusters=centers, random_state=42)
        kmeans_sk.fit(X)

        accuracy_h2o = v_measure_score(kmeans_h2o.labels_, true_labels)
        accuracy_sk = v_measure_score(kmeans_sk.labels_, true_labels)
        # We also want to be either better or at most 10% worse than SKLearn
        # Everything else is horrible and we probably should fix something
        # TODO: it's failing with lower rtol, find out why it's so inaccurate
        #
        # The check is deliberately one-sided: a symmetric closeness test
        # would also fail when the GPU result is clearly better than the
        # reference. The allowed shortfall is atol + rtol * |reference|.
        rtol = 0.1
        atol = 0.05
        allowed_shortfall = atol + rtol * abs(accuracy_sk)
        assert accuracy_h2o >= accuracy_sk - allowed_shortfall, \
            'Accuracy error {0} {1} n_samples={2}, centers={3}, n_features={4}, order={5}'.format(
                accuracy_h2o, accuracy_sk, n_samples, centers, n_features, order)
Пример #3
0
    def test_accuracy(self):
        """Compare GPU KMeans clustering quality against KMeansSklearn.

        Both models are fitted on the same synthetic blobs and scored with
        ``v_measure_score`` against the generated labels. The GPU score may
        trail the reference score by at most 0.1.
        """
        from h2o4gpu.cluster import KMeansSklearn

        n_clusters = 10
        data, expected = make_blobs(n_samples=500000,
                                    centers=n_clusters,
                                    cluster_std=1.,
                                    random_state=42)

        gpu_model = KMeans(n_gpus=1, n_clusters=n_clusters, random_state=42)
        gpu_model.fit(data)
        reference_model = KMeansSklearn(n_init=1,
                                        n_clusters=n_clusters,
                                        random_state=42)
        reference_model.fit(data)

        gpu_score = v_measure_score(gpu_model.labels_, expected)
        reference_score = v_measure_score(reference_model.labels_, expected)

        # Scoring higher than the reference is fine; scoring more than 0.1
        # lower means something is likely broken and should be investigated.
        score_gap = gpu_score - reference_score
        assert score_gap >= -0.1