Example #1
def test_hdbscan_cluster_patterns_extract_clusters(
        dataset, nrows, connectivity, cluster_selection_epsilon,
        cluster_selection_method, min_cluster_size, allow_single_cluster,
        max_cluster_size, min_samples):

    # This also tests duplicate data points
    X, y = get_pattern(dataset, nrows)[0]

    cuml_agg = HDBSCAN(verbose=logger.level_info,
                       allow_single_cluster=allow_single_cluster,
                       min_samples=min_samples,
                       max_cluster_size=max_cluster_size,
                       min_cluster_size=min_cluster_size,
                       cluster_selection_epsilon=cluster_selection_epsilon,
                       cluster_selection_method=cluster_selection_method)

    sk_agg = hdbscan.HDBSCAN(
        allow_single_cluster=allow_single_cluster,
        approx_min_span_tree=False,
        gen_min_span_tree=True,
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
        cluster_selection_epsilon=cluster_selection_epsilon,
        cluster_selection_method=cluster_selection_method,
        algorithm="generic")

    sk_agg.fit(cp.asnumpy(X))

    # Fit the cuML model first (as in Example #5), then re-extract cluster
    # assignments from the reference library's condensed tree
    cuml_agg.fit(X)

    cuml_agg._extract_clusters(sk_agg.condensed_tree_)

    assert adjusted_rand_score(cuml_agg.labels_test, sk_agg.labels_) == 1.0
    assert np.allclose(cp.asnumpy(cuml_agg.probabilities_test),
                       sk_agg.probabilities_)
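
The snippet above assumes the imports and test helpers of cuML's suite. A minimal sketch of what it needs to run standalone; the module paths are assumptions and can vary across cuML versions:

import numpy as np
import cupy as cp
import hdbscan                                # reference CPU implementation
from sklearn.metrics import adjusted_rand_score
from cuml.cluster import HDBSCAN
from cuml.common import logger                # newer releases: cuml.internals.logger
from cuml.testing.utils import get_pattern    # assumed location of the helper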
Example #2
def test_kmeans_sklearn_comparison_default(name, nrows, random_state):

    # Shared defaults for the get_pattern helper; only 'n_clusters' is used here
    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3
    }

    pat = get_pattern(name, nrows)

    params = default_base.copy()
    params.update(pat[1])

    cuml_kmeans = cuml.KMeans(n_clusters=params['n_clusters'],
                              random_state=random_state,
                              n_init=10,
                              output_type='numpy')

    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    # Fit cuML's KMeans and score it against the ground-truth labels
    cu_y_pred = cuml_kmeans.fit_predict(X)
    cu_score = adjusted_rand_score(cu_y_pred, y)

    # Fit scikit-learn's KMeans with the same settings as a reference
    kmeans = cluster.KMeans(random_state=random_state,
                            n_clusters=params['n_clusters'])
    sk_y_pred = kmeans.fit_predict(X)
    sk_score = adjusted_rand_score(sk_y_pred, y)

    # The two ARI scores should agree to within a small tolerance
    assert sk_score - 1e-2 <= cu_score <= sk_score + 1e-2
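
get_pattern is the fixture shared by all of these tests. A hypothetical stand-in, modeled on scikit-learn's cluster comparison demo, shows the shape of what it returns: a list holding an (X, y) pair and a dict of per-dataset parameter overrides.

from sklearn import datasets

def get_pattern(name, n_samples):
    # Hypothetical stand-in for cuML's test helper
    if name == 'noisy_circles':
        data = datasets.make_circles(n_samples=n_samples, factor=.5, noise=.05)
        params = {'n_clusters': 2}
    elif name == 'noisy_moons':
        data = datasets.make_moons(n_samples=n_samples, noise=.05)
        params = {'n_clusters': 2}
    else:  # e.g. 'blobs'
        data = datasets.make_blobs(n_samples=n_samples, random_state=8)
        params = {'n_clusters': 3}
    return [data, params]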
Example #3
def test_dbscan_default(name, client):
    from cuml.dask.cluster.dbscan import DBSCAN as cuDBSCAN

    eps = 0.5
    default_base = {
        'quantile': .3,
        'eps': eps,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 2
    }
    n_samples = 500
    pat = get_pattern(name, n_samples)
    params = default_base.copy()
    params.update(pat[1])
    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    # cuML's defaults (eps=0.5, min_samples=5) match the explicit
    # scikit-learn parameters below
    cuml_dbscan = cuDBSCAN(output_type='numpy')
    cu_labels = cuml_dbscan.fit_predict(X)

    sk_dbscan = skDBSCAN(eps=params['eps'], min_samples=5)
    sk_labels = sk_dbscan.fit_predict(X)

    # Check the core points are equal
    assert array_equal(cuml_dbscan.core_sample_indices_,
                       sk_dbscan.core_sample_indices_)

    # Check the labels are correct
    assert_dbscan_equal(sk_labels, cu_labels, X,
                        cuml_dbscan.core_sample_indices_, eps)
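
The client argument is a pytest fixture that provides a running Dask cluster for the cuml.dask estimator. A hedged sketch of such a fixture, assuming dask_cuda is installed (cuML's real conftest may differ):

import pytest
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

@pytest.fixture
def client():
    # Spin up a single-node GPU cluster for the duration of one test
    cluster = LocalCUDACluster()
    client = Client(cluster)
    yield client
    client.close()
    cluster.close()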
Example #4
def test_dbscan_no_calc_core_point_indices():

    params = {'eps': 1.1, 'min_samples': 4}
    n_samples = 1000
    pat = get_pattern("noisy_moons", n_samples)

    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    # Disable computation of the core point indices
    cuml_dbscan = cuDBSCAN(eps=params['eps'],
                           min_samples=params['min_samples'],
                           output_type='numpy',
                           calc_core_sample_indices=False)
    cuml_dbscan.fit_predict(X)

    # With the computation disabled, the attribute should be None
    assert cuml_dbscan.core_sample_indices_ is None
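
For contrast, with the default calc_core_sample_indices=True the attribute is populated. A short sketch under the same setup as the test above:

cuml_dbscan = cuDBSCAN(eps=params['eps'],
                       min_samples=params['min_samples'],
                       output_type='numpy')  # default: indices are computed
cuml_dbscan.fit_predict(X)
assert cuml_dbscan.core_sample_indices_ is not None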
Example #5
def test_hdbscan_cluster_patterns(dataset, nrows, connectivity,
                                  cluster_selection_epsilon,
                                  cluster_selection_method, min_cluster_size,
                                  allow_single_cluster, max_cluster_size,
                                  min_samples):

    # This also tests duplicate data points
    X, y = get_pattern(dataset, nrows)[0]

    cuml_agg = HDBSCAN(verbose=logger.level_info,
                       allow_single_cluster=allow_single_cluster,
                       min_samples=min_samples,
                       max_cluster_size=max_cluster_size,
                       min_cluster_size=min_cluster_size,
                       cluster_selection_epsilon=cluster_selection_epsilon,
                       cluster_selection_method=cluster_selection_method)

    cuml_agg.fit(X)

    sk_agg = hdbscan.HDBSCAN(
        allow_single_cluster=allow_single_cluster,
        approx_min_span_tree=False,
        gen_min_span_tree=True,
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
        cluster_selection_epsilon=cluster_selection_epsilon,
        cluster_selection_method=cluster_selection_method,
        algorithm="generic")

    sk_agg.fit(cp.asnumpy(X))

    assert_condensed_trees(sk_agg, min_cluster_size)
    assert_cluster_counts(sk_agg, cuml_agg)

    assert (len(np.unique(sk_agg.labels_)) == len(cp.unique(cuml_agg.labels_)))
    assert (adjusted_rand_score(cuml_agg.labels_, sk_agg.labels_) > 0.95)

    assert np.allclose(np.sort(sk_agg.cluster_persistence_),
                       np.sort(cuml_agg.cluster_persistence_),
                       rtol=0.1,
                       atol=0.1)
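
The test's long argument list is filled in by stacked pytest parametrizations. An illustrative grid (hypothetical values; the suite's actual grid is larger), attached to a stub for brevity:

import pytest

@pytest.mark.parametrize('dataset', ['noisy_moons', 'blobs'])
@pytest.mark.parametrize('cluster_selection_method', ['eom', 'leaf'])
@pytest.mark.parametrize('min_cluster_size', [10, 50])
@pytest.mark.parametrize('allow_single_cluster', [True, False])
def test_hdbscan_cluster_patterns_sketch(dataset, cluster_selection_method,
                                         min_cluster_size,
                                         allow_single_cluster):
    ...  # body as in Example #5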
Example #6
def test_dbscan_sklearn_comparison(name, nrows, eps):
    # Scale the stress test down, or skip it, when GPU memory is limited
    if nrows == 500000 and name == 'blobs' and pytest.max_gpu_memory < 32:
        if pytest.adapt_stress_test:
            nrows = nrows * pytest.max_gpu_memory // 32
        else:
            pytest.skip("Insufficient GPU memory for this test. "
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    default_base = {
        'quantile': .2,
        'eps': eps,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 2
    }
    n_samples = nrows
    pat = get_pattern(name, n_samples)
    params = default_base.copy()
    params.update(pat[1])
    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    cuml_dbscan = cuDBSCAN(eps=eps, min_samples=5, output_type='numpy')
    cu_labels = cuml_dbscan.fit_predict(X)

    # Only run the scikit-learn comparison at sizes the CPU handles quickly
    if nrows < 500000:
        sk_dbscan = skDBSCAN(eps=eps, min_samples=5)
        sk_labels = sk_dbscan.fit_predict(X)

        # Check the core points are equal
        assert array_equal(cuml_dbscan.core_sample_indices_,
                           sk_dbscan.core_sample_indices_)

        # Check the labels are correct
        assert_dbscan_equal(sk_labels, cu_labels, X,
                            cuml_dbscan.core_sample_indices_, eps)
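
Example #6 reads pytest.max_gpu_memory and pytest.adapt_stress_test, attributes the suite attaches to the pytest module at configure time. A hedged conftest.py sketch; the memory query via numba is an assumed stand-in for whatever cuML actually uses:

import os
import pytest
from numba import cuda

def pytest_configure(config):
    # Total memory of the current GPU, in GB
    free, total = cuda.current_context().get_memory_info()
    pytest.max_gpu_memory = total // (1024 ** 3)
    # Opt-in flag: scale stress tests down instead of skipping them
    pytest.adapt_stress_test = 'CUML_ADAPT_STRESS_TESTS' in os.environ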