Exemplo n.º 1
0
def test_hdbscan_cluster_patterns_extract_clusters(
        dataset, nrows, connectivity, cluster_selection_epsilon,
        cluster_selection_method, min_cluster_size, allow_single_cluster,
        max_cluster_size, min_samples):
    """Check cluster extraction when fed a reference condensed tree.

    ``hdbscan.HDBSCAN`` is fitted on a pattern dataset and its
    ``condensed_tree_`` is handed to ``HDBSCAN._extract_clusters``. The
    labels must agree exactly (adjusted Rand score of 1.0) and the
    membership probabilities must be close to the reference ones.

    ``connectivity`` is accepted but not used in the body.
    """
    # The pattern data also exercises duplicate data points.
    points, _ = get_pattern(dataset, nrows)[0]

    # Settings that both estimators receive unchanged.
    shared_params = dict(
        allow_single_cluster=allow_single_cluster,
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
        cluster_selection_epsilon=cluster_selection_epsilon,
        cluster_selection_method=cluster_selection_method,
    )

    # Only this estimator is given max_cluster_size; it is never fitted here.
    extractor = HDBSCAN(verbose=logger.level_info,
                        max_cluster_size=max_cluster_size,
                        **shared_params)

    reference = hdbscan.HDBSCAN(approx_min_span_tree=False,
                                gen_min_span_tree=True,
                                algorithm="generic",
                                **shared_params)
    reference.fit(cp.asnumpy(points))

    # Run only the extraction step, using the reference condensed tree.
    extractor._extract_clusters(reference.condensed_tree_)

    label_agreement = adjusted_rand_score(extractor.labels_test,
                                          reference.labels_)
    assert label_agreement == 1.0

    extracted_probabilities = cp.asnumpy(extractor.probabilities_test)
    assert np.allclose(extracted_probabilities, reference.probabilities_)
Exemplo n.º 2
0
def test_hdbscan_sklearn_datasets(dataset, connectivity,
                                  cluster_selection_epsilon,
                                  cluster_selection_method,
                                  min_samples_cluster_size_bounds,
                                  allow_single_cluster):
    """Compare ``HDBSCAN`` with the reference ``hdbscan.HDBSCAN`` on a dataset.

    ``dataset`` must expose its samples as ``dataset.data``.
    ``min_samples_cluster_size_bounds`` is unpacked into
    ``(min_samples, min_cluster_size, max_cluster_size)``.
    ``connectivity`` is accepted but not used in the body.

    Both estimators are fitted on the same data, then the condensed tree,
    cluster counts, labels and cluster persistence are compared.
    """
    (min_samples,
     min_cluster_size,
     max_cluster_size) = min_samples_cluster_size_bounds

    samples = dataset.data

    # Settings that both estimators receive unchanged.
    shared_params = dict(
        allow_single_cluster=allow_single_cluster,
        gen_min_span_tree=True,
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
        cluster_selection_epsilon=cluster_selection_epsilon,
        cluster_selection_method=cluster_selection_method,
    )

    # max_cluster_size is only forwarded to the estimator under test.
    estimator = HDBSCAN(verbose=logger.level_info,
                        max_cluster_size=max_cluster_size,
                        **shared_params)
    estimator.fit(samples)

    reference = hdbscan.HDBSCAN(approx_min_span_tree=False,
                                algorithm="generic",
                                **shared_params)
    reference.fit(cp.asnumpy(samples))

    assert_condensed_trees(reference, min_cluster_size)
    assert_cluster_counts(reference, estimator)

    # Same number of distinct labels, and a high partition agreement.
    n_reference_labels = len(np.unique(reference.labels_))
    n_estimator_labels = len(cp.unique(estimator.labels_))
    assert n_reference_labels == n_estimator_labels

    agreement = adjusted_rand_score(estimator.labels_, reference.labels_)
    assert agreement > 0.85

    # Persistence values are sorted first, so cluster ordering is ignored.
    reference_persistence = np.sort(reference.cluster_persistence_)
    estimator_persistence = np.sort(estimator.cluster_persistence_)
    assert np.allclose(reference_persistence, estimator_persistence,
                       rtol=0.1, atol=0.1)
Exemplo n.º 3
0
def test_hdbscan_blobs(nrows, ncols, nclusters, connectivity,
                       cluster_selection_epsilon, cluster_selection_method,
                       allow_single_cluster, min_cluster_size,
                       max_cluster_size, min_samples):
    """Compare ``HDBSCAN`` with the reference ``hdbscan.HDBSCAN`` on blobs.

    The data comes from ``make_blobs`` with ``nrows`` samples, ``ncols``
    features and ``nclusters`` centers (fixed seed, no shuffling). Both
    estimators are fitted on it, then the condensed tree, cluster counts,
    labels and cluster persistence are compared.

    ``connectivity`` is accepted but not used in the body.
    """
    blob_points, _ = make_blobs(n_samples=int(nrows),
                                n_features=ncols,
                                centers=nclusters,
                                cluster_std=0.7,
                                shuffle=False,
                                random_state=42)

    # Settings that both estimators receive unchanged.
    shared_params = dict(
        allow_single_cluster=allow_single_cluster,
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
        cluster_selection_epsilon=cluster_selection_epsilon,
        cluster_selection_method=cluster_selection_method,
    )

    # max_cluster_size is only forwarded to the estimator under test.
    estimator = HDBSCAN(verbose=logger.level_info,
                        max_cluster_size=max_cluster_size,
                        **shared_params)
    estimator.fit(blob_points)

    reference = hdbscan.HDBSCAN(approx_min_span_tree=False,
                                gen_min_span_tree=True,
                                algorithm="generic",
                                **shared_params)
    reference.fit(cp.asnumpy(blob_points))

    assert_condensed_trees(reference, min_cluster_size)
    assert_cluster_counts(reference, estimator)

    agreement = adjusted_rand_score(estimator.labels_, reference.labels_)
    assert agreement >= 0.95

    n_reference_labels = len(np.unique(reference.labels_))
    n_estimator_labels = len(cp.unique(estimator.labels_))
    assert n_reference_labels == n_estimator_labels

    # Persistence values are sorted first, so cluster ordering is ignored.
    reference_persistence = np.sort(reference.cluster_persistence_)
    estimator_persistence = np.sort(estimator.cluster_persistence_)
    assert np.allclose(reference_persistence, estimator_persistence,
                       rtol=0.01, atol=0.01)
Exemplo n.º 4
0
def test_hdbscan_empty_cluster_tree():
    """Extract clusters from a condensed tree that has only a root cluster.

    The hand-built tree holds five size-1 children (0..4) that all hang off
    parent 5 with ``lambda_val`` 1.0. After ``_extract_clusters`` the labels
    are required to sum to zero.
    """
    n_points = 5

    tree_records = np.recarray(
        shape=(n_points, ),
        formats=[np.intp, np.intp, float, np.intp],
        names=('parent', 'child', 'lambda_val', 'child_size'))

    # One column per record field; every row is a leaf under parent 5.
    field_values = {
        'parent': np.asarray([5, 5, 5, 5, 5]),
        'child': [0, 1, 2, 3, 4],
        'lambda_val': [1.0, 1.0, 1.0, 1.0, 1.0],
        'child_size': [1, 1, 1, 1, 1],
    }
    for field_name, values in field_values.items():
        tree_records[field_name] = values

    root_only_tree = CondensedTree(tree_records, 0.0, True)

    estimator = HDBSCAN(allow_single_cluster=True,
                        cluster_selection_method="eom")
    estimator._extract_clusters(root_only_tree)

    # We only care that every point ends up in the root cluster.
    host_labels = estimator.labels_test.to_output("numpy")
    assert np.sum(host_labels) == 0
Exemplo n.º 5
0
def test_hdbscan_cluster_patterns(dataset, nrows, connectivity,
                                  cluster_selection_epsilon,
                                  cluster_selection_method, min_cluster_size,
                                  allow_single_cluster, max_cluster_size,
                                  min_samples):
    """Compare ``HDBSCAN`` with the reference ``hdbscan.HDBSCAN`` on patterns.

    The data comes from ``get_pattern(dataset, nrows)``. Both estimators are
    fitted on it, then the condensed tree, cluster counts, labels and
    cluster persistence are compared.

    ``connectivity`` is accepted but not used in the body.
    """
    # The pattern data also exercises duplicate data points.
    points, _ = get_pattern(dataset, nrows)[0]

    # Settings that both estimators receive unchanged.
    shared_params = dict(
        allow_single_cluster=allow_single_cluster,
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
        cluster_selection_epsilon=cluster_selection_epsilon,
        cluster_selection_method=cluster_selection_method,
    )

    # max_cluster_size is only forwarded to the estimator under test.
    estimator = HDBSCAN(verbose=logger.level_info,
                        max_cluster_size=max_cluster_size,
                        **shared_params)
    estimator.fit(points)

    reference = hdbscan.HDBSCAN(approx_min_span_tree=False,
                                gen_min_span_tree=True,
                                algorithm="generic",
                                **shared_params)
    reference.fit(cp.asnumpy(points))

    assert_condensed_trees(reference, min_cluster_size)
    assert_cluster_counts(reference, estimator)

    # Same number of distinct labels, and a high partition agreement.
    n_reference_labels = len(np.unique(reference.labels_))
    n_estimator_labels = len(cp.unique(estimator.labels_))
    assert n_reference_labels == n_estimator_labels

    agreement = adjusted_rand_score(estimator.labels_, reference.labels_)
    assert agreement > 0.95

    # Persistence values are sorted first, so cluster ordering is ignored.
    reference_persistence = np.sort(reference.cluster_persistence_)
    estimator_persistence = np.sort(estimator.cluster_persistence_)
    assert np.allclose(reference_persistence, estimator_persistence,
                       rtol=0.1, atol=0.1)
Exemplo n.º 6
0
def test_hdbscan_core_dists_bug_4054():
    """
    Regression test for the minimal reproducer reported in
    https://github.com/rapidsai/cuml/issues/4054

    Both ``HDBSCAN`` and the reference ``hdbscan.HDBSCAN`` label the same
    noisy two-moons dataset; the adjusted Rand score between the two
    labelings must exceed 0.99.
    """
    moons, _ = datasets.make_moons(n_samples=10000,
                                   noise=0.12,
                                   random_state=0)

    # The reproducer uses the same value for both size parameters.
    size_params = dict(min_samples=25, min_cluster_size=25)

    labels_under_test = HDBSCAN(**size_params).fit_predict(moons)

    reference = hdbscan.HDBSCAN(approx_min_span_tree=False, **size_params)
    reference_labels = reference.fit_predict(moons)

    agreement = adjusted_rand_score(labels_under_test, reference_labels)
    assert agreement > 0.99
Exemplo n.º 7
0
def test_hdbscan_plots():
    """Check which tree attributes are populated after fitting.

    With ``gen_min_span_tree=True`` the condensed tree, minimum spanning
    tree and single linkage tree must all be set; with
    ``gen_min_span_tree=False`` the minimum spanning tree must be ``None``.
    """
    blob_points, _ = make_blobs(n_samples=100,
                                n_features=100,
                                centers=10,
                                cluster_std=0.7,
                                shuffle=False,
                                random_state=42)

    with_mst = HDBSCAN(gen_min_span_tree=True)
    with_mst.fit(blob_points)

    assert with_mst.condensed_tree_ is not None
    assert with_mst.minimum_spanning_tree_ is not None
    assert with_mst.single_linkage_tree_ is not None

    # A fresh estimator fitted without the spanning tree option.
    without_mst = HDBSCAN(gen_min_span_tree=False)
    without_mst.fit(blob_points)

    assert without_mst.minimum_spanning_tree_ is None