예제 #1
0
def test_dbscan_precomputed(datatype, nrows, max_mbytes_per_batch, out_dtype,
                            client):
    """Check Dask cuML DBSCAN against scikit-learn on precomputed distances.

    Both estimators are fitted with ``metric='precomputed'`` on the same
    pairwise-distance matrix (cast to ``datatype``). The core sample indices
    must be identical and the labels are compared with
    ``assert_dbscan_equal``.

    ``client`` is not referenced in the body; presumably it is a fixture that
    provides the Dask cluster -- confirm against the test module's conftest.
    """
    from cuml.dask.cluster.dbscan import DBSCAN as cuDBSCAN

    eps = 1
    # Settings that both implementations must agree on.
    shared_kwargs = dict(eps=eps, min_samples=2, metric='precomputed')

    # Two features only, so the distance matrix is cheap to build.
    X, _ = make_blobs(n_samples=nrows,
                      n_features=2,
                      cluster_std=0.01,
                      random_state=0)
    distances = pairwise_distances(X).astype(datatype)

    gpu_model = cuDBSCAN(max_mbytes_per_batch=max_mbytes_per_batch,
                         output_type='numpy',
                         **shared_kwargs)
    gpu_labels = gpu_model.fit_predict(distances, out_dtype=out_dtype)

    cpu_model = skDBSCAN(algorithm="brute", **shared_kwargs)
    cpu_labels = cpu_model.fit_predict(distances)

    # Core points have to match exactly.
    gpu_core = gpu_model.core_sample_indices_
    assert array_equal(gpu_core, cpu_model.core_sample_indices_)

    # Label comparison is done on the raw coordinates, not the distances.
    assert_dbscan_equal(cpu_labels, gpu_labels, X, gpu_core, eps)
예제 #2
0
def test_dbscan_propagation(datatype, use_handle, out_dtype, n_samples):
    """Compare cuML and scikit-learn DBSCAN on one wide, sparse blob.

    A single blob with ``cluster_std=8.0`` is clustered with ``eps=0.5`` and
    ``min_samples=5`` by both libraries. Core sample indices must be equal and
    the labels are validated with ``assert_dbscan_equal``.
    """
    eps = 0.5
    min_samples = 5

    points, _ = make_blobs(n_samples,
                           centers=1,
                           cluster_std=8.0,
                           center_box=(-100.0, 100.0),
                           random_state=8)
    X = points.astype(datatype)

    # Only the handle is needed here; the stream is not used by this test.
    handle, _stream = get_handle(use_handle)

    gpu_model = cuDBSCAN(handle=handle,
                         eps=eps,
                         min_samples=min_samples,
                         output_type='numpy')
    gpu_labels = gpu_model.fit_predict(X, out_dtype=out_dtype)

    cpu_model = skDBSCAN(eps=eps, min_samples=min_samples)
    cpu_labels = cpu_model.fit_predict(X)

    # Core points have to match exactly.
    gpu_core = gpu_model.core_sample_indices_
    assert array_equal(gpu_core, cpu_model.core_sample_indices_)

    # Labels are checked by the shared helper.
    assert_dbscan_equal(cpu_labels, gpu_labels, X, gpu_core, eps)
예제 #3
0
def test_dbscan(datatype, nrows, ncols, max_mbytes_per_batch, out_dtype,
                client):
    """Check Dask cuML DBSCAN on tight blobs against scikit-learn.

    cuML is always fitted. The scikit-learn comparison (core sample indices
    and labels) only runs when ``nrows < 500000``. The dtype of the returned
    labels is then checked against ``out_dtype`` for the int32/int64 cases.

    ``datatype`` and ``client`` are not referenced in the body; ``client`` is
    presumably the Dask cluster fixture -- confirm against conftest.
    """
    from cuml.dask.cluster.dbscan import DBSCAN as cuDBSCAN

    X, _ = make_blobs(n_samples=nrows,
                      cluster_std=0.01,
                      n_features=ncols,
                      random_state=0)

    # Single source of truth for the clustering settings, so the reference
    # model and the label check cannot drift away from the cuML run.
    eps = 1
    min_samples = 2

    cuml_dbscan = cuDBSCAN(eps=eps,
                           min_samples=min_samples,
                           max_mbytes_per_batch=max_mbytes_per_batch,
                           output_type='numpy')

    cu_labels = cuml_dbscan.fit_predict(X, out_dtype=out_dtype)

    if nrows < 500000:
        sk_dbscan = skDBSCAN(eps=eps,
                             min_samples=min_samples,
                             algorithm="brute")
        sk_labels = sk_dbscan.fit_predict(X)

        # Check the core points are equal
        assert array_equal(cuml_dbscan.core_sample_indices_,
                           sk_dbscan.core_sample_indices_)

        # Check the labels are correct
        assert_dbscan_equal(sk_labels, cu_labels, X,
                            cuml_dbscan.core_sample_indices_, eps)

    # The requested output dtype may be given as a string or a NumPy type.
    if out_dtype == "int32" or out_dtype == np.int32:
        assert cu_labels.dtype == np.int32
    elif out_dtype == "int64" or out_dtype == np.int64:
        assert cu_labels.dtype == np.int64
예제 #4
0
def test_dbscan_default(name, client):
    """Check Dask cuML DBSCAN with default hyper-parameters.

    cuML is constructed without ``eps``/``min_samples``; the scikit-learn
    reference is given the explicit values below, and the same ``eps`` is
    used for the label check.

    ``client`` is not referenced in the body; presumably it is the Dask
    cluster fixture -- confirm against conftest.
    """
    from cuml.dask.cluster.dbscan import DBSCAN as cuDBSCAN

    # NOTE(review): these are expected to equal cuML's documented DBSCAN
    # defaults (eps=0.5, min_samples=5) -- confirm if the defaults change.
    eps = 0.5
    min_samples = 5

    n_samples = 500
    pat = get_pattern(name, n_samples)
    X, _ = pat[0]

    X = StandardScaler().fit_transform(X)

    cuml_dbscan = cuDBSCAN(output_type='numpy')
    cu_labels = cuml_dbscan.fit_predict(X)

    # Use the same eps as the label check below. Per-pattern parameter
    # overrides from get_pattern must not leak into the reference model,
    # because cuML is deliberately run with its defaults here.
    sk_dbscan = skDBSCAN(eps=eps, min_samples=min_samples)
    sk_labels = sk_dbscan.fit_predict(X)

    # Check the core points are equal
    assert array_equal(cuml_dbscan.core_sample_indices_,
                       sk_dbscan.core_sample_indices_)

    # Check the labels are correct
    assert_dbscan_equal(sk_labels, cu_labels, X,
                        cuml_dbscan.core_sample_indices_, eps)
예제 #5
0
def test_core_point_prop3():
    """Compare cuML and scikit-learn DBSCAN on two linked plus-shaped stars.

    The ten points form two plus/star shapes centred on (1, 0) and (4, 0),
    joined along the x-axis through (2, 0) and (3, 0):
        .     .
      . . . . . .
        .     .
    Unlike a left-to-right listing, (2, 0) is placed last in the array, so
    the points are presented to the algorithms in a different order.

    NOTE(review): an earlier comment on this test described a shape with a
    single shared link point and said that point was excluded from the
    comparison; neither matches the data or the assertions here, which
    compare every point.
    """
    eps, min_samples = 1.1, 4

    coords = [[0, 0], [1, 0], [1, 1], [1, -1], [3, 0], [4, 0], [4, 1],
              [4, -1], [5, 0], [2, 0]]
    X = np.array(coords, dtype=np.float32)

    gpu_model = cuDBSCAN(eps=eps, min_samples=min_samples)
    gpu_labels = gpu_model.fit_predict(X)

    cpu_model = skDBSCAN(eps=eps, min_samples=min_samples)
    cpu_labels = cpu_model.fit_predict(X)

    # Core points have to match exactly.
    gpu_core = gpu_model.core_sample_indices_
    assert array_equal(gpu_core, cpu_model.core_sample_indices_)

    # Labels are checked by the shared helper.
    assert_dbscan_equal(cpu_labels, gpu_labels, X, gpu_core, eps)
예제 #6
0
def test_core_point_prop2():
    """Compare cuML and scikit-learn DBSCAN on two adjacent stars.

    The input looks like a long two-barred (orthodox) cross, or two stars
    next to each other:
        .     .
      . . . . . .
        .     .
    There are 2 core points, but they are not reachable from each other, so
    two clusters are expected, each in the form of a plus/star.
    """
    eps, min_samples = 1.1, 4

    coords = [[0, 0], [1, 0], [1, 1], [1, -1], [2, 0], [3, 0], [4, 0],
              [4, 1], [4, -1], [5, 0]]
    X = np.array(coords, dtype=np.float32)

    gpu_model = cuDBSCAN(eps=eps, min_samples=min_samples)
    gpu_labels = gpu_model.fit_predict(X)

    cpu_model = skDBSCAN(eps=eps, min_samples=min_samples)
    cpu_labels = cpu_model.fit_predict(X)

    # Core points have to match exactly.
    gpu_core = gpu_model.core_sample_indices_
    assert array_equal(gpu_core, cpu_model.core_sample_indices_)

    # Labels are checked by the shared helper.
    assert_dbscan_equal(cpu_labels, gpu_labels, X, gpu_core, eps)
예제 #7
0
def test_core_point_prop1():
    """Compare cuML and scikit-learn DBSCAN on a star with a trailing chain.

    The input looks like a latin cross, or a star with a chain:
        .
      . . . . .
        .
    There is 1 core point (the intersection of the bars) and the two points
    on the far right are not reachable from it, so the expected result is one
    cluster (the plus/star on the left) and two noise points.
    """
    eps, min_samples = 1.1, 4

    coords = [[0, 0], [1, 0], [1, 1], [1, -1], [2, 0], [3, 0], [4, 0]]
    X = np.array(coords, dtype=np.float32)

    gpu_model = cuDBSCAN(eps=eps, min_samples=min_samples)
    gpu_labels = gpu_model.fit_predict(X)

    cpu_model = skDBSCAN(eps=eps, min_samples=min_samples)
    cpu_labels = cpu_model.fit_predict(X)

    # Core points have to match exactly.
    gpu_core = gpu_model.core_sample_indices_
    assert array_equal(gpu_core, cpu_model.core_sample_indices_)

    # Labels are checked by the shared helper.
    assert_dbscan_equal(cpu_labels, gpu_labels, X, gpu_core, eps)
예제 #8
0
def test_dbscan(datatype, use_handle, nrows, ncols, max_mbytes_per_batch,
                out_dtype):
    """Check cuML DBSCAN on tight blobs against scikit-learn.

    For the 500000-row case on a GPU with less than 32 (``max_gpu_memory``
    units) of memory, the row count is scaled down when
    ``pytest.adapt_stress_test`` is set, otherwise the test is skipped.
    cuML is always fitted. The scikit-learn comparison (core sample indices
    and labels) only runs when the effective ``nrows`` is below 500000. The
    dtype of the returned labels is then checked for int32/int64.

    ``datatype`` is not referenced in the body.
    """
    if nrows == 500000 and pytest.max_gpu_memory < 32:
        if pytest.adapt_stress_test:
            # Scale the problem size in proportion to the available memory.
            nrows = nrows * pytest.max_gpu_memory // 32
        else:
            pytest.skip("Insufficient GPU memory for this test. "
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    X, _ = make_blobs(n_samples=nrows,
                      cluster_std=0.01,
                      n_features=ncols,
                      random_state=0)

    # Only the handle is needed; the stream is not used by this test.
    handle, _ = get_handle(use_handle)

    # Single source of truth for the clustering settings, so the reference
    # model and the label check cannot drift away from the cuML run.
    eps = 1
    min_samples = 2

    cuml_dbscan = cuDBSCAN(handle=handle,
                           eps=eps,
                           min_samples=min_samples,
                           max_mbytes_per_batch=max_mbytes_per_batch,
                           output_type='numpy')

    cu_labels = cuml_dbscan.fit_predict(X, out_dtype=out_dtype)

    if nrows < 500000:
        sk_dbscan = skDBSCAN(eps=eps,
                             min_samples=min_samples,
                             algorithm="brute")
        sk_labels = sk_dbscan.fit_predict(X)

        # Check the core points are equal
        assert array_equal(cuml_dbscan.core_sample_indices_,
                           sk_dbscan.core_sample_indices_)

        # Check the labels are correct
        assert_dbscan_equal(sk_labels, cu_labels, X,
                            cuml_dbscan.core_sample_indices_, eps)

    # The requested output dtype may be given as a string or a NumPy type.
    if out_dtype == "int32" or out_dtype == np.int32:
        assert cu_labels.dtype == np.int32
    elif out_dtype == "int64" or out_dtype == np.int64:
        assert cu_labels.dtype == np.int64
예제 #9
0
def test_dbscan_sklearn_comparison(name, nrows, eps):
    """Compare cuML and scikit-learn DBSCAN on a named synthetic pattern.

    For the 500000-row ``'blobs'`` case on a GPU with less than 32
    (``max_gpu_memory`` units) of memory, the row count is scaled down when
    ``pytest.adapt_stress_test`` is set, otherwise the test is skipped.
    cuML is always fitted with the given ``eps`` and ``min_samples=5``. The
    scikit-learn comparison only runs when the effective ``nrows`` is below
    500000.
    """
    if nrows == 500000 and name == 'blobs' and pytest.max_gpu_memory < 32:
        if pytest.adapt_stress_test:
            # Scale the problem size in proportion to the available memory.
            nrows = nrows * pytest.max_gpu_memory // 32
        else:
            # Adjacent literals are concatenated, so the first one needs the
            # trailing space to keep the two sentences apart.
            pytest.skip("Insufficient GPU memory for this test. "
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    min_samples = 5

    # Only the data part of the pattern is used; both models are driven by
    # the ``eps`` argument of this test.
    pat = get_pattern(name, nrows)
    X, _ = pat[0]

    X = StandardScaler().fit_transform(X)

    cuml_dbscan = cuDBSCAN(eps=eps,
                           min_samples=min_samples,
                           output_type='numpy')
    cu_labels = cuml_dbscan.fit_predict(X)

    if nrows < 500000:
        sk_dbscan = skDBSCAN(eps=eps, min_samples=min_samples)
        sk_labels = sk_dbscan.fit_predict(X)

        # Check the core points are equal
        assert array_equal(cuml_dbscan.core_sample_indices_,
                           sk_dbscan.core_sample_indices_)

        # Check the labels are correct
        assert_dbscan_equal(sk_labels, cu_labels, X,
                            cuml_dbscan.core_sample_indices_, eps)
예제 #10
0
def test_dbscan_propagation(datatype, out_dtype, client):
    """Compare Dask cuML and scikit-learn DBSCAN on one wide, sparse blob.

    A single 5000-sample blob with ``cluster_std=8.0`` is clustered with
    ``eps=0.5`` and ``min_samples=5`` by both libraries. Core sample indices
    must be equal and the labels are validated with ``assert_dbscan_equal``.

    ``client`` is not referenced in the body; presumably it is the Dask
    cluster fixture -- confirm against conftest.
    """
    from cuml.dask.cluster.dbscan import DBSCAN as cuDBSCAN

    eps = 0.5
    min_samples = 5

    points, _ = make_blobs(5000,
                           centers=1,
                           cluster_std=8.0,
                           center_box=(-100.0, 100.0),
                           random_state=8)
    X = points.astype(datatype)

    gpu_model = cuDBSCAN(eps=eps,
                         min_samples=min_samples,
                         output_type='numpy')
    gpu_labels = gpu_model.fit_predict(X, out_dtype=out_dtype)

    cpu_model = skDBSCAN(eps=eps, min_samples=min_samples)
    cpu_labels = cpu_model.fit_predict(X)

    # Core points have to match exactly.
    gpu_core = gpu_model.core_sample_indices_
    assert array_equal(gpu_core, cpu_model.core_sample_indices_)

    # Labels are checked by the shared helper.
    assert_dbscan_equal(cpu_labels, gpu_labels, X, gpu_core, eps)