def test_dbscan_precomputed(datatype, nrows, max_mbytes_per_batch, out_dtype,
                            client):
    """Compare Dask cuML DBSCAN with scikit-learn on precomputed distances.

    Blobs are generated in two dimensions, turned into a pairwise distance
    matrix cast to ``datatype``, and clustered by both implementations with
    ``metric='precomputed'``. Core sample indices and labels must agree.

    ``client`` is not referenced in the body; presumably it is a fixture
    that provides the Dask client -- confirm against the test configuration.
    """
    from cuml.dask.cluster.dbscan import DBSCAN as cuDBSCAN

    # Two features keep the distance matrix cheap to compute.
    points, _ = make_blobs(n_samples=nrows, cluster_std=0.01,
                           n_features=2, random_state=0)

    # Both estimators consume the distance matrix, not the raw points.
    distances = pairwise_distances(points).astype(datatype)

    eps = 1
    shared_kwargs = dict(eps=eps, min_samples=2, metric='precomputed')

    gpu_model = cuDBSCAN(max_mbytes_per_batch=max_mbytes_per_batch,
                         output_type='numpy', **shared_kwargs)
    gpu_labels = gpu_model.fit_predict(distances, out_dtype=out_dtype)

    cpu_model = skDBSCAN(algorithm="brute", **shared_kwargs)
    cpu_labels = cpu_model.fit_predict(distances)

    # Core points must be identical between the two implementations.
    assert array_equal(gpu_model.core_sample_indices_,
                       cpu_model.core_sample_indices_)

    # Label agreement is checked by the project helper on the raw points.
    assert_dbscan_equal(cpu_labels, gpu_labels, points,
                        gpu_model.core_sample_indices_, eps)
def test_dbscan_propagation(datatype, use_handle, out_dtype, n_samples):
    """Compare cuML DBSCAN with scikit-learn on one wide blob.

    A single blob with ``cluster_std=8.0`` is clustered with ``eps=0.5`` and
    ``min_samples=5`` by both implementations. Core sample indices and
    labels must agree.
    """
    points, _ = make_blobs(n_samples, centers=1, cluster_std=8.0,
                           center_box=(-100.0, 100.0), random_state=8)
    points = points.astype(datatype)

    # Only the handle is needed; the stream returned alongside it is unused.
    handle, _stream = get_handle(use_handle)

    eps = 0.5
    min_samples = 5

    gpu_model = cuDBSCAN(handle=handle, eps=eps, min_samples=min_samples,
                         output_type='numpy')
    gpu_labels = gpu_model.fit_predict(points, out_dtype=out_dtype)

    cpu_model = skDBSCAN(eps=eps, min_samples=min_samples)
    cpu_labels = cpu_model.fit_predict(points)

    # Core points must be identical between the two implementations.
    assert array_equal(gpu_model.core_sample_indices_,
                       cpu_model.core_sample_indices_)

    # Label agreement is checked by the project helper.
    assert_dbscan_equal(cpu_labels, gpu_labels, points,
                        gpu_model.core_sample_indices_, eps)
def test_dbscan(datatype, nrows, ncols, max_mbytes_per_batch, out_dtype,
                client):
    """Compare Dask cuML DBSCAN with scikit-learn on tight blobs.

    cuML always runs. The scikit-learn comparison is only made when
    ``nrows < 500000``. The dtype of the returned labels is checked against
    ``out_dtype`` for the int32 / int64 spellings handled below.

    ``client`` is not referenced in the body; presumably it is a fixture
    that provides the Dask client -- confirm against the test configuration.
    """
    from cuml.dask.cluster.dbscan import DBSCAN as cuDBSCAN

    # NOTE(review): ``datatype`` is accepted but never applied to X here,
    # unlike the propagation/precomputed tests -- confirm whether a cast
    # was intended.
    X, _ = make_blobs(n_samples=nrows, cluster_std=0.01,
                      n_features=ncols, random_state=0)

    # Single source of truth for the hyper-parameters shared by both
    # implementations and by the label check.
    eps = 1
    min_samples = 2

    cuml_dbscan = cuDBSCAN(eps=eps, min_samples=min_samples,
                           max_mbytes_per_batch=max_mbytes_per_batch,
                           output_type='numpy')
    cu_labels = cuml_dbscan.fit_predict(X, out_dtype=out_dtype)

    if nrows < 500000:
        # The reference uses the same ``eps`` variable (previously a
        # duplicated literal) so the two models cannot drift apart.
        sk_dbscan = skDBSCAN(eps=eps, min_samples=min_samples,
                             algorithm="brute")
        sk_labels = sk_dbscan.fit_predict(X)

        # Check the core points are equal
        assert array_equal(cuml_dbscan.core_sample_indices_,
                           sk_dbscan.core_sample_indices_)

        # Check the labels are correct
        assert_dbscan_equal(sk_labels, cu_labels, X,
                            cuml_dbscan.core_sample_indices_, eps)

    # The requested output dtype must be honoured regardless of data size.
    if out_dtype == "int32" or out_dtype == np.int32:
        assert cu_labels.dtype == np.int32
    elif out_dtype == "int64" or out_dtype == np.int64:
        assert cu_labels.dtype == np.int64
def test_dbscan_default(name, client):
    """Compare a default-constructed Dask cuML DBSCAN with scikit-learn.

    The cuML estimator receives only ``output_type='numpy'``. The
    scikit-learn reference uses ``min_samples=5`` and the ``'eps'`` entry of
    the merged parameter dictionary. The data is a 500-sample pattern from
    ``get_pattern``, standardised before clustering.

    ``client`` is not referenced in the body; presumably it is a fixture
    that provides the Dask client -- confirm against the test configuration.
    """
    from cuml.dask.cluster.dbscan import DBSCAN as cuDBSCAN

    eps = 0.5
    n_samples = 500

    baseline = {
        'quantile': .3,
        'eps': eps,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 2,
    }

    pattern = get_pattern(name, n_samples)

    # Pattern-specific settings take precedence over the baseline.
    settings = dict(baseline)
    settings.update(pattern[1])

    features, _ = pattern[0]
    features = StandardScaler().fit_transform(features)

    gpu_model = cuDBSCAN(output_type='numpy')
    gpu_labels = gpu_model.fit_predict(features)

    # NOTE(review): the reference takes eps from the merged settings while
    # the label check below uses the local ``eps`` and cuML uses its own
    # default; these only agree if the pattern does not override 'eps' and
    # the cuML default equals 0.5 -- confirm.
    cpu_model = skDBSCAN(eps=settings['eps'], min_samples=5)
    cpu_labels = cpu_model.fit_predict(features)

    # Core points must be identical between the two implementations.
    assert array_equal(gpu_model.core_sample_indices_,
                       cpu_model.core_sample_indices_)

    # Label agreement is checked by the project helper.
    assert_dbscan_equal(cpu_labels, gpu_labels, features,
                        gpu_model.core_sample_indices_, eps)
def test_core_point_prop3():
    """Compare cuML and scikit-learn DBSCAN on a ten-point star layout.

    Both estimators use ``eps=1.1`` and ``min_samples=4``. Core sample
    indices and labels must agree.
    """
    eps = 1.1
    min_samples = 4

    # The input is described as a two-barred (orthodox) cross or
    # two stars sharing a link:
    #   .   .
    # . . . . .
    #   .   .
    # There are 2 core-points but they are not reachable from each other,
    # so there should be two clusters.
    # However, the link that is shared between the stars is said to have
    # an ambiguous label (to the best of the original author's knowledge),
    # as it would depend on the order in which the core-points are
    # processed, so that point was meant to be excluded from the comparison
    # with sklearn.
    # TODO: the above text does not correspond to the actual test!
    # (The array below has six points on the middle row, x = 0..5, and all
    # of them are passed to the comparison helper.)
    coords = [[0, 0], [1, 0], [1, 1], [1, -1], [3, 0],
              [4, 0], [4, 1], [4, -1], [5, 0], [2, 0]]
    points = np.array(coords, dtype=np.float32)

    gpu_model = cuDBSCAN(eps=eps, min_samples=min_samples)
    gpu_labels = gpu_model.fit_predict(points)

    cpu_model = skDBSCAN(eps=eps, min_samples=min_samples)
    cpu_labels = cpu_model.fit_predict(points)

    # Core points must be identical between the two implementations.
    assert array_equal(gpu_model.core_sample_indices_,
                       cpu_model.core_sample_indices_)

    # Label agreement is checked by the project helper.
    assert_dbscan_equal(cpu_labels, gpu_labels, points,
                        gpu_model.core_sample_indices_, eps)
def test_core_point_prop2():
    """Compare cuML and scikit-learn DBSCAN on two neighbouring stars.

    Both estimators use ``eps=1.1`` and ``min_samples=4``. Core sample
    indices and labels must agree.
    """
    eps = 1.1
    min_samples = 4

    # The input looks like a long two-barred (orthodox) cross or
    # two stars next to each other:
    #   .     .
    # . . . . . .
    #   .     .
    # There are 2 core-points but they are not reachable from each other,
    # so there should be two clusters, both in the form of a plus/star.
    coords = [[0, 0], [1, 0], [1, 1], [1, -1], [2, 0],
              [3, 0], [4, 0], [4, 1], [4, -1], [5, 0]]
    points = np.array(coords, dtype=np.float32)

    gpu_model = cuDBSCAN(eps=eps, min_samples=min_samples)
    gpu_labels = gpu_model.fit_predict(points)

    cpu_model = skDBSCAN(eps=eps, min_samples=min_samples)
    cpu_labels = cpu_model.fit_predict(points)

    # Core points must be identical between the two implementations.
    assert array_equal(gpu_model.core_sample_indices_,
                       cpu_model.core_sample_indices_)

    # Label agreement is checked by the project helper.
    assert_dbscan_equal(cpu_labels, gpu_labels, points,
                        gpu_model.core_sample_indices_, eps)
def test_core_point_prop1():
    """Compare cuML and scikit-learn DBSCAN on a star with a trailing chain.

    Both estimators use ``eps=1.1`` and ``min_samples=4``. Core sample
    indices and labels must agree.
    """
    eps = 1.1
    min_samples = 4

    # The input looks like a latin cross or a star with a chain:
    #   .
    # . . . . .
    #   .
    # There is 1 core-point (the intersection of the bars) and the two
    # points to the very right are not reachable from it, so there should
    # be one cluster (the plus/star on the left) and two noise points.
    coords = [[0, 0], [1, 0], [1, 1], [1, -1],
              [2, 0], [3, 0], [4, 0]]
    points = np.array(coords, dtype=np.float32)

    gpu_model = cuDBSCAN(eps=eps, min_samples=min_samples)
    gpu_labels = gpu_model.fit_predict(points)

    cpu_model = skDBSCAN(eps=eps, min_samples=min_samples)
    cpu_labels = cpu_model.fit_predict(points)

    # Core points must be identical between the two implementations.
    assert array_equal(gpu_model.core_sample_indices_,
                       cpu_model.core_sample_indices_)

    # Label agreement is checked by the project helper.
    assert_dbscan_equal(cpu_labels, gpu_labels, points,
                        gpu_model.core_sample_indices_, eps)
def test_dbscan(datatype, use_handle, nrows, ncols,
                max_mbytes_per_batch, out_dtype):
    """Compare cuML DBSCAN with scikit-learn on tight blobs.

    For the 500000-row case on a GPU reporting less than 32 (units as
    defined by ``pytest.max_gpu_memory``), the row count is scaled down when
    ``pytest.adapt_stress_test`` is set; otherwise the test is skipped.
    cuML always runs. The scikit-learn comparison is only made when the
    (possibly scaled) ``nrows`` is below 500000. The dtype of the returned
    labels is checked against ``out_dtype`` for the int32 / int64 spellings
    handled below.
    """
    if nrows == 500000 and pytest.max_gpu_memory < 32:
        if pytest.adapt_stress_test:
            # Shrink the problem proportionally to the available memory.
            nrows = nrows * pytest.max_gpu_memory // 32
        else:
            pytest.skip("Insufficient GPU memory for this test. "
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    # NOTE(review): ``datatype`` is accepted but never applied to X here,
    # unlike the propagation test -- confirm whether a cast was intended.
    X, _ = make_blobs(n_samples=nrows, cluster_std=0.01,
                      n_features=ncols, random_state=0)

    # Only the handle is needed; the stream returned alongside it is unused.
    handle, _ = get_handle(use_handle)

    # Single source of truth for the hyper-parameters shared by both
    # implementations and by the label check.
    eps = 1
    min_samples = 2

    cuml_dbscan = cuDBSCAN(handle=handle, eps=eps, min_samples=min_samples,
                           max_mbytes_per_batch=max_mbytes_per_batch,
                           output_type='numpy')
    cu_labels = cuml_dbscan.fit_predict(X, out_dtype=out_dtype)

    if nrows < 500000:
        # The reference uses the same ``eps`` variable (previously a
        # duplicated literal) so the two models cannot drift apart.
        sk_dbscan = skDBSCAN(eps=eps, min_samples=min_samples,
                             algorithm="brute")
        sk_labels = sk_dbscan.fit_predict(X)

        # Check the core points are equal
        assert array_equal(cuml_dbscan.core_sample_indices_,
                           sk_dbscan.core_sample_indices_)

        # Check the labels are correct
        assert_dbscan_equal(sk_labels, cu_labels, X,
                            cuml_dbscan.core_sample_indices_, eps)

    # The requested output dtype must be honoured regardless of data size.
    if out_dtype == "int32" or out_dtype == np.int32:
        assert cu_labels.dtype == np.int32
    elif out_dtype == "int64" or out_dtype == np.int64:
        assert cu_labels.dtype == np.int64
def test_dbscan_sklearn_comparison(name, nrows, eps):
    """Compare cuML DBSCAN with scikit-learn on a named synthetic pattern.

    For the 500000-row ``'blobs'`` case on a GPU reporting less than 32
    (units as defined by ``pytest.max_gpu_memory``), the row count is scaled
    down when ``pytest.adapt_stress_test`` is set; otherwise the test is
    skipped. cuML always runs. The scikit-learn comparison is only made when
    the (possibly scaled) ``nrows`` is below 500000.
    """
    if nrows == 500000 and name == 'blobs' and pytest.max_gpu_memory < 32:
        if pytest.adapt_stress_test:
            # Shrink the problem proportionally to the available memory.
            nrows = nrows * pytest.max_gpu_memory // 32
        else:
            # The trailing space keeps the two sentences apart once the
            # adjacent literals are concatenated.
            pytest.skip("Insufficient GPU memory for this test. "
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    # Only the dataset part of the pattern is used; ``eps`` comes straight
    # from the test parameter, so no per-pattern settings are merged.
    pat = get_pattern(name, nrows)
    X, _ = pat[0]

    X = StandardScaler().fit_transform(X)

    min_samples = 5

    cuml_dbscan = cuDBSCAN(eps=eps, min_samples=min_samples,
                           output_type='numpy')
    cu_labels = cuml_dbscan.fit_predict(X)

    if nrows < 500000:
        sk_dbscan = skDBSCAN(eps=eps, min_samples=min_samples)
        sk_labels = sk_dbscan.fit_predict(X)

        # Check the core points are equal
        assert array_equal(cuml_dbscan.core_sample_indices_,
                           sk_dbscan.core_sample_indices_)

        # Check the labels are correct
        assert_dbscan_equal(sk_labels, cu_labels, X,
                            cuml_dbscan.core_sample_indices_, eps)
def test_dbscan_propagation(datatype, out_dtype, client):
    """Compare Dask cuML DBSCAN with scikit-learn on one wide blob.

    A single 5000-sample blob with ``cluster_std=8.0`` is clustered with
    ``eps=0.5`` and ``min_samples=5`` by both implementations. Core sample
    indices and labels must agree.

    ``client`` is not referenced in the body; presumably it is a fixture
    that provides the Dask client -- confirm against the test configuration.
    """
    from cuml.dask.cluster.dbscan import DBSCAN as cuDBSCAN

    points, _ = make_blobs(5000, centers=1, cluster_std=8.0,
                           center_box=(-100.0, 100.0), random_state=8)
    points = points.astype(datatype)

    eps = 0.5
    min_samples = 5

    gpu_model = cuDBSCAN(eps=eps, min_samples=min_samples,
                         output_type='numpy')
    gpu_labels = gpu_model.fit_predict(points, out_dtype=out_dtype)

    cpu_model = skDBSCAN(eps=eps, min_samples=min_samples)
    cpu_labels = cpu_model.fit_predict(points)

    # Core points must be identical between the two implementations.
    assert array_equal(gpu_model.core_sample_indices_,
                       cpu_model.core_sample_indices_)

    # Label agreement is checked by the project helper.
    assert_dbscan_equal(cpu_labels, gpu_labels, points,
                        gpu_model.core_sample_indices_, eps)