def test_core_point_prop3():
    """Compare cuML and scikit-learn DBSCAN labels on a two-star layout.

    The last sample is left out of the score computation.
    """
    # Layout as described by the original author -- a two-barred (orthodox)
    # cross, or two stars sharing a link:
    #   .   .
    # . . . . .
    #   .   .
    # The two core points are not reachable from each other, so two clusters
    # are expected. The link shared by the stars was believed to carry an
    # ambiguous label (it would depend on the order in which the core points
    # are processed), hence its exclusion from the comparison with sklearn.
    # NOTE(review): the coordinates below place six points on the middle row,
    # so this description may not match the data -- confirm.
    hyperparams = {'eps': 1.1, 'min_samples': 4}
    samples = np.array(
        [[0, 0], [1, 0], [1, 1], [1, -1], [3, 0],
         [4, 0], [4, 1], [4, -1], [5, 0], [2, 0]],
        dtype=np.float32)

    gpu_labels = cuDBSCAN(**hyperparams).fit_predict(samples)
    cpu_labels = skDBSCAN(**hyperparams).fit_predict(samples)

    # Score everything except the final sample.
    assert adjusted_rand_score(cpu_labels[:-1], gpu_labels[:-1]) == 1.0
def test_core_point_prop2():
    """Check cuML DBSCAN against scikit-learn on two adjacent stars."""
    # The samples form a long two-barred (orthodox) cross, i.e. two stars
    # placed side by side:
    #   .     .
    # . . . . . .
    #   .     .
    # Each star centre is a core point, but neither is reachable from the
    # other, so two plus-shaped clusters are expected.
    hyperparams = {'eps': 1.1, 'min_samples': 4}
    samples = np.array(
        [[0, 0], [1, 0], [1, 1], [1, -1], [2, 0],
         [3, 0], [4, 0], [4, 1], [4, -1], [5, 0]],
        dtype=np.float32)

    gpu_model = cuDBSCAN(**hyperparams)
    gpu_labels = gpu_model.fit_predict(samples)

    cpu_model = skDBSCAN(**hyperparams)
    cpu_labels = cpu_model.fit_predict(samples)

    # Both implementations must report the same core samples.
    assert array_equal(gpu_model.core_sample_indices_,
                       cpu_model.core_sample_indices_)

    # Label agreement is delegated to assert_dbscan_equal.
    assert_dbscan_equal(cpu_labels, gpu_labels, samples,
                        gpu_model.core_sample_indices_, hyperparams['eps'])
def test_dbscan_precomputed(datatype, nrows, max_mbytes_per_batch, out_dtype):
    """Run both DBSCAN implementations on a precomputed distance matrix.

    The pairwise distances of a 2-feature blob dataset are cast to
    ``datatype`` and fed to cuML and scikit-learn with
    ``metric='precomputed'``; core samples and labels are then compared.
    """
    radius = 1

    # Two features keep the distance matrix cheap to compute.
    points, _ = make_blobs(n_samples=nrows, cluster_std=0.01,
                           n_features=2, random_state=0)
    distances = pairwise_distances(points).astype(datatype)

    gpu_model = cuDBSCAN(eps=radius, min_samples=2, metric='precomputed',
                         max_mbytes_per_batch=max_mbytes_per_batch,
                         output_type='numpy')
    gpu_labels = gpu_model.fit_predict(distances, out_dtype=out_dtype)

    cpu_model = skDBSCAN(eps=radius, min_samples=2, metric='precomputed',
                         algorithm="brute")
    cpu_labels = cpu_model.fit_predict(distances)

    # Core samples have to match exactly.
    assert array_equal(gpu_model.core_sample_indices_,
                       cpu_model.core_sample_indices_)

    # The label check receives the raw points, not the distance matrix.
    assert_dbscan_equal(cpu_labels, gpu_labels, points,
                        gpu_model.core_sample_indices_, radius)
def test_dbscan(datatype, input_type, use_handle, nrows, ncols,
                max_mbytes_per_batch, out_dtype):
    """Fit cuML DBSCAN on tight blobs from an array or a cudf DataFrame.

    Below 500000 rows the labels are scored against scikit-learn with the
    adjusted Rand index; the label dtype is checked against ``out_dtype``.
    """
    X, _ = make_blobs(n_samples=nrows, cluster_std=0.01,
                      n_features=ncols, random_state=0)

    handle, _stream = get_handle(use_handle)
    gpu_model = cuDBSCAN(handle=handle, eps=1, min_samples=2,
                         max_mbytes_per_batch=max_mbytes_per_batch)

    if input_type == 'dataframe':
        # X is rebound to the pandas frame, which is also what scikit-learn
        # receives further down.
        columns = {'fea%d' % i: X[0:, i] for i in range(X.shape[1])}
        X = pd.DataFrame(columns)
        gpu_input = cudf.DataFrame.from_pandas(X)
        cu_labels = gpu_model.fit_predict(gpu_input, out_dtype=out_dtype)
    else:
        cu_labels = gpu_model.fit_predict(X, out_dtype=out_dtype)

    if nrows < 500000:
        cpu_model = skDBSCAN(eps=1, min_samples=2, algorithm="brute")
        sk_labels = cpu_model.fit_predict(X)
        assert adjusted_rand_score(cu_labels, sk_labels) == 1

    # The requested dtype may be given as a string or as a numpy type.
    if out_dtype in ("int32", np.int32):
        assert cu_labels.dtype == np.int32
    elif out_dtype in ("int64", np.int64):
        assert cu_labels.dtype == np.int64
def test_dbscan_sklearn_comparison(name, nrows):
    """Score cuML DBSCAN against scikit-learn on a named dataset pattern.

    The dataset and its parameter overrides come from ``get_pattern``; the
    scikit-learn comparison only runs below 500000 rows.
    """
    settings = {'quantile': .3,
                'eps': .5,
                'damping': .9,
                'preference': -200,
                'n_neighbors': 10,
                'n_clusters': 2}

    pattern = get_pattern(name, nrows)
    # Pattern-specific values take precedence over the base settings.
    settings.update(pattern[1])

    features, _ = pattern[0]
    features = StandardScaler().fit_transform(features)

    gpu_model = cuDBSCAN(eps=settings['eps'], min_samples=5)
    # The fit_predict helper is unpacked as (labels, number of clusters).
    cu_y_pred, _cu_count = fit_predict(gpu_model, 'cuml_DBSCAN', features)

    if nrows < 500000:
        cpu_model = skDBSCAN(eps=settings['eps'], min_samples=5)
        sk_y_pred, _sk_count = fit_predict(cpu_model, 'sk_DBSCAN', features)
        assert adjusted_rand_score(sk_y_pred, cu_y_pred) == 1.0
def test_dbscan(datatype, use_handle, nrows, ncols,
                max_mbytes_per_batch, out_dtype):
    """Fit cuML DBSCAN on tight blobs and score it against scikit-learn.

    The scikit-learn comparison (adjusted Rand index) only runs below
    500000 rows; the label dtype is checked against ``out_dtype``.
    """
    points, _ = make_blobs(n_samples=nrows, cluster_std=0.01,
                           n_features=ncols, random_state=0)

    handle, _stream = get_handle(use_handle)
    gpu_model = cuDBSCAN(handle=handle, eps=1, min_samples=2,
                         max_mbytes_per_batch=max_mbytes_per_batch,
                         output_type='numpy')
    gpu_labels = gpu_model.fit_predict(points, out_dtype=out_dtype)

    if nrows < 500000:
        cpu_model = skDBSCAN(eps=1, min_samples=2, algorithm="brute")
        cpu_labels = cpu_model.fit_predict(points)
        assert adjusted_rand_score(gpu_labels, cpu_labels) == 1

    # The requested dtype may be given as a string or as a numpy type.
    if out_dtype in ("int32", np.int32):
        assert gpu_labels.dtype == np.int32
    elif out_dtype in ("int64", np.int64):
        assert gpu_labels.dtype == np.int64
def test_dbscan_predict(datatype, input_type, use_handle, max_bytes_per_batch):
    """Compare cuML and scikit-learn labels on a six-point toy dataset.

    The cuML input is either the numpy array or an equivalent cudf
    DataFrame, depending on ``input_type``.
    """
    # max_bytes_per_batch sizes: 10=6 batches, 200=2 batches, 2e6=1 batch
    X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]],
                 dtype=datatype)

    expected = skDBSCAN(eps=3, min_samples=2).fit_predict(X)

    handle, _stream = get_handle(use_handle)
    gpu_model = cuDBSCAN(handle=handle, eps=3, min_samples=2,
                         max_bytes_per_batch=max_bytes_per_batch)

    if input_type == 'dataframe':
        # Same six points, stored column-wise.
        column_values = (('0', [1, 2, 2, 8, 8, 25]),
                         ('1', [2, 2, 3, 7, 8, 80]))
        gdf = cudf.DataFrame()
        for column, values in column_values:
            gdf[column] = np.asarray(values, dtype=datatype)
        actual = gpu_model.fit_predict(gdf)
    else:
        actual = gpu_model.fit_predict(X)

    gpu_model.handle.sync()

    # Index-based access: the cuML labels are not necessarily a numpy array.
    for idx in range(len(X)):
        assert actual[idx] == expected[idx]
def test_core_point_prop3():
    """Check core samples and labels of cuML DBSCAN on a two-star layout."""
    # Layout as described by the original author -- a two-barred (orthodox)
    # cross, or two stars sharing a link:
    #   .   .
    # . . . . .
    #   .   .
    # The two core points are not reachable from each other, so two clusters
    # are expected. The link shared by the stars was believed to carry an
    # ambiguous label (it would depend on the order in which the core points
    # are processed), and was therefore meant to be excluded from the
    # comparison with sklearn.
    # TODO: the above text does not correspond to the actual test!
    hyperparams = {'eps': 1.1, 'min_samples': 4}
    samples = np.array(
        [[0, 0], [1, 0], [1, 1], [1, -1], [3, 0],
         [4, 0], [4, 1], [4, -1], [5, 0], [2, 0]],
        dtype=np.float32)

    gpu_model = cuDBSCAN(**hyperparams)
    gpu_labels = gpu_model.fit_predict(samples)

    cpu_model = skDBSCAN(**hyperparams)
    cpu_labels = cpu_model.fit_predict(samples)

    # Both implementations must report the same core samples.
    assert array_equal(gpu_model.core_sample_indices_,
                       cpu_model.core_sample_indices_)

    # Label agreement is delegated to assert_dbscan_equal.
    assert_dbscan_equal(cpu_labels, gpu_labels, samples,
                        gpu_model.core_sample_indices_, hyperparams['eps'])
def test_core_point_prop1():
    """Check cuML DBSCAN against scikit-learn on a star with a chain."""
    # The samples form a latin cross, i.e. a star followed by a chain:
    #   .
    # . . . . .
    #   .
    # Only the intersection of the bars is a core point, and the two
    # right-most points cannot be reached from it. The expected outcome is
    # a single cluster (the plus/star on the left) plus two noise points.
    hyperparams = {'eps': 1.1, 'min_samples': 4}
    samples = np.array(
        [[0, 0], [1, 0], [1, 1], [1, -1], [2, 0], [3, 0], [4, 0]],
        dtype=np.float32)

    gpu_model = cuDBSCAN(**hyperparams)
    gpu_labels = gpu_model.fit_predict(samples)

    cpu_model = skDBSCAN(**hyperparams)
    cpu_labels = cpu_model.fit_predict(samples)

    # Both implementations must report the same core samples.
    assert array_equal(gpu_model.core_sample_indices_,
                       cpu_model.core_sample_indices_)

    # Label agreement is delegated to assert_dbscan_equal.
    assert_dbscan_equal(cpu_labels, gpu_labels, samples,
                        gpu_model.core_sample_indices_, hyperparams['eps'])
def test_dbscan_propagation(datatype, use_handle, out_dtype, n_samples):
    """Compare cuML and scikit-learn DBSCAN on one wide blob.

    A single blob with ``cluster_std=8.0`` is clustered with ``eps=0.5``
    and ``min_samples=5`` by both libraries; core samples and labels are
    then checked.
    """
    radius = 0.5

    points, _ = make_blobs(n_samples, centers=1, cluster_std=8.0,
                           center_box=(-100.0, 100.0), random_state=8)
    points = points.astype(datatype)

    handle, _stream = get_handle(use_handle)
    gpu_model = cuDBSCAN(handle=handle, eps=radius, min_samples=5,
                         output_type='numpy')
    gpu_labels = gpu_model.fit_predict(points, out_dtype=out_dtype)

    cpu_model = skDBSCAN(eps=radius, min_samples=5)
    cpu_labels = cpu_model.fit_predict(points)

    # Both implementations must report the same core samples.
    assert array_equal(gpu_model.core_sample_indices_,
                       cpu_model.core_sample_indices_)

    # Label agreement is delegated to assert_dbscan_equal.
    assert_dbscan_equal(cpu_labels, gpu_labels, points,
                        gpu_model.core_sample_indices_, radius)
def test_dbscan_sklearn_comparison(name, nrows, eps):
    """Compare cuML DBSCAN with scikit-learn on a named dataset pattern.

    Parameters
    ----------
    name
        Pattern name passed to ``get_pattern``.
    nrows
        Number of samples to generate. The scikit-learn comparison only
        runs when this is below 500000.
    eps
        Base neighbourhood radius; a pattern-specific ``eps`` returned by
        ``get_pattern`` overrides it.
    """
    default_base = {
        'quantile': .2,
        'eps': eps,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 2
    }

    n_samples = nrows
    pat = get_pattern(name, n_samples)

    # Pattern-specific values take precedence over the base settings.
    params = default_base.copy()
    params.update(pat[1])

    X, y = pat[0]
    X = StandardScaler().fit_transform(X)

    cuml_dbscan = cuDBSCAN(eps=params['eps'], min_samples=5,
                           output_type='numpy')
    cu_y_pred = cuml_dbscan.fit_predict(X)

    if nrows < 500000:
        dbscan = skDBSCAN(eps=params['eps'], min_samples=5)
        sk_y_pred = dbscan.fit_predict(X)
        score = adjusted_rand_score(sk_y_pred, cu_y_pred)
        assert score == 1.0

        # Check the core points are equal. The comparison result has to be
        # asserted; calling array_equal on its own can never fail the test.
        assert array_equal(cuml_dbscan.core_sample_indices_,
                           dbscan.core_sample_indices_)
def test_dbscan_default(name):
    """Score a default-constructed cuML DBSCAN against scikit-learn.

    cuML only receives ``output_type='numpy'``; scikit-learn is run with
    ``min_samples=5`` and the ``eps`` resolved from the pattern settings.
    """
    settings = {
        'quantile': .3,
        'eps': .5,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 2
    }

    pattern = get_pattern(name, 500)
    # Pattern-specific values take precedence over the base settings.
    settings.update(pattern[1])

    features, _ = pattern[0]
    features = StandardScaler().fit_transform(features)

    gpu_labels = cuDBSCAN(output_type='numpy').fit_predict(features)

    cpu_model = skDBSCAN(eps=settings['eps'], min_samples=5)
    cpu_labels = cpu_model.fit_predict(features)

    assert adjusted_rand_score(cpu_labels, gpu_labels) == 1.0
def test_dbscan_no_calc_core_point_indices():
    """Check that core sample indices stay unset when not requested.

    cuML DBSCAN is fitted with ``calc_core_sample_indices=False`` on the
    standardized "noisy_moons" pattern. Its labels are scored against
    scikit-learn and ``core_sample_indices_`` is expected to be ``None``.
    """
    params = {'eps': 1.1, 'min_samples': 4}
    n_samples = 1000
    pat = get_pattern("noisy_moons", n_samples)

    X, y = pat[0]
    X = StandardScaler().fit_transform(X)

    # Set calc_core_sample_indices=False. Both estimators take eps and
    # min_samples from `params`, so the same configuration is compared on
    # each side.
    cuml_dbscan = cuDBSCAN(eps=params['eps'],
                           min_samples=params['min_samples'],
                           output_type='numpy',
                           calc_core_sample_indices=False)
    cu_y_pred = cuml_dbscan.fit_predict(X)

    dbscan = skDBSCAN(**params)
    sk_y_pred = dbscan.fit_predict(X)

    # NOTE(review): the last sample is excluded from the score; no reason is
    # visible in this test -- confirm whether the slice is still needed.
    score = adjusted_rand_score(sk_y_pred[:-1], cu_y_pred[:-1])
    assert score == 1.0

    # Make sure we are None
    assert cuml_dbscan.core_sample_indices_ is None
def test_dbscan(datatype, use_handle, nrows, ncols,
                max_mbytes_per_batch, out_dtype):
    """Fit cuML DBSCAN on tight blobs and validate it against scikit-learn.

    Below 500000 rows the core samples and labels are compared with
    scikit-learn; the label dtype is checked against ``out_dtype``.
    """
    radius = 1

    points, _ = make_blobs(n_samples=nrows, cluster_std=0.01,
                           n_features=ncols, random_state=0)

    handle, _stream = get_handle(use_handle)
    gpu_model = cuDBSCAN(handle=handle, eps=radius, min_samples=2,
                         max_mbytes_per_batch=max_mbytes_per_batch,
                         output_type='numpy')
    gpu_labels = gpu_model.fit_predict(points, out_dtype=out_dtype)

    if nrows < 500000:
        cpu_model = skDBSCAN(eps=radius, min_samples=2, algorithm="brute")
        cpu_labels = cpu_model.fit_predict(points)

        # Both implementations must report the same core samples.
        assert array_equal(gpu_model.core_sample_indices_,
                           cpu_model.core_sample_indices_)

        # Label agreement is delegated to assert_dbscan_equal.
        assert_dbscan_equal(cpu_labels, gpu_labels, points,
                            gpu_model.core_sample_indices_, radius)

    # The requested dtype may be given as a string or as a numpy type.
    if out_dtype in ("int32", np.int32):
        assert gpu_labels.dtype == np.int32
    elif out_dtype in ("int64", np.int64):
        assert gpu_labels.dtype == np.int64
def test_dbscan_default(name):
    """Validate a default-constructed cuML DBSCAN against scikit-learn.

    cuML only receives ``output_type='numpy'``; scikit-learn is run with
    ``min_samples=5`` and the ``eps`` resolved from the pattern settings.
    Core samples and labels are both compared.
    """
    settings = {
        'quantile': .3,
        'eps': .5,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 2
    }

    pattern = get_pattern(name, 500)
    # Pattern-specific values take precedence over the base settings.
    settings.update(pattern[1])

    features, _ = pattern[0]
    features = StandardScaler().fit_transform(features)

    gpu_model = cuDBSCAN(output_type='numpy')
    gpu_labels = gpu_model.fit_predict(features)

    cpu_model = skDBSCAN(eps=settings['eps'], min_samples=5)
    cpu_labels = cpu_model.fit_predict(features)

    # Both implementations must report the same core samples.
    assert array_equal(gpu_model.core_sample_indices_,
                       cpu_model.core_sample_indices_)

    # Label agreement is delegated to assert_dbscan_equal.
    assert_dbscan_equal(cpu_labels, gpu_labels, features,
                        gpu_model.core_sample_indices_, settings['eps'])
def test_dbscan_predict_multiple_streams():
    """Fit two cuML DBSCAN models, each created with its own handle.

    Both models are fitted on the same cudf DataFrame and their labels are
    compared element-wise with scikit-learn's labels for the same points.
    """
    datatype = np.float32

    gdf = cudf.DataFrame()
    gdf['0'] = np.asarray([1, 2, 2, 8, 8, 25], dtype=datatype)
    gdf['1'] = np.asarray([2, 2, 3, 7, 8, 80], dtype=datatype)

    # Row-wise copy of the same points for scikit-learn.
    X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]],
                 dtype=datatype)
    expected = skDBSCAN(eps=3, min_samples=2).fit_predict(X)

    first_handle, _first_stream = get_handle(True)
    second_handle, _second_stream = get_handle(True)

    first_model = cuDBSCAN(handle=first_handle, eps=3, min_samples=2)
    second_model = cuDBSCAN(handle=second_handle, eps=3, min_samples=2)

    first_labels = first_model.fit_predict(gdf)
    second_labels = second_model.fit_predict(gdf)

    first_model.handle.sync()
    second_model.handle.sync()

    # Index-based access: the cuML labels are not necessarily numpy arrays.
    for idx in range(len(X)):
        assert first_labels[idx] == expected[idx]
        assert second_labels[idx] == expected[idx]
def test_dbscan_predict(datatype):
    """Compare cuML (cudf input) and scikit-learn labels on six points."""
    column_values = (('0', [1, 2, 2, 8, 8, 25]),
                     ('1', [2, 2, 3, 7, 8, 80]))
    gdf = cudf.DataFrame()
    for column, values in column_values:
        gdf[column] = np.asarray(values, dtype=datatype)

    # Row-wise copy of the same points for scikit-learn.
    X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]],
                 dtype=datatype)

    print("Calling fit_predict")
    gpu_labels = cuDBSCAN(eps=3, min_samples=2).fit_predict(gdf)
    cpu_labels = skDBSCAN(eps=3, min_samples=2).fit_predict(X)

    n_points = X.shape[0]
    print(n_points)

    # Index-based access: the cuML labels are not necessarily a numpy array.
    for idx in range(n_points):
        assert gpu_labels[idx] == cpu_labels[idx]
def test_dbscan(datatype, use_handle, nrows, ncols,
                max_mbytes_per_batch, out_dtype):
    """Fit cuML DBSCAN on tight blobs and validate it against scikit-learn.

    The 500000-row case is scaled down or skipped when
    ``pytest.max_gpu_memory`` is below 32. Below 500000 rows the core
    samples and labels are compared with scikit-learn; the label dtype is
    checked against ``out_dtype``.
    """
    if nrows == 500000 and pytest.max_gpu_memory < 32:
        if not pytest.adapt_stress_test:
            pytest.skip("Insufficient GPU memory for this test. "
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")
        # Shrink the problem in proportion to pytest.max_gpu_memory.
        nrows = nrows * pytest.max_gpu_memory // 32

    radius = 1

    points, _ = make_blobs(n_samples=nrows, cluster_std=0.01,
                           n_features=ncols, random_state=0)

    handle, _stream = get_handle(use_handle)
    gpu_model = cuDBSCAN(handle=handle, eps=radius, min_samples=2,
                         max_mbytes_per_batch=max_mbytes_per_batch,
                         output_type='numpy')
    gpu_labels = gpu_model.fit_predict(points, out_dtype=out_dtype)

    if nrows < 500000:
        cpu_model = skDBSCAN(eps=radius, min_samples=2, algorithm="brute")
        cpu_labels = cpu_model.fit_predict(points)

        # Both implementations must report the same core samples.
        assert array_equal(gpu_model.core_sample_indices_,
                           cpu_model.core_sample_indices_)

        # Label agreement is delegated to assert_dbscan_equal.
        assert_dbscan_equal(cpu_labels, gpu_labels, points,
                            gpu_model.core_sample_indices_, radius)

    # The requested dtype may be given as a string or as a numpy type.
    if out_dtype in ("int32", np.int32):
        assert gpu_labels.dtype == np.int32
    elif out_dtype in ("int64", np.int64):
        assert gpu_labels.dtype == np.int64
def test_dbscan_predict(datatype, input_type):
    """Compare cuML and scikit-learn labels on a six-point toy dataset.

    The cuML input is either the numpy array or an equivalent cudf
    DataFrame, depending on ``input_type``.
    """
    X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]],
                 dtype=datatype)

    expected = skDBSCAN(eps=3, min_samples=2).fit_predict(X)

    gpu_model = cuDBSCAN(eps=3, min_samples=2)

    if input_type == 'dataframe':
        # Same six points, stored column-wise.
        column_values = (('0', [1, 2, 2, 8, 8, 25]),
                         ('1', [2, 2, 3, 7, 8, 80]))
        gdf = cudf.DataFrame()
        for column, values in column_values:
            gdf[column] = np.asarray(values, dtype=datatype)
        actual = gpu_model.fit_predict(gdf)
    else:
        actual = gpu_model.fit_predict(X)

    # Index-based access: the cuML labels are not necessarily a numpy array.
    for idx in range(len(X)):
        assert actual[idx] == expected[idx]
def test_dbscan_propagation(datatype, use_handle, out_dtype):
    """Score cuML DBSCAN against scikit-learn on one wide 5000-point blob.

    Both libraries use ``eps=0.5`` and ``min_samples=5``; the labelings
    must reach an adjusted Rand index of 1.0.
    """
    points, _ = make_blobs(5000, centers=1, cluster_std=8.0,
                           center_box=(-100.0, 100.0), random_state=8)
    points = points.astype(datatype)

    handle, _stream = get_handle(use_handle)
    gpu_model = cuDBSCAN(handle=handle, eps=0.5, min_samples=5,
                         output_type='numpy')
    gpu_labels = gpu_model.fit_predict(points, out_dtype=out_dtype)

    cpu_labels = skDBSCAN(eps=0.5, min_samples=5).fit_predict(points)

    assert adjusted_rand_score(cpu_labels, gpu_labels) == 1.0
def test_core_point_prop2():
    """Score cuML DBSCAN against scikit-learn on two adjacent stars."""
    # The samples form a long two-barred (orthodox) cross, i.e. two stars
    # placed side by side:
    #   .     .
    # . . . . . .
    #   .     .
    # Each star centre is a core point, but neither is reachable from the
    # other, so two plus-shaped clusters are expected.
    hyperparams = {'eps': 1.1, 'min_samples': 4}
    samples = np.array(
        [[0, 0], [1, 0], [1, 1], [1, -1], [2, 0],
         [3, 0], [4, 0], [4, 1], [4, -1], [5, 0]],
        dtype=np.float32)

    gpu_labels = cuDBSCAN(**hyperparams).fit_predict(samples)
    cpu_labels = skDBSCAN(**hyperparams).fit_predict(samples)

    assert adjusted_rand_score(cpu_labels, gpu_labels) == 1.0
def test_core_point_prop1():
    """Score cuML DBSCAN against scikit-learn on a star with a chain."""
    # The samples form a latin cross, i.e. a star followed by a chain:
    #   .
    # . . . . .
    #   .
    # Only the intersection of the bars is a core point, and the two
    # right-most points cannot be reached from it. The expected outcome is
    # a single cluster (the plus/star on the left) plus two noise points.
    hyperparams = {'eps': 1.1, 'min_samples': 4}
    samples = np.array(
        [[0, 0], [1, 0], [1, 1], [1, -1], [2, 0], [3, 0], [4, 0]],
        dtype=np.float32)

    gpu_labels = cuDBSCAN(**hyperparams).fit_predict(samples)
    cpu_labels = skDBSCAN(**hyperparams).fit_predict(samples)

    assert adjusted_rand_score(cpu_labels, gpu_labels) == 1.0
def test_dbscan_sklearn_comparison(name, nrows, eps):
    """Compare cuML DBSCAN with scikit-learn on a named dataset pattern.

    Parameters
    ----------
    name
        Pattern name passed to ``get_pattern``.
    nrows
        Number of samples to generate. The 500000-row 'blobs' case is
        scaled down or skipped when ``pytest.max_gpu_memory`` is below 32.
        The scikit-learn comparison only runs below 500000 rows.
    eps
        Neighbourhood radius given to both estimators.
    """
    if nrows == 500000 and name == 'blobs' and pytest.max_gpu_memory < 32:
        if pytest.adapt_stress_test:
            # Shrink the problem in proportion to pytest.max_gpu_memory.
            nrows = nrows * pytest.max_gpu_memory // 32
        else:
            # The first fragment ends with a space so the two sentences do
            # not run together once the literals are concatenated.
            pytest.skip("Insufficient GPU memory for this test. "
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    # Only the data part of the pattern is used here; both estimators take
    # `eps` directly from the test argument.
    pat = get_pattern(name, nrows)
    X, _ = pat[0]
    X = StandardScaler().fit_transform(X)

    cuml_dbscan = cuDBSCAN(eps=eps, min_samples=5, output_type='numpy')
    cu_labels = cuml_dbscan.fit_predict(X)

    if nrows < 500000:
        sk_dbscan = skDBSCAN(eps=eps, min_samples=5)
        sk_labels = sk_dbscan.fit_predict(X)

        # Check the core points are equal
        assert array_equal(cuml_dbscan.core_sample_indices_,
                           sk_dbscan.core_sample_indices_)

        # Check the labels are correct
        assert_dbscan_equal(sk_labels, cu_labels, X,
                            cuml_dbscan.core_sample_indices_, eps)
def test_dbscan_sklearn_comparison(name, use_handle):
    """Compare cluster counts and assignments of cuML and scikit-learn.

    Both estimators are run through the ``fit_predict`` helper on a
    standardized 1500-sample pattern.
    """
    # Skipping datasets of known discrepancies in PR83 while they are corrected
    settings = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3
    }

    pattern = get_pattern(name, 1500)
    # Pattern-specific values take precedence over the base settings.
    settings.update(pattern[1])

    cpu_model = skDBSCAN(eps=settings['eps'], min_samples=5)

    handle, _stream = get_handle(use_handle)
    gpu_model = cuDBSCAN(handle=handle, eps=settings['eps'], min_samples=5)

    features, _ = pattern[0]
    features = StandardScaler().fit_transform(features)

    # The fit_predict helper is unpacked as (labels, number of clusters).
    sk_y_pred, sk_n_clusters = fit_predict(cpu_model, 'sk_DBSCAN', features)
    cu_y_pred, cu_n_clusters = fit_predict(gpu_model, 'cuml_DBSCAN', features)

    gpu_model.handle.sync()

    assert sk_n_clusters == cu_n_clusters

    clusters_equal(sk_y_pred, cu_y_pred, sk_n_clusters)
def test_dbscan(datatype, input_type, use_handle, nrows, ncols,
                max_bytes_per_batch):
    """Compare cuML and scikit-learn DBSCAN labels element by element.

    cuML is fed either the generated array or a cudf DataFrame built from
    it; the scikit-learn comparison only runs below 500000 rows.
    """
    X, _ = make_blobs(n_samples=nrows, n_features=ncols, random_state=0)

    handle, _stream = get_handle(use_handle)
    gpu_model = cuDBSCAN(handle=handle, eps=0.5, min_samples=2,
                         max_bytes_per_batch=max_bytes_per_batch)

    if input_type == 'dataframe':
        # X is rebound to the pandas frame, which is also what scikit-learn
        # receives further down.
        columns = {'fea%d' % i: X[0:, i] for i in range(X.shape[1])}
        X = pd.DataFrame(columns)
        cu_labels = gpu_model.fit_predict(cudf.DataFrame.from_pandas(X))
    else:
        cu_labels = gpu_model.fit_predict(X)

    if nrows < 500000:
        sk_labels = skDBSCAN(eps=0.5, min_samples=2).fit_predict(X)

        # Index-based access: the cuML labels are not necessarily a numpy
        # array.
        for idx in range(len(X)):
            assert cu_labels[idx] == sk_labels[idx]
def test_dbscan_out_dtype_fails_invalid_input():
    """Call ``fit_predict`` with an unsupported ``out_dtype`` value."""
    # NOTE(review): the body asserts nothing; the expected failure is
    # presumably declared by a decorator outside this block -- confirm.
    samples, _ = make_blobs(n_samples=100)

    model = cuDBSCAN()
    model.fit_predict(samples, out_dtype="bad_input")
def test_dbscan_on_empty_array():
    """Fitting cuML DBSCAN on an empty array must raise ``ValueError``."""
    empty = np.array([])
    model = cuDBSCAN()

    assert_raises(ValueError, model.fit, empty)
def test_dbscan_out_dtype_fails_invalid_input():
    """Call ``fit_predict`` with an unsupported ``out_dtype`` value."""
    # NOTE(review): the body asserts nothing; the expected failure is
    # presumably declared by a decorator outside this block -- confirm.
    samples, _ = make_blobs(n_samples=500)

    model = cuDBSCAN(output_type='numpy')
    model.fit_predict(samples, out_dtype="bad_input")