示例#1
0
def test_dbscan():
    """Fit cuML DBSCAN on a three-row cuDF frame and check the printed labels.

    The model is built with ``eps=1.0`` and ``min_samples=1``; the test
    expects the rows to be labelled 0, 1 and 2 with an ``int32`` dtype, and
    verifies this by comparing the string form of ``labels_``.
    """
    import cudf
    from cuml import DBSCAN

    # The assertion is on ``str(labels_)``, so the expected text carries the
    # index column and the dtype footer as well as the label values.
    expected_repr = """0    0
1    1
2    2
dtype: int32"""

    # (column name, values) pairs, inserted one at a time into an empty
    # GPU DataFrame.
    columns = (
        ('0', [1.0, 2.0, 5.0]),
        ('1', [4.0, 2.0, 1.0]),
        ('2', [4.0, 2.0, 1.0]),
    )
    frame = cudf.DataFrame()
    for name, values in columns:
        frame[name] = values

    # Set up the estimator and fit it on the frame
    model = DBSCAN(eps=1.0, min_samples=1)
    model.fit(frame)

    assert str(model.labels_) == expected_repr
示例#2
0
def cluster(gdf, eps, minSamples):
    """Run cuML DBSCAN on ``gdf`` and record summary metrics in MLflow.

    Args:
        gdf: Input data, passed unchanged to ``DBSCAN.fit``.
        eps: Forwarded to ``DBSCAN`` as ``eps``.
        minSamples: Forwarded to ``DBSCAN`` as ``min_samples``.

    Two MLflow metrics are logged:

    * ``labels`` - the maximum value found in ``labels_``.
    * ``core_samples`` - the number of entries in ``core_sample_indices_``.

    The fitted estimator is not returned.
    """
    # CPU alternative, kept for reference:
    #   clusterer = hdbscan.HDBSCAN(min_cluster_size=15, metric='euclidean',
    #                               cluster_selection_method='eom')
    #   clusterer.fit(data)

    # The message previously read "cuml.SCANDB"; it now matches the
    # "cuml.DBSCAN" prefix used by the debug message below.
    logging.info("cuml.DBSCAN Clustering - eps=%0.3f, samples=%d", eps, minSamples)

    # GPU clustering
    result = DBSCAN(eps=eps,
                    min_samples=minSamples,
                    verbose=False,
                    calc_core_sample_indices=True,
                    output_type='cudf').fit(gdf)

    metric1 = result.labels_.max()
    metric2 = len(result.core_sample_indices_)

    mlflow.log_metric('labels', metric1)
    mlflow.log_metric('core_samples', metric2)

    # NOTE(review): ``gpu_mem`` and ``ndims`` are not defined in this
    # function; they are presumably module-level names - confirm they exist
    # wherever this function is used.
    gpu_mem(0)
    logging.debug("cuml.DBSCAN - dims: %d, max distance: %0.3f, min samples: %d, labels: %d, core_samples: %d",
                  ndims, eps, minSamples, metric1, metric2)
示例#3
0
                    type=float,
                    default=10.,
                    help='Radius of neighborhood of a point')
parser.add_argument(
    '-m', '--min-samples',
    type=int,
    default=5,
    help=('The minimum number of samples required in a '
          'neighborhood to consider a point a core point'),
)
params = bench.parse_args(parser)

# Load the generated data; only the first of the four returned values is used
X, _, _, _ = bench.load_data(params)

# Configure the clustering estimator from the parsed command-line options
dbscan = DBSCAN(min_samples=params.min_samples, eps=params.eps)

# Time the fit, then read the labels off the fitted estimator
time, _ = bench.measure_function_time(dbscan.fit, X, params=params)
labels = dbscan.labels_

# Convert data and labels through the bench helper before scoring
labels_host = bench.convert_to_numpy(labels)
X_host = bench.convert_to_numpy(X)

acc = davies_bouldin_score(X_host, labels_host)

# Count the distinct labels, leaving out -1 when it occurs
# (presumably DBSCAN's noise label - confirm).
n_distinct = len(set(labels_host))
if -1 in labels_host:
    n_distinct -= 1
params.n_clusters = n_distinct

bench.print_output(library='cuml',
                   algorithm='dbscan',
                   stages=['training'],
                   params=params,