예제 #1
0
def _load_dataset(dataset, n_rows):
    """Build an ``(X, y)`` pair of ``cp`` arrays for the requested dataset.

    ``"make_blobs"`` generates ``n_rows`` synthetic samples directly. Any
    other value loads a bundled dataset (``"digits"``, otherwise iris),
    repeats its rows until at least ``n_rows`` rows exist and adds Gaussian
    noise to the features.

    Parameters
    ----------
    dataset : str
        ``"make_blobs"``, ``"digits"`` or anything else (treated as iris).
    n_rows : int
        Requested number of rows. For the bundled datasets the result can
        hold more rows, because the repeat count is rounded up.

    Returns
    -------
    tuple
        ``(X, y)`` converted with ``cp.asarray``.
    """
    if dataset == "make_blobs":
        blobs_X, blobs_y = make_blobs(n_samples=n_rows,
                                      n_features=10,
                                      centers=200,
                                      cluster_std=0.8,
                                      random_state=42)
        return cp.asarray(blobs_X), cp.asarray(blobs_y)

    # Anything that is not "digits" falls back to iris.
    loader = load_digits if dataset == "digits" else load_iris
    host_X, host_y = loader(return_X_y=True)

    dev_X = cp.asarray(host_X)
    dev_y = cp.asarray(host_y)

    # Repeat the rows often enough to reach the requested row count.
    x_reps = math.ceil(n_rows / len(dev_X))
    y_reps = math.ceil(n_rows / len(dev_y))
    dev_X = dev_X.repeat(x_reps, axis=0)
    dev_y = dev_y.repeat(y_reps, axis=0)

    # Add some gaussian noise
    dev_X += cp.random.standard_normal(dev_X.shape, dtype=cp.float32)

    return dev_X, dev_y
예제 #2
0
def test_score(nrows, ncols, nclusters):
    """Check ``KMeans.score`` against a hand-computed value.

    The expected value is the negated sum, over all samples, of the squared
    euclidean distance between a sample and the center of its predicted
    cluster. It must lie within ``SCORE_EPS`` of the reported score.
    """
    X, _ = make_blobs(nrows,
                      ncols,
                      nclusters,
                      cluster_std=0.01,
                      random_state=10)

    model = cuml.KMeans(verbose=1,
                        init="k-means||",
                        n_clusters=nclusters,
                        random_state=10)
    model.fit(X)

    actual_score = model.score(X)
    assigned = model.predict(X)
    centers = cp.array(model.cluster_centers_.as_gpu_matrix())

    expected_score = 0
    for row, label in enumerate(assigned):
        delta = X[row] - centers[label]
        dist = np.sqrt(np.sum(delta**2))
        expected_score += dist**2

    # The model reports the negative of the accumulated squared distances.
    negated = -1*expected_score
    upper = actual_score + SCORE_EPS
    lower = actual_score - SCORE_EPS
    assert upper >= negated >= lower
예제 #3
0
def test_single_linkage_sklearn_compare(nrows, ncols, nclusters, k,
                                        connectivity):
    """Compare single-linkage ``AgglomerativeClustering`` with ``cluster``'s.

    Both estimators are fitted on the same blobs. The partitions must be
    identical up to label permutation, and the connected-component, leaf and
    cluster counts must agree.
    """
    X, _ = make_blobs(int(nrows),
                      ncols,
                      nclusters,
                      cluster_std=1.0,
                      shuffle=False,
                      random_state=42)

    # Settings shared by both implementations.
    shared = dict(n_clusters=nclusters,
                  affinity='euclidean',
                  linkage='single')

    cuml_agg = AgglomerativeClustering(n_neighbors=k,
                                       connectivity=connectivity,
                                       **shared)

    # NOTE(review): the fit is retried once after any exception; the reason
    # for the retry is not visible in this function -- confirm it is needed.
    try:
        cuml_agg.fit(X)
    except Exception:
        cuml_agg.fit(X)

    sk_agg = cluster.AgglomerativeClustering(**shared)
    sk_agg.fit(cp.asnumpy(X))

    # Cluster assignments should be exact, even though the actual
    # labels may differ
    assert adjusted_rand_score(cuml_agg.labels_, sk_agg.labels_) == 1.0
    assert cuml_agg.n_connected_components_ == sk_agg.n_connected_components_
    assert cuml_agg.n_leaves_ == sk_agg.n_leaves_
    assert cuml_agg.n_clusters_ == sk_agg.n_clusters_
예제 #4
0
def test_neighborhood_predictions(nrows, ncols, n_neighbors, n_clusters,
                                  datatype, algo):
    """Predict blob labels from ``cuKNN`` neighbor indices.

    The index is queried with its own training data. The labels that
    ``predict`` derives from the neighbor indices must equal the blob labels.
    The test is skipped when ``has_scipy()`` is false.
    """
    if not has_scipy():
        pytest.skip('Skipping test_neighborhood_predictions because '
                    'Scipy is missing')

    use_dataframe = datatype == "dataframe"

    X, y = make_blobs(n_samples=nrows,
                      centers=n_clusters,
                      n_features=ncols,
                      random_state=0)
    if use_dataframe:
        X = cudf.DataFrame(X)

    model = cuKNN(algorithm=algo)
    model.fit(X)
    neigh_ind = model.kneighbors(X,
                                 n_neighbors=n_neighbors,
                                 return_distance=False)

    # Drop the model before the comparison work below.
    del model
    gc.collect()

    # The output container is expected to mirror the input container.
    expected_type = cudf.DataFrame if use_dataframe else cp.ndarray
    assert isinstance(neigh_ind, expected_type)
    if use_dataframe:
        neigh_ind = neigh_ind.to_numpy()

    labels, _ = predict(neigh_ind, y, n_neighbors)

    assert array_equal(labels, y)
예제 #5
0
def test_weighted_kmeans(nrows, ncols, nclusters, max_weight, random_state):
    """Compare scores of weighted ``cuml.KMeans`` and ``cluster.KMeans`` fits.

    Both models are fitted on the same blobs with the same integer sample
    weights. The test passes when the two scores differ by at most
    ``1.5 * cluster_std``.
    """
    # Using fairly high variance between points in clusters
    cluster_std = 1.0

    # One integer weight per sample, drawn from [1, max_weight); ``high`` is
    # exclusive for ``randint``.
    np.random.seed(random_state)
    wt = np.random.randint(1, high=max_weight, size=nrows)

    X, _ = make_blobs(nrows,
                      ncols,
                      nclusters,
                      cluster_std=cluster_std,
                      shuffle=False,
                      random_state=0)

    cu_model = cuml.KMeans(init="k-means++",
                           n_clusters=nclusters,
                           n_init=10,
                           random_state=random_state,
                           output_type='numpy')
    cu_model.fit(X, sample_weight=wt)
    cu_score = cu_model.score(X)

    X_host = cp.asnumpy(X)
    sk_model = cluster.KMeans(random_state=random_state, n_clusters=nclusters)
    sk_model.fit(X_host, sample_weight=wt)
    sk_score = sk_model.score(X_host)

    tolerance = cluster_std * 1.5
    assert abs(cu_score - sk_score) <= tolerance
예제 #6
0
def test_neighborhood_predictions(nrows, ncols, n_neighbors, n_clusters,
                                  datatype, algo):
    """Predict blob labels from ``cuKNN`` neighbor indices.

    The index is queried with its own training data. The labels that
    ``predict`` derives from the neighbor indices must equal the blob labels.
    The ``"ivfpq"`` algorithm is marked as an expected failure, and the test
    is skipped when ``has_scipy()`` is false.
    """
    if algo == "ivfpq":
        pytest.xfail("""See Memory access error in IVFPQ :
                        https://github.com/rapidsai/cuml/issues/3318""")

    if not has_scipy():
        pytest.skip('Skipping test_neighborhood_predictions because ' +
                    'Scipy is missing')

    X, y = make_blobs(n_samples=nrows, centers=n_clusters,
                      n_features=ncols, random_state=0)

    if datatype == "dataframe":
        X = cudf.DataFrame(X)

    knn_cu = cuKNN(algorithm=algo)
    knn_cu.fit(X)
    neigh_ind = knn_cu.kneighbors(X, n_neighbors=n_neighbors,
                                  return_distance=False)
    # Drop the model before the comparison work below.
    del knn_cu
    gc.collect()

    if datatype == "dataframe":
        assert isinstance(neigh_ind, cudf.DataFrame)
        # NOTE(review): other tests in this file use ``to_numpy()`` for this
        # conversion -- confirm which one the targeted cudf version supports.
        neigh_ind = neigh_ind.as_gpu_matrix().copy_to_host()
    else:
        # Use the public alias; ``cp.core.core`` is a private module path.
        assert isinstance(neigh_ind, cp.ndarray)

    labels, probs = predict(neigh_ind, y, n_neighbors)

    assert array_equal(labels, y)
예제 #7
0
def test_score(nrows, ncols, nclusters):
    """Check ``KMeans.score`` against a hand-computed value.

    The expected value is the negated sum, over all samples, of the squared
    euclidean distance between a sample and the center of its predicted
    cluster. It must lie within ``SCORE_EPS`` of the reported score.
    """
    X, _ = make_blobs(int(nrows),
                      ncols,
                      nclusters,
                      cluster_std=0.01,
                      shuffle=False,
                      random_state=10)

    model = cuml.KMeans(init="k-means||",
                        n_clusters=nclusters,
                        random_state=10,
                        output_type='numpy')
    model.fit(X)

    actual_score = model.score(X)
    assigned = model.predict(X)
    centers = model.cluster_centers_

    expected_score = 0
    for row, label in enumerate(assigned):
        # Centers are converted with ``cp.array`` before being subtracted
        # from the sample row.
        center = cp.array(centers[label])
        dist = cp.sqrt(cp.sum((X[row] - center)**2))
        expected_score += dist**2

    negated = -1*expected_score
    upper = actual_score + SCORE_EPS
    lower = actual_score - SCORE_EPS
    assert upper >= negated >= lower
예제 #8
0
def test_partial_fit(nrows, ncols, n_components, density, batch_size_divider,
                     whiten):
    """Compare ``cuIPCA`` and ``skIPCA`` when both are fed batch by batch.

    Each model receives the same consecutive row slices through
    ``partial_fit``. The data is then transformed and inverse-transformed,
    and the two reconstructions must match within ``5e-5``.
    """
    X, _ = make_blobs(n_samples=nrows, n_features=ncols, random_state=10)

    sample_size = int(nrows / batch_size_divider)
    batch_starts = range(0, nrows, sample_size)

    cu_ipca = cuIPCA(n_components=n_components, whiten=whiten)
    for start in batch_starts:
        cu_ipca.partial_fit(X[start:start + sample_size].copy())

    cu_inv = cu_ipca.inverse_transform(cu_ipca.transform(X))

    sk_ipca = skIPCA(n_components=n_components, whiten=whiten)

    # The reference implementation is fed a host copy of the data.
    X = cp.asnumpy(X)
    for start in batch_starts:
        sk_ipca.partial_fit(X[start:start + sample_size].copy())

    sk_inv = sk_ipca.inverse_transform(sk_ipca.transform(X))

    assert array_equal(cu_inv, sk_inv, 5e-5, with_sign=True)
예제 #9
0
def test_ivfpq_pred(nrows, ncols, n_neighbors, nlist, M, n_bits,
                    usePrecomputedTables):
    """Predict blob labels from ``cuKNN`` IVFPQ neighbor indices.

    The test is unconditionally marked as an expected failure by the
    ``pytest.xfail`` call at the top; the remaining body documents the
    intended check.
    """
    pytest.xfail("Warning: IVFPQ might be unstable in this "
                 "version of cuML. This is due to a known issue "
                 "in the FAISS release that this cuML version "
                 "is linked to. (see FAISS issue #1421)")

    algo_params = dict(nlist=nlist,
                       nprobe=int(nlist * 0.2),
                       M=M,
                       n_bits=n_bits,
                       usePrecomputedTables=usePrecomputedTables)

    X, y = make_blobs(n_samples=nrows,
                      centers=5,
                      n_features=ncols,
                      random_state=0)

    model = cuKNN(algorithm="ivfpq", algo_params=algo_params)
    model.fit(X)
    neigh_ind = model.kneighbors(X,
                                 n_neighbors=n_neighbors,
                                 return_distance=False)
    del model
    gc.collect()

    labels, _ = predict(neigh_ind, y, n_neighbors)

    assert array_equal(labels, y)
예제 #10
0
def create_rand_blobs(random_state):
    """Return only the sample matrix of a ``make_blobs`` call.

    The call requests 500 samples, 20 features and 20 centers with
    ``order='C'``, seeded with ``random_state``. The second return value of
    ``make_blobs`` is discarded.
    """
    blob_settings = dict(n_samples=500,
                         n_features=20,
                         centers=20,
                         order='C',
                         random_state=random_state)
    samples, _ = make_blobs(**blob_settings)
    return samples
예제 #11
0
def test_fit(nrows, ncols, n_components, sparse_input, density, sparse_format,
             batch_size_divider, whiten):
    """Compare ``cuIPCA.fit`` with ``skIPCA.fit`` on dense or sparse input.

    Both models are fitted on the same data, which is then transformed and
    inverse-transformed. The two reconstructions must match within ``5e-5``.
    The ``'csc'`` sparse format is skipped.
    """
    if sparse_format == 'csc':
        pytest.skip("cupyx.scipy.sparse.csc.csc_matrix does not support"
                    " indexing as of cupy 7.6.0")

    if not sparse_input:
        X, _ = make_blobs(n_samples=nrows, n_features=ncols, random_state=10)
    else:
        X = cupyx.scipy.sparse.random(nrows,
                                      ncols,
                                      density=density,
                                      random_state=10,
                                      format=sparse_format)

    # Both implementations get the same construction arguments.
    ipca_kwargs = dict(n_components=n_components,
                       whiten=whiten,
                       batch_size=int(nrows / batch_size_divider))

    cu_ipca = cuIPCA(**ipca_kwargs)
    cu_ipca.fit(X)
    cu_inv = cu_ipca.inverse_transform(cu_ipca.transform(X))

    sk_ipca = skIPCA(**ipca_kwargs)

    # Hand the reference model a host copy of the data.
    X = X.get() if sparse_input else cp.asnumpy(X)
    sk_ipca.fit(X)
    sk_inv = sk_ipca.inverse_transform(sk_ipca.transform(X))

    assert array_equal(cu_inv, sk_inv, 5e-5, with_sign=True)
예제 #12
0
def test_ivfsq_pred(qtype, encodeResidual, nrows, ncols, n_neighbors, nlist):
    """Predict blob labels from ``cuKNN`` IVFSQ neighbor indices.

    The index is queried with its own training data. The labels that
    ``predict`` derives from the neighbor indices must equal the blob labels.
    """
    algo_params = {
        'nlist': nlist,
        # A probe count is a whole number; truncate the same way the IVFPQ
        # tests in this file do instead of passing a float.
        'nprobe': int(nlist * 0.25),
        'qtype': qtype,
        'encodeResidual': encodeResidual
    }

    X, y = make_blobs(n_samples=nrows,
                      centers=5,
                      n_features=ncols,
                      random_state=0)

    # NOTE(review): this changes the logger level and never restores it --
    # confirm that leaking debug logging into later tests is acceptable.
    logger.set_level(logger.level_debug)
    knn_cu = cuKNN(algorithm="ivfsq", algo_params=algo_params)
    knn_cu.fit(X)
    neigh_ind = knn_cu.kneighbors(X,
                                  n_neighbors=n_neighbors,
                                  return_distance=False)
    del knn_cu
    gc.collect()

    labels, probs = predict(neigh_ind, y, n_neighbors)

    assert array_equal(labels, y)
예제 #13
0
def test_traditional_kmeans_plus_plus_init(nrows, ncols, nclusters,
                                           random_state):
    """Compare scores of ``cuml.KMeans`` (k-means++) and ``cluster.KMeans``.

    Both models are fitted on the same blobs. The test passes when their
    scores differ by at most ``1.5 * cluster_std``.
    """
    # Using fairly high variance between points in clusters
    cluster_std = 1.0

    X, _ = make_blobs(int(nrows),
                      ncols,
                      nclusters,
                      cluster_std=cluster_std,
                      shuffle=False,
                      random_state=0)

    cu_model = cuml.KMeans(init="k-means++",
                           n_clusters=nclusters,
                           n_init=10,
                           random_state=random_state,
                           output_type='numpy')
    cu_model.fit(X)
    cu_score = cu_model.score(X)

    X_host = cp.asnumpy(X)
    sk_model = cluster.KMeans(random_state=random_state, n_clusters=nclusters)
    sk_model.fit(X_host)
    sk_score = sk_model.score(X_host)

    tolerance = cluster_std * 1.5
    assert abs(cu_score - sk_score) <= tolerance
예제 #14
0
def test_knn_x_none(input_type, nrows, n_feats, k, metric):
    """Compare ``cuKNN`` and ``skKNN`` when ``kneighbors`` is given ``X=None``.

    Both models are fitted on the same blobs and queried without explicit
    query points. Distances are compared with a loose tolerance, and the
    data stored on the ``cuKNN`` model is compared with the original input.
    """
    p = 5  # Testing 5-norm of the minkowski metric only

    X, _ = make_blobs(n_samples=nrows,
                      n_features=n_feats, random_state=0)
    X_orig = X

    sk_model = skKNN(metric=metric, p=p)
    sk_model.fit(X.get())
    D_sk, I_sk = sk_model.kneighbors(X=None, n_neighbors=k)

    cu_input = cudf.DataFrame(X) if input_type == "dataframe" else X

    cu_model = cuKNN(metric=metric, p=p, output_type="numpy")
    cu_model.fit(cu_input)
    D_cuml, I_cuml = cu_model.kneighbors(X=None, n_neighbors=k)

    # The data stored on the model (X_m) must still match the original input
    cp.testing.assert_allclose(cu_model.X_m, X_orig,
                               atol=1e-5, rtol=1e-4)

    # Allow a max relative diff of 10% and absolute diff of 5e-2
    cp.testing.assert_allclose(D_cuml, D_sk, atol=5e-2,
                               rtol=1e-1)

    # NOTE(review): this only compares two booleans ("are all indices
    # non-zero?"), not the index arrays themselves -- confirm whether an
    # element-wise comparison was intended.
    assert I_cuml.all() == I_sk.all()
예제 #15
0
def test_score(nrows, ncols, nclusters, random_state):
    """Check ``KMeans.score`` against a hand-computed value.

    The expected value is the negated sum, over all samples, of the squared
    euclidean distance between a sample and the center of its predicted
    cluster. It is compared with ``atol=0.1`` and ``rtol=1e-5``.
    """
    X, _ = make_blobs(int(nrows),
                      ncols,
                      nclusters,
                      cluster_std=1.0,
                      shuffle=False,
                      random_state=0)

    model = cuml.KMeans(init="k-means||",
                        n_clusters=nclusters,
                        random_state=random_state,
                        output_type='numpy')
    model.fit(X)

    actual_score = model.score(X)
    assigned = model.predict(X)
    centers = model.cluster_centers_

    expected_score = 0.0
    for row, label in enumerate(assigned):
        center = cp.array(centers[label, :], dtype=cp.float32)
        expected_score += cp.sum(cp.square(X[row, :] - center))

    # The score is the negative of the accumulated squared distances.
    expected_score *= -1

    cp.testing.assert_allclose(actual_score,
                               expected_score,
                               atol=0.1,
                               rtol=1e-5)
예제 #16
0
def test_ivfpq_pred(nrows, ncols, n_neighbors, nlist, M, n_bits,
                    usePrecomputedTables):
    """Predict blob labels from ``cuKNN`` IVFPQ neighbor indices.

    The index is queried with its own training data. The labels that
    ``predict`` derives from the neighbor indices must equal the blob labels.
    """
    algo_params = dict(nlist=nlist,
                       nprobe=int(nlist * 0.2),
                       M=M,
                       n_bits=n_bits,
                       usePrecomputedTables=usePrecomputedTables)

    X, y = make_blobs(n_samples=nrows,
                      centers=5,
                      n_features=ncols,
                      random_state=0)

    model = cuKNN(algorithm="ivfpq", algo_params=algo_params)
    model.fit(X)
    neigh_ind = model.kneighbors(X,
                                 n_neighbors=n_neighbors,
                                 return_distance=False)

    # Drop the model before the comparison work below.
    del model
    gc.collect()

    labels, _ = predict(neigh_ind, y, n_neighbors)

    assert array_equal(labels, y)
예제 #17
0
파일: test_kmeans.py 프로젝트: thomcom/cuml
def test_kmeans_sequential_plus_plus_init(nrows, ncols, nclusters,
                                          random_state):
    """Compare scores of ``cuml.KMeans`` (k-means++) and ``cluster.KMeans``.

    Both models are fitted on the same blobs; the reference model receives
    the data through ``X.copy_to_host()``. The test passes when the scores
    differ by at most ``1.5 * cluster_std``.
    """
    # Using fairly high variance between points in clusters
    cluster_std = 1.0

    X, _ = make_blobs(nrows,
                      ncols,
                      nclusters,
                      cluster_std=cluster_std,
                      shuffle=False,
                      random_state=0)

    cu_model = cuml.KMeans(verbose=0,
                           init="k-means++",
                           n_clusters=nclusters,
                           n_init=10,
                           random_state=random_state,
                           output_type='numpy')
    cu_model.fit(X)
    cu_score = cu_model.score(X)

    sk_model = cluster.KMeans(random_state=random_state, n_clusters=nclusters)
    sk_model.fit(X.copy_to_host())
    sk_score = sk_model.score(X.copy_to_host())

    tolerance = cluster_std * 1.5
    assert abs(cu_score - sk_score) <= tolerance
예제 #18
0
파일: test_kmeans.py 프로젝트: thomcom/cuml
def test_n_init_cluster_consistency(random_state):
    """Check that two identically configured fits give the same centers.

    Two fresh ``cuml.KMeans`` models with the same settings and seed are
    fitted on the same blobs; their ``cluster_centers_`` must be equal.
    """
    cluster_std = 1.0

    nrows = 100000
    ncols = 100
    nclusters = 8

    X, _ = make_blobs(nrows,
                      ncols,
                      nclusters,
                      cluster_std=cluster_std,
                      shuffle=False,
                      random_state=0)

    def _fit_and_get_centers():
        # Build a fresh model each time so no state carries over.
        model = cuml.KMeans(verbose=0,
                            init="k-means++",
                            n_clusters=nclusters,
                            n_init=10,
                            random_state=random_state,
                            output_type='numpy')
        model.fit(X)
        return model.cluster_centers_

    initial_clusters = _fit_and_get_centers()
    repeated_clusters = _fit_and_get_centers()

    assert array_equal(initial_clusters, repeated_clusters)
예제 #19
0
def test_lasso_attributes():
    """Check that a fitted ``cumlLasso`` exposes the expected attributes."""
    X, y = make_blobs()
    clf = cumlLasso()
    clf.fit(X, y)

    # Each attribute name is listed once.
    attrs = ["dtype", "solver_model", "coef_", "intercept_",
             "l1_ratio", "n_cols"]
    for attr in attrs:
        # Name the missing attribute in the failure message.
        assert hasattr(clf, attr), attr
예제 #20
0
def test_logistic_regression_attributes():
    """Check that a fitted ``cuLog`` exposes the expected attributes."""
    X, y = make_blobs()
    model = cuLog().fit(X, y, convert_dtype=True)

    expected = ("dtype", "solver_model", "coef_", "intercept_",
                "l1_ratio", "n_cols", "C", "penalty",
                "fit_intercept", "solver")

    # Collect the absent names so a failure shows which ones are missing.
    missing = [name for name in expected if not hasattr(model, name)]
    assert not missing
예제 #21
0
def test_elastic_net_attributes():
    """Check that a fitted ``cumlElastic`` exposes the expected attributes."""
    X, y = make_blobs()
    model = cumlElastic(fit_intercept=False)
    model.fit(X, y)

    expected = ("dtype", "solver_model", "coef_", "intercept_",
                "l1_ratio", "n_cols", "alpha", "max_iter",
                "fit_intercept")

    # Collect the absent names so a failure shows which ones are missing.
    missing = [name for name in expected if not hasattr(model, name)]
    assert not missing
예제 #22
0
def test_mbsgd_classifier_attributes():
    """Check that a fitted ``cumlMBSGClassifier`` exposes its attributes."""
    X, y = make_blobs()
    model = cumlMBSGClassifier()
    model.fit(X, y)

    expected = ("dtype", "solver_model", "coef_", "intercept_",
                "l1_ratio", "n_cols", "eta0", "batch_size",
                "fit_intercept", "penalty")

    # Collect the absent names so a failure shows which ones are missing.
    missing = [name for name in expected if not hasattr(model, name)]
    assert not missing
예제 #23
0
def test_mbsgd_regressor_attributes():
    """Check that a fitted ``cumlMBSGRegressor`` exposes its attributes."""
    X, y = make_blobs()
    model = cumlMBSGRegressor()
    model.fit(X, y)

    expected = ("dtype", "solver_model", "coef_", "intercept_",
                "l1_ratio", "n_cols", "loss", "eta0", "batch_size",
                "epochs")

    # Collect the absent names so a failure shows which ones are missing.
    missing = [name for name in expected if not hasattr(model, name)]
    assert not missing
예제 #24
0
def test_self_neighboring(datatype, metric_p, nrows):
    """Test that searches using an indexed vector itself return sensible
    results for that vector

    For L2-derived metrics, this specifically exercises the slow high-precision
    mode used to correct for approximation errors in L2 computation during NN
    searches.

    The nearest neighbor of every row must be the row itself, at a distance
    of zero within ``atol=1e-4``. The test is skipped when ``has_scipy()``
    is false.
    """
    ncols = 1000
    n_clusters = 10
    n_neighbors = 3

    # NOTE(review): the second element of ``metric_p`` is unpacked but never
    # passed to ``cuKNN`` -- confirm whether ``p`` should be forwarded.
    metric, _p = metric_p

    if not has_scipy():
        # Report this test's own name in the skip reason.
        pytest.skip('Skipping test_self_neighboring because ' +
                    'Scipy is missing')

    X, _ = make_blobs(n_samples=nrows,
                      centers=n_clusters,
                      n_features=ncols,
                      random_state=0)

    if datatype == "dataframe":
        X = cudf.DataFrame(X)

    knn_cu = cuKNN(metric=metric, n_neighbors=n_neighbors)
    knn_cu.fit(X)
    neigh_dist, neigh_ind = knn_cu.kneighbors(X,
                                              n_neighbors=n_neighbors,
                                              return_distance=True,
                                              two_pass_precision=True)

    if datatype == 'dataframe':
        assert isinstance(neigh_ind, cudf.DataFrame)
        neigh_ind = neigh_ind.to_numpy()
        neigh_dist = neigh_dist.to_numpy()
    else:
        assert isinstance(neigh_ind, cp.ndarray)
        neigh_ind = neigh_ind.get()
        neigh_dist = neigh_dist.get()

    # Only the first (closest) neighbor of every row is checked.
    neigh_ind = neigh_ind[:, 0]
    neigh_dist = neigh_dist[:, 0]

    assert_array_equal(
        neigh_ind,
        np.arange(0, neigh_dist.shape[0]),
    )
    assert_allclose(neigh_dist,
                    np.zeros(neigh_dist.shape, dtype=neigh_dist.dtype),
                    atol=1e-4)
예제 #25
0
def get_data_consistency_test():
    """Return the fixed ``(X, y)`` blobs used by the consistency tests.

    The blobs have 1000 rows, 50 columns and 8 clusters, with
    ``cluster_std=1.0``, no shuffling and ``random_state=0``.
    """
    nrows, ncols, nclusters = 1000, 50, 8
    cluster_std = 1.0

    samples, labels = make_blobs(nrows,
                                 ncols,
                                 nclusters,
                                 cluster_std=cluster_std,
                                 shuffle=False,
                                 random_state=0)
    return samples, labels
예제 #26
0
def test_knn_separate_index_search(input_type, nrows, n_feats, k, metric):
    """Compare ``cuKNN`` and ``skKNN`` with separate index and query sets.

    The first 100 rows are indexed and the rows from 101 onwards are used as
    queries. Distances are compared with the reference implementation, and
    the data stored on the ``cuKNN`` model is compared with the indexed rows.
    """
    p = 5  # Testing 5-norm of the minkowski metric only
    use_dataframe = input_type == "dataframe"

    X, _ = make_blobs(n_samples=nrows, n_features=n_feats, random_state=0)

    # NOTE(review): row 100 ends up in neither set -- confirm that the gap
    # between the two slices is intentional.
    X_index = X[:100]
    X_search = X[101:]
    X_orig = X_index

    sk_model = skKNN(metric=metric, p=p)
    sk_model.fit(X_index.get())
    D_sk, I_sk = sk_model.kneighbors(X_search.get(), k)

    if use_dataframe:
        X_index = cudf.DataFrame(X_index)
        X_search = cudf.DataFrame(X_search)

    cu_model = cuKNN(metric=metric, p=p)
    cu_model.fit(X_index)
    D_cuml, I_cuml = cu_model.kneighbors(X_search, k)

    # The output container is expected to mirror the input container.
    expected_type = cudf.DataFrame if use_dataframe else cp.ndarray
    assert isinstance(D_cuml, expected_type)
    assert isinstance(I_cuml, expected_type)
    if use_dataframe:
        D_cuml_np = D_cuml.to_numpy()
        I_cuml_np = I_cuml.to_numpy()
    else:
        D_cuml_np = D_cuml.get()
        I_cuml_np = I_cuml.get()

    with cuml.using_output_type("numpy"):
        # The data stored on the model (X_m) must still match the indexed rows
        np.testing.assert_allclose(cu_model.X_m,
                                   X_orig.get(),
                                   atol=1e-3,
                                   rtol=1e-3)

    if metric != 'braycurtis':
        np.testing.assert_allclose(D_cuml_np, D_sk, atol=1e-3, rtol=1e-3)
    else:
        # Braycurtis has a few differences, but this is computed by FAISS.
        # So long as the indices all match below, the small discrepancy
        # should be okay.
        diff = D_cuml_np - D_sk
        assert len(diff[diff > 1e-2]) / X_search.shape[0] < 0.06

    # NOTE(review): this only compares two booleans ("are all indices
    # non-zero?"), not the index arrays themselves -- confirm whether an
    # element-wise comparison was intended.
    assert I_cuml_np.all() == I_sk.all()
예제 #27
0
def test_fuzzy_simplicial_set(n_rows, n_features, n_neighbors,
                              precomputed_nearest_neighbors):
    """Compare ``cu_fuzzy_simplicial_set`` with ``ref_fuzzy_simplicial_set``.

    Both graphs are built from the same blobs, optionally from neighbors
    precomputed with ``NearestNeighbors``. They are then densified and
    compared with ``correctness_sparse``.
    """
    n_clusters = 30
    random_state = 42
    metric = 'euclidean'

    X, _ = make_blobs(n_samples=n_rows,
                      centers=n_clusters,
                      n_features=n_features,
                      random_state=random_state)

    if not precomputed_nearest_neighbors:
        cu_fss_graph = cu_fuzzy_simplicial_set(X, n_neighbors, random_state,
                                               metric)
        # The reference implementation is called with a host copy of X.
        ref_result = ref_fuzzy_simplicial_set(X.get(), n_neighbors,
                                              random_state, metric)
    else:
        nn = NearestNeighbors(n_neighbors=n_neighbors, metric=metric)
        nn.fit(X)
        knn_dists, knn_indices = nn.kneighbors(X,
                                               n_neighbors,
                                               return_distance=True)
        cu_fss_graph = cu_fuzzy_simplicial_set(X,
                                               n_neighbors,
                                               random_state,
                                               metric,
                                               knn_indices=knn_indices,
                                               knn_dists=knn_dists)
        # Only the neighbor arrays are copied to host; X is passed
        # unchanged, as in the call above.
        ref_result = ref_fuzzy_simplicial_set(X,
                                              n_neighbors,
                                              random_state,
                                              metric,
                                              knn_indices=knn_indices.get(),
                                              knn_dists=knn_dists.get())

    ref_fss_graph = ref_result[0].tocoo()

    cu_dense = cu_fss_graph.todense()
    ref_dense = cp.sparse.coo_matrix(ref_fss_graph).todense()
    assert correctness_sparse(ref_dense,
                              cu_dense,
                              atol=0.1,
                              rtol=0.2,
                              threshold=0.95)
예제 #28
0
def test_kmeans_clusters_blobs(nrows, ncols, nclusters,
                               random_state, cluster_std):
    """Check that ``cuml.KMeans`` recovers the blob labels.

    The adjusted Rand score between the predictions and the true blob labels
    must be at least 0.99.
    """
    X, y = make_blobs(int(nrows),
                      ncols,
                      nclusters,
                      cluster_std=cluster_std,
                      shuffle=False,
                      random_state=random_state)

    model = cuml.KMeans(init="k-means||",
                        n_clusters=nclusters,
                        random_state=random_state,
                        output_type='numpy')
    preds = model.fit_predict(X)

    agreement = adjusted_rand_score(cp.asnumpy(preds), cp.asnumpy(y))
    assert agreement >= 0.99
예제 #29
0
def test_return_dists():
    """Check the return shape of ``kneighbors`` for both ``return_distance``
    settings.

    With ``return_distance=False`` a single ``(n_samples, k)`` object is
    expected; with ``return_distance=True`` a 2-tuple is expected.
    """
    n_samples, n_feats, k = 50, 50, 5

    X, _ = make_blobs(n_samples=n_samples, n_features=n_feats, random_state=0)

    model = cuKNN()
    model.fit(X)

    indices_only = model.kneighbors(X, k, return_distance=False)
    assert not isinstance(indices_only, tuple)
    assert indices_only.shape == (n_samples, k)

    with_distances = model.kneighbors(X, k, return_distance=True)
    assert isinstance(with_distances, tuple)
    assert len(with_distances) == 2
예제 #30
0
def test_ann_distances_metrics(algo, metric):
    """Compare ``cuKNN`` distances with ``skKNN`` distances for a metric.

    Both models are fitted on the same blobs and queried with the training
    data for 10 neighbors. The resulting distance arrays must satisfy
    ``array_equal``.
    """
    X, y = make_blobs(n_samples=500, centers=2, n_features=128, random_state=0)

    cu_knn = cuKNN(algorithm=algo, metric=metric)
    cu_knn.fit(X)
    cu_dist, cu_ind = cu_knn.kneighbors(X,
                                        n_neighbors=10,
                                        return_distance=True)
    del cu_knn
    gc.collect()

    # The reference implementation works on a host copy of the data.
    X = X.get()
    sk_knn = skKNN(metric=metric)
    sk_knn.fit(X)
    sk_dist, sk_ind = sk_knn.kneighbors(X,
                                        n_neighbors=10,
                                        return_distance=True)

    # Assert rather than return: pytest ignores a test's return value, so
    # returning the comparison would let the test pass unconditionally.
    assert array_equal(sk_dist, cu_dist)