예제 #1
0
def test_haversine_fails_high_dimensions():
    """Run a brute-force haversine search on data with three columns.

    NOTE(review): the name says this should fail, but nothing in the body
    asserts a failure. Presumably an xfail/raises marker lives on a
    decorator that is not visible here -- confirm.
    """
    points = np.array([[0., 1., 2.], [3., 4., 5.]])

    model = cuKNN(metric='haversine', n_neighbors=2, algorithm='brute')

    fitted = model.fit(points)
    fitted.kneighbors(points)
예제 #2
0
def test_neighborhood_predictions(nrows, ncols, n_neighbors, n_clusters,
                                  datatype, algo):
    """Check that neighbor indices from cuKNN reproduce the blob labels.

    The model is fit on blob data, queried with the same data, and the
    returned neighbor indices are passed to ``predict`` together with the
    true labels; the resulting labels must equal the true labels.

    Parameters
    ----------
    nrows, ncols, n_clusters : int
        Forwarded to ``make_blobs`` as sample, feature and center counts.
    n_neighbors : int
        Number of neighbors requested from ``kneighbors``.
    datatype : str
        ``"dataframe"`` wraps the input in a ``cudf.DataFrame``; any other
        value uses the array returned by ``make_blobs`` directly.
    algo : str
        Value passed as ``algorithm`` to ``cuKNN``.
    """
    if algo == "ivfpq":
        pytest.xfail("""See Memory access error in IVFPQ :
                        https://github.com/rapidsai/cuml/issues/3318""")

    if not has_scipy():
        pytest.skip('Skipping test_neighborhood_predictions because ' +
                    'Scipy is missing')

    X, y = make_blobs(n_samples=nrows,
                      centers=n_clusters,
                      n_features=ncols,
                      random_state=0)

    if datatype == "dataframe":
        X = cudf.DataFrame(X)

    knn_cu = cuKNN(algorithm=algo)
    knn_cu.fit(X)
    neigh_ind = knn_cu.kneighbors(X,
                                  n_neighbors=n_neighbors,
                                  return_distance=False)
    # Drop the model before the label check, as the original test did.
    del knn_cu
    gc.collect()

    if datatype == "dataframe":
        assert isinstance(neigh_ind, cudf.DataFrame)
        neigh_ind = neigh_ind.as_gpu_matrix().copy_to_host()
    else:
        # Use the public alias: ``cp.core.core`` is a private module path
        # that newer CuPy releases no longer expose.
        assert isinstance(neigh_ind, cp.ndarray)

    labels, probs = predict(neigh_ind, y, n_neighbors)

    assert array_equal(labels, y)
예제 #3
0
def test_ivfpq_pred(nrows, ncols, n_neighbors, nlist, M, n_bits,
                    usePrecomputedTables):
    """Fit an "ivfpq" cuKNN index on blobs and check label recovery.

    Neighbor indices of the training data are fed to ``predict`` with the
    true labels; the predicted labels must equal the true labels.
    """
    ivfpq_params = dict(nlist=nlist,
                        nprobe=int(nlist * 0.2),
                        M=M,
                        n_bits=n_bits,
                        usePrecomputedTables=usePrecomputedTables)

    features, targets = make_blobs(n_samples=nrows,
                                   centers=5,
                                   n_features=ncols,
                                   random_state=0)

    model = cuKNN(algorithm="ivfpq", algo_params=ivfpq_params)
    model.fit(features)
    neighbor_idx = model.kneighbors(features,
                                    n_neighbors=n_neighbors,
                                    return_distance=False)

    # Release the index before running the label check.
    del model
    gc.collect()

    labels, probs = predict(neighbor_idx, targets, n_neighbors)

    assert array_equal(labels, targets)
예제 #4
0
def test_cuml_against_sklearn(input_type, nrows, n_feats, k):
    """Compare cuKNN neighbor distances with scikit-learn on blob data.

    With ``input_type == "dataframe"`` the cuML side receives a
    ``cudf.DataFrame`` and must return DataFrames; otherwise NumPy arrays
    are expected back.
    """
    X, _ = make_blobs(n_samples=nrows, n_features=n_feats, random_state=0)

    # Reference result computed by scikit-learn on the host array.
    sk_model = skKNN(metric="euclidean")
    sk_model.fit(X)
    D_sk, I_sk = sk_model.kneighbors(X, k)

    as_dataframe = input_type == "dataframe"
    if as_dataframe:
        X = cudf.DataFrame.from_gpu_matrix(rmm.to_device(X))

    cu_model = cuKNN()
    cu_model.fit(X)
    D_cuml, I_cuml = cu_model.kneighbors(X, k)

    expected_type = cudf.DataFrame if as_dataframe else np.ndarray
    assert isinstance(D_cuml, expected_type)
    assert isinstance(I_cuml, expected_type)

    if as_dataframe:
        D_cuml_arr = D_cuml.as_gpu_matrix().copy_to_host()
        I_cuml_arr = I_cuml.as_gpu_matrix().copy_to_host()
    else:
        D_cuml_arr, I_cuml_arr = D_cuml, I_cuml

    assert array_equal(D_cuml_arr, D_sk, 1e-2, with_sign=True)
    # NOTE(review): this compares two scalars (the truthiness of ``.all()``
    # on each index array), not the indices element by element, so it says
    # very little about index agreement. Kept as-is to preserve behavior.
    assert I_cuml_arr.all() == I_sk.all()
예제 #5
0
def test_neighborhood_predictions(nrows, ncols, n_neighbors, n_clusters,
                                  datatype):
    """Check that cuKNN neighbor indices reproduce the blob labels.

    The float32 blob data is used both to fit and to query the model; the
    indices are handed to ``predict`` with the true labels.
    """
    features, targets = make_blobs(n_samples=nrows,
                                   centers=n_clusters,
                                   n_features=ncols,
                                   random_state=0)
    features = features.astype(np.float32)

    as_dataframe = datatype == "dataframe"
    if as_dataframe:
        features = cudf.DataFrame.from_gpu_matrix(rmm.to_device(features))

    model = cuKNN()
    model.fit(features)
    neighbor_idx = model.kneighbors(features,
                                    n_neighbors=n_neighbors,
                                    return_distance=False)

    if as_dataframe:
        assert isinstance(neighbor_idx, cudf.DataFrame)
        # Bring the indices to the host before the label check.
        neighbor_idx = neighbor_idx.as_gpu_matrix().copy_to_host()
    else:
        assert isinstance(neighbor_idx, np.ndarray)

    labels, probs = predict(neighbor_idx, targets, n_neighbors)

    assert array_equal(labels, targets)
예제 #6
0
def test_ivfsq_pred(qtype, encodeResidual, nrows, ncols, n_neighbors, nlist):
    """Fit an "ivfsq" cuKNN index on blobs and check label recovery.

    Parameters
    ----------
    qtype, encodeResidual
        Forwarded unchanged inside ``algo_params``.
    nrows, ncols : int
        Sample and feature counts for ``make_blobs``.
    n_neighbors : int
        Number of neighbors requested from ``kneighbors``.
    nlist : int
        Forwarded as ``nlist``; ``nprobe`` is derived as a quarter of it.
    """
    algo_params = {
        'nlist': nlist,
        # nprobe is a count, so pass an integer (as the ivfpq tests do)
        # rather than the float produced by ``nlist * 0.25``.
        'nprobe': int(nlist * 0.25),
        'qtype': qtype,
        'encodeResidual': encodeResidual
    }

    X, y = make_blobs(n_samples=nrows,
                      centers=5,
                      n_features=ncols,
                      random_state=0)

    # NOTE(review): this changes the logger level and never restores it.
    logger.set_level(logger.level_debug)
    knn_cu = cuKNN(algorithm="ivfsq", algo_params=algo_params)
    knn_cu.fit(X)
    neigh_ind = knn_cu.kneighbors(X,
                                  n_neighbors=n_neighbors,
                                  return_distance=False)
    # Release the index before running the label check.
    del knn_cu
    gc.collect()

    labels, probs = predict(neigh_ind, y, n_neighbors)

    assert array_equal(labels, y)
예제 #7
0
def test_ivfpq_pred(nrows, ncols, n_neighbors, nlist, M, n_bits,
                    usePrecomputedTables):
    """IVFPQ label-recovery test, currently marked xfail at the top.

    ``pytest.xfail`` is called unconditionally as the first statement, so
    the statements after it document the intended check for when the
    xfail is removed.
    """
    pytest.xfail("Warning: IVFPQ might be unstable in this "
                 "version of cuML. This is due to a known issue "
                 "in the FAISS release that this cuML version "
                 "is linked to. (see FAISS issue #1421)")

    ivfpq_params = dict(nlist=nlist,
                        nprobe=int(nlist * 0.2),
                        M=M,
                        n_bits=n_bits,
                        usePrecomputedTables=usePrecomputedTables)

    features, targets = make_blobs(n_samples=nrows,
                                   centers=5,
                                   n_features=ncols,
                                   random_state=0)

    model = cuKNN(algorithm="ivfpq", algo_params=ivfpq_params)
    model.fit(features)
    neighbor_idx = model.kneighbors(features,
                                    n_neighbors=n_neighbors,
                                    return_distance=False)

    # Release the index before running the label check.
    del model
    gc.collect()

    labels, probs = predict(neighbor_idx, targets, n_neighbors)

    assert array_equal(labels, targets)
예제 #8
0
def test_neighborhood_predictions(nrows, ncols, n_neighbors, n_clusters,
                                  datatype):
    """Check that cuKNN neighbor indices reproduce the blob labels.

    Skipped when ``has_scipy()`` is false. The float32 blob data is used
    both to fit and to query the model.
    """
    if not has_scipy():
        pytest.skip('Skipping test_neighborhood_predictions because ' +
                    'Scipy is missing')

    features, targets = make_blobs(n_samples=nrows,
                                   centers=n_clusters,
                                   n_features=ncols,
                                   random_state=0)
    features = features.astype(np.float32)

    as_dataframe = datatype == "dataframe"
    if as_dataframe:
        features = cudf.DataFrame(features)

    model = cuKNN()
    model.fit(features)
    neighbor_idx = model.kneighbors(features,
                                    n_neighbors=n_neighbors,
                                    return_distance=False)

    if as_dataframe:
        assert isinstance(neighbor_idx, cudf.DataFrame)
        # Bring the indices to the host before the label check.
        neighbor_idx = neighbor_idx.as_gpu_matrix().copy_to_host()
    else:
        assert isinstance(neighbor_idx, np.ndarray)

    labels, probs = predict(neighbor_idx, targets, n_neighbors)

    assert array_equal(labels, targets)
예제 #9
0
def test_haversine(n_neighbors):
    """Compare brute-force haversine k-NN distances with pairwise distances.

    For each query row, the ``n_neighbors`` smallest entries of the
    ``pairwise_distances`` matrix must match the distances returned by
    cuKNN within 1e-4 absolute/relative tolerance.
    """
    # Coordinate pairs given in degrees; converted to radians below.
    city_coords_deg = np.array([
        [40.745255, -74.034775],    # Hoboken, NJ
        [34.155834, -119.202789],   # Port Hueneme, CA
        [42.933334, -76.566666],    # Auburn, NY
        [29.499722, -95.089722],    # League City, TX
        [30.455000, -84.253334],    # Tallahassee, FL
        [41.763889, -88.29001],     # Aurora, IL
    ])

    data = city_coords_deg * math.pi / 180

    pw_dists = pairwise_distances(data, metric='haversine')

    model = cuKNN(metric='haversine',
                  n_neighbors=n_neighbors,
                  algorithm='brute')
    dists, inds = model.fit(data).kneighbors(data)

    # Row-wise ascending sort yields the reference nearest distances.
    expected = np.sort(pw_dists, axis=1)[:, :n_neighbors]

    for row, expected_row in enumerate(expected):
        cp.testing.assert_allclose(expected_row,
                                   dists[row],
                                   atol=1e-4,
                                   rtol=1e-4)
예제 #10
0
def test_nn_downcast_fails(input_type, nrows, n_feats):
    """Check that ``fit(..., convert_dtype=False)`` raises.

    For ``input_type == 'dataframe'`` the model is first fit on a cuDF
    copy of the data with ``convert_dtype=True``; that call is expected to
    succeed.
    """
    X, _ = make_blobs(n_samples=nrows, n_features=n_feats, random_state=0)

    model = cuKNN()
    if input_type == 'dataframe':
        columns = {'fea%d' % i: X[0:, i] for i in range(X.shape[1])}
        X_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(columns))
        model.fit(X_cudf, convert_dtype=True)

    with pytest.raises(Exception):
        model.fit(X, convert_dtype=False)

    # A float64 value equal to the float32 maximum must also be rejected
    # when dtype conversion is disabled.
    float32_max = np.array([[np.finfo(np.float32).max]], dtype=np.float64)
    model = cuKNN()
    with pytest.raises(Exception):
        model.fit(float32_max, convert_dtype=False)
예제 #11
0
def test_nn_downcast_fails(input_type):
    """Check that ``fit`` raises for two float64 inputs.

    Case 1: ordinary float64 data with ``should_downcast=False``.
    Case 2: a float64 value equal to the float32 maximum with
    ``should_downcast=True``.
    """
    cases = (
        (np.array([[1.0], [50.0], [51.0]], dtype=np.float64), False),
        (np.array([[np.finfo(np.float32).max]], dtype=np.float64), True),
    )

    for X, downcast in cases:
        knn_cu = cuKNN()
        if input_type == 'dataframe':
            X = cudf.DataFrame.from_pandas(pd.DataFrame(X))

        with pytest.raises(Exception):
            knn_cu.fit(X, should_downcast=downcast)
예제 #12
0
def test_nonmonotonic_labels():
    """Predict with labels that are neither sorted nor contiguous (15, 5).

    With one neighbor, predicting on the training rows must return the
    training labels.
    """
    features = np.array([[0, 0, 1], [1, 0, 1]]).astype(np.float32)
    targets = np.array([15, 5]).astype(np.int32)

    model = cuKNN(n_neighbors=1)
    model.fit(features, targets)

    predicted = model.predict(features)

    assert array_equal(predicted.astype(np.int32), targets)
예제 #13
0
def test_self_neighboring(datatype, metric_p, nrows):
    """Test that searches using an indexed vector itself return sensible
    results for that vector

    For L2-derived metrics, this specifically exercises the slow high-precision
    mode used to correct for approximation errors in L2 computation during NN
    searches.

    Parameters
    ----------
    datatype : str
        ``"dataframe"`` wraps the input in a ``cudf.DataFrame``.
    metric_p : tuple
        ``(metric, p)`` pair; only ``metric`` is passed to ``cuKNN``.
    nrows : int
        Number of samples generated by ``make_blobs``.
    """
    if not has_scipy():
        # The message previously named test_neighborhood_predictions,
        # a copy-paste slip that made skip reports misleading.
        pytest.skip('Skipping test_self_neighboring because ' +
                    'Scipy is missing')

    ncols = 1000
    n_clusters = 10
    n_neighbors = 3

    metric, p = metric_p

    X, _ = make_blobs(n_samples=nrows,
                      centers=n_clusters,
                      n_features=ncols,
                      random_state=0)

    if datatype == "dataframe":
        X = cudf.DataFrame(X)

    knn_cu = cuKNN(metric=metric, n_neighbors=n_neighbors)
    knn_cu.fit(X)
    neigh_dist, neigh_ind = knn_cu.kneighbors(X,
                                              n_neighbors=n_neighbors,
                                              return_distance=True,
                                              two_pass_precision=True)

    if datatype == 'dataframe':
        assert isinstance(neigh_ind, cudf.DataFrame)
        neigh_ind = neigh_ind.to_numpy()
        neigh_dist = neigh_dist.to_numpy()
    else:
        assert isinstance(neigh_ind, cp.ndarray)
        neigh_ind = neigh_ind.get()
        neigh_dist = neigh_dist.get()

    # Only the closest neighbor matters: it must be the query row itself.
    neigh_ind = neigh_ind[:, 0]
    neigh_dist = neigh_dist[:, 0]

    assert_array_equal(
        neigh_ind,
        np.arange(0, neigh_dist.shape[0]),
    )
    # ...and its distance must be zero within tolerance.
    assert_allclose(neigh_dist,
                    np.zeros(neigh_dist.shape, dtype=neigh_dist.dtype),
                    atol=1e-4)
예제 #14
0
def test_score(nrows, ncols, n_neighbors, n_clusters, datatype):
    """Fit on a train split of tight blobs and require score >= 0.996."""
    features, targets = make_blobs(n_samples=nrows,
                                   centers=n_clusters,
                                   n_features=ncols,
                                   random_state=0,
                                   cluster_std=0.01)
    features = features.astype(np.float32)

    split = _build_train_test_data(features, targets, datatype)
    X_train, X_test, y_train, y_test = split

    model = cuKNN(n_neighbors=n_neighbors)
    model.fit(X_train, y_train)

    min_score = 1.0 - 0.004
    assert model.score(X_test, y_test) >= min_score
예제 #15
0
def test_tsne_knn_graph_used(dataset, type_knn_graph, method):
    """Check that TSNE uses a caller-supplied k-NN graph.

    An embedding built from the real k-NN graph must be more trustworthy
    (by more than 0.15) than embeddings built from a graph computed on
    constant "garbage" data. The garbage run is performed three times.
    """
    X = dataset.data
    use_cuml_graph = type_knn_graph == 'cuml'

    # Same estimator settings for the normal and the garbage runs.
    tsne_kwargs = dict(random_state=1,
                       n_neighbors=DEFAULT_N_NEIGHBORS,
                       method=method,
                       perplexity=DEFAULT_PERPLEXITY,
                       learning_rate_method='none',
                       min_grad_norm=1e-12)

    neigh = cuKNN(n_neighbors=DEFAULT_N_NEIGHBORS, metric="euclidean").fit(X)

    knn_graph = neigh.kneighbors_graph(X, mode="distance").astype('float32')
    if use_cuml_graph:
        knn_graph = cupyx.scipy.sparse.csr_matrix(knn_graph)

    # Embedding from the real graph.
    tsne = TSNE(**tsne_kwargs)
    Y = tsne.fit_transform(X, True, knn_graph)
    trust_normal = trustworthiness(X, Y, n_neighbors=DEFAULT_N_NEIGHBORS)

    # Graph of the fitted index queried with an all-ones matrix.
    X_garbage = np.ones(X.shape)
    knn_graph_garbage = neigh.kneighbors_graph(
        X_garbage, mode="distance").astype('float32')
    if use_cuml_graph:
        knn_graph_garbage = cupyx.scipy.sparse.csr_matrix(knn_graph_garbage)

    # Embeddings from the garbage graph, repeated on one estimator.
    tsne = TSNE(**tsne_kwargs)
    for _ in range(3):
        Y = tsne.fit_transform(X, True, knn_graph_garbage)
        trust_garbage = trustworthiness(X, Y,
                                        n_neighbors=DEFAULT_N_NEIGHBORS)
        assert (trust_normal - trust_garbage) > 0.15
예제 #16
0
def test_nearest_neighbors_rbc(distance, n_neighbors, nrows):
    """Compare the "rbc" algorithm with a brute-force reference.

    The first half of the 2-feature blob data is used as the query set.
    At most one mismatching element is tolerated in distances and in
    (row-sorted) indices.
    """
    X, _ = make_blobs(n_samples=nrows,
                      centers=25,
                      shuffle=True,
                      n_features=2,
                      cluster_std=3.0,
                      random_state=42)

    rbc_model = cuKNN(metric=distance, algorithm="rbc")
    rbc_model.fit(X)

    query_rows = int(nrows / 2)
    queries = X[:query_rows, :]

    rbc_d, rbc_i = rbc_model.kneighbors(queries, n_neighbors=n_neighbors)

    if distance != 'euclidean':
        brute_model = cuKNN(metric=distance, algorithm="brute")
        brute_model.fit(X)
        brute_d, brute_i = brute_model.kneighbors(queries,
                                                  n_neighbors=n_neighbors)
    else:
        # Need to use unexpanded euclidean distance
        pw_dists = cuPW(X, metric="l2")
        brute_i = cp.argsort(pw_dists, axis=1)[:query_rows, :n_neighbors]
        brute_d = cp.sort(pw_dists, axis=1)[:query_rows, :n_neighbors]

    # Sort indices within each row so neighbor order does not matter.
    rbc_i = cp.sort(rbc_i, axis=1)
    brute_i = cp.sort(brute_i, axis=1)

    # TODO: These are failing with 1 or 2 mismatched elements
    # for very small values of k:
    # https://github.com/rapidsai/cuml/issues/4262
    dist_mismatches = brute_d[brute_d != rbc_d]
    index_mismatches = brute_i[brute_i != rbc_i]
    assert len(dist_mismatches) <= 1
    assert len(index_mismatches) <= 1
예제 #17
0
def test_knn_separate_index_search(input_type, nrows, n_feats, k, metric):
    """Index one slice of the data, search with another, compare to sklearn.

    Also checks that the data stored on the fitted cuML model (``X_m``)
    still matches the original index rows.
    """
    X, _ = make_blobs(n_samples=nrows, n_features=n_feats, random_state=0)

    # NOTE(review): row 100 lands in neither slice -- confirm intended.
    X_index = X[:100]
    X_search = X[101:]
    X_orig = X_index

    p = 5  # Testing 5-norm of the minkowski metric only

    # scikit-learn reference, computed on host copies.
    sk_model = skKNN(metric=metric, p=p)
    sk_model.fit(X_index.get())
    D_sk, I_sk = sk_model.kneighbors(X_search.get(), k)

    as_dataframe = input_type == "dataframe"
    if as_dataframe:
        X_index = cudf.DataFrame(X_index)
        X_search = cudf.DataFrame(X_search)

    cu_model = cuKNN(metric=metric, p=p)
    cu_model.fit(X_index)
    D_cuml, I_cuml = cu_model.kneighbors(X_search, k)

    if as_dataframe:
        assert isinstance(D_cuml, cudf.DataFrame)
        assert isinstance(I_cuml, cudf.DataFrame)
        D_cuml_np, I_cuml_np = D_cuml.to_numpy(), I_cuml.to_numpy()
    else:
        assert isinstance(D_cuml, cp.ndarray)
        assert isinstance(I_cuml, cp.ndarray)
        D_cuml_np, I_cuml_np = D_cuml.get(), I_cuml.get()

    with cuml.using_output_type("numpy"):
        # Assert the cuml model was properly reverted
        np.testing.assert_allclose(cu_model.X_m,
                                   X_orig.get(),
                                   atol=1e-3,
                                   rtol=1e-3)

    if metric != 'braycurtis':
        np.testing.assert_allclose(D_cuml_np, D_sk, atol=1e-3, rtol=1e-3)
    else:
        # Braycurtis has a few differences, but this is computed by FAISS.
        # So long as the indices all match below, the small discrepancy
        # should be okay.
        diff = D_cuml_np - D_sk
        assert len(diff[diff > 1e-2]) / X_search.shape[0] < 0.06

    # NOTE(review): this compares two scalars (truthiness of ``.all()``),
    # not the index arrays element by element. Kept to preserve behavior.
    assert I_cuml_np.all() == I_sk.all()
예제 #18
0
def test_score(nrows, ncols, n_neighbors, n_clusters, datatype):
    """Fit on tight blobs and require a training-set score >= 0.996."""
    features, targets = make_blobs(n_samples=nrows,
                                   centers=n_clusters,
                                   n_features=ncols,
                                   random_state=0,
                                   cluster_std=0.01)
    features = features.astype(np.float32)

    if datatype == "dataframe":
        features = cudf.DataFrame.from_gpu_matrix(rmm.to_device(features))
        # Labels become a single-column frame.
        targets = cudf.DataFrame.from_gpu_matrix(
            rmm.to_device(targets.reshape(nrows, 1)))

    model = cuKNN(n_neighbors=n_neighbors)
    model.fit(features, targets)

    min_score = 1.0 - 0.004
    assert model.score(features, targets) >= min_score
예제 #19
0
def test_score_dtype(dtype):
    """Check the prediction dtype and the score for a given ``dtype``.

    Both features and targets are cast to ``dtype``; predictions must
    come back with that dtype and the training score must be >= 0.9999.
    """
    # Using make_blobs here to check averages and neighborhoods
    features, targets = make_blobs(n_samples=1000,
                                   centers=2,
                                   cluster_std=0.01,
                                   n_features=50,
                                   random_state=0)
    features = features.astype(dtype)
    targets = targets.astype(dtype)

    model = cuKNN(n_neighbors=5)
    model.fit(features, targets)

    predicted = model.predict(features)
    assert predicted.dtype == dtype
    assert model.score(features, targets) >= 0.9999
예제 #20
0
def test_knn_return_cumlarray(input_type):
    """Check ``_kneighbors(..., _output_cumlarray=True)`` yields CumlArrays."""
    n_samples, n_feats, k = 50, 50, 5

    X, _ = make_blobs(n_samples=n_samples, n_features=n_feats, random_state=0)
    if input_type == "dataframe":
        X = cudf.DataFrame(X)

    model = cuKNN()
    model.fit(X)

    # NOTE(review): named in the (distances, indices) order that the public
    # ``kneighbors`` calls elsewhere in this file unpack -- presumably
    # ``_kneighbors`` matches; confirm. Only the types are asserted.
    distances, indices = model._kneighbors(X, k, _output_cumlarray=True)

    for result in (distances, indices):
        assert isinstance(result, CumlArray)
예제 #21
0
def test_nearest_neighbors_sparse(shape,
                                  metric,
                                  n_neighbors,
                                  batch_size_index,
                                  batch_size_query):
    """Compare sparse brute-force cuKNN results against scikit-learn.

    ``shape`` unpacks to ``(nrows, ncols, density)``. Two random CSR
    matrices serve as index and query. Distances are always compared;
    indices are compared except for 'jaccard' and 'chebyshev'.
    """
    nrows, ncols, density = shape

    # A single indexed row cannot provide more than one neighbor.
    if nrows == 1 and n_neighbors > 1:
        return

    index_mat = cp.sparse.random(nrows, ncols, format='csr',
                                 density=density, random_state=35)
    query_mat = cp.sparse.random(nrows, ncols, format='csr',
                                 density=density, random_state=38)

    if metric == 'jaccard':
        # Binarise the values, then go back to float32.
        index_mat = index_mat.astype('bool').astype('float32')
        query_mat = query_mat.astype('bool').astype('float32')

    logger.set_level(logger.level_debug)
    batch_params = {"batch_size_index": batch_size_index,
                    "batch_size_query": batch_size_query}
    cu_model = cuKNN(metric=metric, p=2.0, n_neighbors=n_neighbors,
                     algorithm="brute", output_type="numpy",
                     verbose=logger.level_debug,
                     algo_params=batch_params)
    cu_model.fit(index_mat)

    cuD, cuI = cu_model.kneighbors(query_mat)

    # Densify when the metric is not in sklearn's sparse brute list.
    if metric not in sklearn.neighbors.VALID_METRICS_SPARSE['brute']:
        index_mat = index_mat.todense()
        query_mat = query_mat.todense()

    sk_model = skKNN(metric=metric, p=2.0, n_neighbors=n_neighbors,
                     algorithm="brute", n_jobs=-1)
    sk_X = index_mat.get()
    sk_model.fit(sk_X)

    skD, skI = sk_model.kneighbors(query_mat.get())

    cp.testing.assert_allclose(cuD, skD, atol=1e-3, rtol=1e-3)

    # Jaccard & Chebyshev have a high potential for mismatched indices
    # due to duplicate distances. We can ignore the indices in this case.
    if metric not in ['jaccard', 'chebyshev']:
        cp.testing.assert_allclose(cuI, skI, atol=1e-4, rtol=1e-4)
예제 #22
0
def test_return_dists():
    """Check the return shape of ``kneighbors`` for both ``return_distance``
    settings: a single ``(n_samples, k)`` result when False, a 2-tuple when
    True.
    """
    n_samples, n_feats, k = 50, 50, 5

    X, _ = make_blobs(n_samples=n_samples, n_features=n_feats, random_state=0)

    model = cuKNN()
    model.fit(X)

    indices_only = model.kneighbors(X, k, return_distance=False)
    assert not isinstance(indices_only, tuple)
    assert indices_only.shape == (n_samples, k)

    with_distances = model.kneighbors(X, k, return_distance=True)
    assert isinstance(with_distances, tuple)
    assert len(with_distances) == 2
예제 #23
0
def test_score(nrows, ncols, n_neighbors, n_clusters, datatype):
    """Fit on tight blobs with float32 targets; require score >= 0.9999."""
    # Using make_blobs here to check averages and neighborhoods
    features, targets = make_blobs(n_samples=nrows,
                                   centers=n_clusters,
                                   cluster_std=0.01,
                                   n_features=ncols,
                                   random_state=0)
    features = features.astype(np.float32)
    targets = targets.astype(np.float32)

    if datatype == "dataframe":
        features = cudf.DataFrame.from_gpu_matrix(rmm.to_device(features))
        # Targets become a single-column frame.
        targets = cudf.DataFrame.from_gpu_matrix(
            rmm.to_device(targets.reshape(nrows, 1)))

    model = cuKNN(n_neighbors=n_neighbors)
    model.fit(features, targets)

    assert model.score(features, targets) >= 0.9999
예제 #24
0
def test_kneighbors_regressor(n_samples=40,
                              n_features=5,
                              n_test_pts=10,
                              n_neighbors=3,
                              random_state=0):
    """Test k-neighbors regression on slightly perturbed training points.

    Targets are the row norms of uniform data in [-1, 1), scaled by their
    maximum. Predictions for the first ``n_test_pts`` rows, shifted by a
    small offset, must lie within 0.3 of the true targets.
    """
    rng = np.random.RandomState(random_state)
    features = 2 * rng.rand(n_samples, n_features) - 1

    targets = np.sqrt((features ** 2).sum(1))
    targets /= targets.max()
    expected = targets[:n_test_pts]

    model = cuKNN(n_neighbors=n_neighbors)
    model.fit(features, targets)

    # One small offset vector, applied to every query row.
    epsilon = 1E-5 * (2 * rng.rand(1, n_features) - 1)
    queries = features[:n_test_pts] + epsilon
    predicted = model.predict(queries)

    assert np.all(abs(predicted - expected) < 0.3)
예제 #25
0
파일: test_tsne.py 프로젝트: st071300/cuML
def test_tsne_knn_parameters_sparse(type_knn_graph, input_type):
    """Run TSNE on sparse digits data with a precomputed k-NN graph.

    The graph is supplied in its original form and after ``tocoo()`` and
    ``tocsc()`` conversion; every resulting embedding is passed to
    ``check_embedding`` with a 0.85 threshold.

    Parameters
    ----------
    type_knn_graph : str
        ``'sklearn'`` builds the graph with ``skKNN``; any other value
        uses ``cuKNN``.
    input_type : str
        ``'cupy'`` feeds a ``cupyx.scipy.sparse`` CSR matrix to TSNE and
        copies the embedding back with ``.get()``; otherwise a
        ``scipy.sparse`` CSR matrix is used.
    """
    digits = datasets.load_digits()

    if type_knn_graph == 'sklearn':
        neigh = skKNN(n_neighbors=90)
    else:
        neigh = cuKNN(n_neighbors=90)

    # Fixed-seed boolean mask; the rows where it is False are kept.
    digits_selection = np.random.RandomState(42).choice(
        [True, False], 1797, replace=True, p=[0.60, 0.40])

    selected_digits = digits.data[~digits_selection]

    neigh.fit(selected_digits)
    knn_graph = neigh.kneighbors_graph(selected_digits, mode="distance")

    if input_type == 'cupy':
        sp_prefix = cupyx.scipy.sparse
    else:
        sp_prefix = scipy.sparse

    tsne = TSNE(2, n_neighbors=15,
                random_state=1,
                learning_rate=500,
                angle=0.8)

    new_data = sp_prefix.csr_matrix(
        scipy.sparse.csr_matrix(selected_digits))

    # None keeps the graph as returned; the others name a conversion
    # method, called lazily right before each fit.
    for conversion in (None, "tocoo", "tocsc"):
        if conversion is None:
            graph = knn_graph
        else:
            graph = getattr(knn_graph, conversion)()

        Y = tsne.fit_transform(new_data, True, graph)
        if input_type == 'cupy':
            Y = Y.get()
        check_embedding(selected_digits, Y, 0.85)
예제 #26
0
def test_ivfflat_pred(nrows, ncols, n_neighbors, nlist):
    """Fit an "ivfflat" cuKNN index on blobs and check label recovery.

    Parameters
    ----------
    nrows, ncols : int
        Sample and feature counts for ``make_blobs``.
    n_neighbors : int
        Number of neighbors requested from ``kneighbors``.
    nlist : int
        Forwarded as ``nlist``; ``nprobe`` is derived as a quarter of it.
    """
    # nprobe is a count, so pass an integer (as the ivfpq tests do) rather
    # than the float produced by ``nlist * 0.25``.
    algo_params = {'nlist': nlist, 'nprobe': int(nlist * 0.25)}

    X, y = make_blobs(n_samples=nrows,
                      centers=5,
                      n_features=ncols,
                      random_state=0)

    knn_cu = cuKNN(algorithm="ivfflat", algo_params=algo_params)
    knn_cu.fit(X)
    neigh_ind = knn_cu.kneighbors(X,
                                  n_neighbors=n_neighbors,
                                  return_distance=False)
    # Release the index before running the label check.
    del knn_cu
    gc.collect()

    labels, probs = predict(neigh_ind, y, n_neighbors)

    assert array_equal(labels, y)
예제 #27
0
def test_ann_distances_metrics(algo, metric):
    """Compare cuKNN neighbor distances with scikit-learn for a metric.

    Parameters
    ----------
    algo : str
        Value passed as ``algorithm`` to ``cuKNN``.
    metric : str
        Distance metric passed to both ``cuKNN`` and ``skKNN``.
    """
    X, y = make_blobs(n_samples=500, centers=2, n_features=128, random_state=0)

    cu_knn = cuKNN(algorithm=algo, metric=metric)
    cu_knn.fit(X)
    cu_dist, cu_ind = cu_knn.kneighbors(X,
                                        n_neighbors=10,
                                        return_distance=True)
    # Release the index before building the reference.
    del cu_knn
    gc.collect()

    # scikit-learn works on a host copy of the data.
    X = X.get()
    sk_knn = skKNN(metric=metric)
    sk_knn.fit(X)
    sk_dist, sk_ind = sk_knn.kneighbors(X,
                                        n_neighbors=10,
                                        return_distance=True)

    # The comparison used to be *returned*, which pytest ignores, so the
    # test could never fail. Assert it instead.
    # NOTE(review): if approximate algorithms legitimately diverge here,
    # pass an explicit tolerance instead of going back to ``return``.
    assert array_equal(sk_dist, cu_dist)
예제 #28
0
def test_predict_multioutput(datatype):
    """Predict two-column targets with one neighbor.

    Predictions on the training rows must equal the training targets and
    come back as a ``cudf.DataFrame`` for dataframe input, a NumPy array
    otherwise.
    """
    features = np.array([[0, 0, 1], [1, 0, 1]]).astype(np.float32)
    targets = np.array([[15, 2], [5, 4]]).astype(np.int32)

    as_dataframe = datatype == "dataframe"
    if as_dataframe:
        features = cudf.DataFrame.from_gpu_matrix(rmm.to_device(features))
        targets = cudf.DataFrame.from_gpu_matrix(rmm.to_device(targets))

    model = cuKNN(n_neighbors=1)
    model.fit(features, targets)

    predicted = model.predict(features)

    expected_type = cudf.DataFrame if as_dataframe else np.ndarray
    assert isinstance(predicted, expected_type)

    assert array_equal(predicted.astype(np.int32), targets)
예제 #29
0
def test_tsne_knn_parameters_sparse(type_knn_graph, input_type, method):
    """Run TSNE on sparse digits data with a precomputed k-NN graph.

    The graph is supplied in its original form and after ``tocoo()`` and
    ``tocsc()`` conversion; every resulting embedding is passed to
    ``validate_embedding`` with a 0.85 threshold.
    """
    digits = test_datasets["digits"].data

    neigh = cuKNN(n_neighbors=DEFAULT_N_NEIGHBORS,
                  metric="euclidean").fit(digits)
    knn_graph = neigh.kneighbors_graph(digits,
                                       mode="distance").astype('float32')
    if type_knn_graph == 'cuml':
        knn_graph = cupyx.scipy.sparse.csr_matrix(knn_graph)

    on_device = input_type == 'cupy'
    sp_prefix = cupyx.scipy.sparse if on_device else scipy.sparse

    tsne = TSNE(n_components=2,
                n_neighbors=DEFAULT_N_NEIGHBORS,
                random_state=1,
                learning_rate_method='none',
                method=method,
                min_grad_norm=1e-12,
                perplexity=DEFAULT_PERPLEXITY)

    new_data = sp_prefix.csr_matrix(scipy.sparse.csr_matrix(digits))

    # None keeps the graph as built; the others name a conversion method,
    # called lazily right before each fit.
    for conversion in (None, "tocoo", "tocsc"):
        if conversion is None:
            graph = knn_graph
        else:
            graph = getattr(knn_graph, conversion)()

        Y = tsne.fit_transform(new_data, True, graph)
        if on_device:
            Y = Y.get()
        validate_embedding(digits, Y, 0.85)
예제 #30
0
def test_knn_graph(input_type, mode, output_type, as_instance, nrows, n_feats,
                   p, k, metric):
    """Compare the shape and type of cuML and scikit-learn k-NN graphs.

    When ``as_instance`` is true the module-level ``kneighbors_graph``
    functions are used on both sides; otherwise fitted estimators are.
    Only shapes (data, indices, indptr, dense form) and the sparse
    container type are checked, not the values.
    """
    X, _ = make_blobs(n_samples=nrows, n_features=n_feats, random_state=0)

    # Host copy for the scikit-learn side.
    X_host = X.get()
    if as_instance:
        sparse_sk = sklearn.neighbors.kneighbors_graph(X_host,
                                                       k,
                                                       mode=mode,
                                                       metric=metric,
                                                       p=p,
                                                       include_self='auto')
    else:
        sk_model = skKNN(metric=metric, p=p)
        sk_model.fit(X_host)
        sparse_sk = sk_model.kneighbors_graph(X_host, k, mode=mode)

    if input_type == "dataframe":
        X = cudf.DataFrame(X)

    with cuml.using_output_type(output_type):
        if as_instance:
            sparse_cu = cuml.neighbors.kneighbors_graph(X,
                                                        k,
                                                        mode=mode,
                                                        metric=metric,
                                                        p=p,
                                                        include_self='auto')
        else:
            cu_model = cuKNN(metric=metric, p=p)
            cu_model.fit(X)
            sparse_cu = cu_model.kneighbors_graph(X, k, mode=mode)

    # The CSR component arrays must have matching shapes...
    for component in ("data", "indices", "indptr"):
        sk_shape = getattr(sparse_sk, component).shape
        cu_shape = getattr(sparse_cu, component).shape
        assert np.array_equal(sk_shape, cu_shape)
    # ...as must the dense forms.
    assert np.array_equal(sparse_sk.toarray().shape, sparse_cu.toarray().shape)

    if output_type in ('cupy', None):
        assert cupyx.scipy.sparse.isspmatrix_csr(sparse_cu)
    else:
        assert isspmatrix_csr(sparse_cu)