def test_haversine_fails_high_dimensions():
    """Check that a haversine brute-force search rejects 3-column input.

    The haversine metric is defined on (latitude, longitude) pairs, so
    fitting/querying a model with three features per row is expected to
    raise instead of returning distances.
    """
    data = np.array([[0., 1., 2.], [3., 4., 5.]])

    # Without an explicit expectation the test would either error out or
    # pass without checking anything.
    # NOTE(review): ValueError is assumed to be the error type raised for
    # invalid haversine input -- confirm against the estimator.
    with pytest.raises(ValueError):
        cunn = cuKNN(metric='haversine',
                     n_neighbors=2,
                     algorithm='brute')
        cunn.fit(data).kneighbors(data)
def test_neighborhood_predictions(nrows, ncols, n_neighbors, n_clusters,
                                  datatype, algo):
    """Check that neighbor voting recovers the blob labels for ``algo``.

    A model is fitted on blob data, the training points are queried back,
    and the labels derived from the returned neighbor indices (via the
    ``predict`` helper) must equal the generated labels.

    Arguments are expected to come from pytest parametrization that is not
    visible in this chunk.
    """
    if algo == "ivfpq":
        pytest.xfail("See Memory access error in IVFPQ : "
                     "https://github.com/rapidsai/cuml/issues/3318")
    if not has_scipy():
        pytest.skip('Skipping test_neighborhood_predictions because ' +
                    'Scipy is missing')

    X, y = make_blobs(n_samples=nrows, centers=n_clusters,
                      n_features=ncols, random_state=0)

    if datatype == "dataframe":
        X = cudf.DataFrame(X)

    knn_cu = cuKNN(algorithm=algo)
    knn_cu.fit(X)
    neigh_ind = knn_cu.kneighbors(X, n_neighbors=n_neighbors,
                                  return_distance=False)

    # Drop the model (and its index) before the host-side checks.
    del knn_cu
    gc.collect()

    if datatype == "dataframe":
        assert isinstance(neigh_ind, cudf.DataFrame)
        neigh_ind = neigh_ind.as_gpu_matrix().copy_to_host()
    else:
        # ``cp.ndarray`` is the public name of the array class; the
        # ``cp.core.core`` module path is private.
        assert isinstance(neigh_ind, cp.ndarray)

    labels, _ = predict(neigh_ind, y, n_neighbors)

    assert array_equal(labels, y)
def test_ivfpq_pred(nrows, ncols, n_neighbors, nlist, M, n_bits,
                    usePrecomputedTables):
    """Fit an IVFPQ index on blob data and check neighbor-vote labels.

    The training points are queried against the fitted index and the
    labels derived from the returned indices (``predict`` helper) must
    equal the generated blob labels.
    """
    ivfpq_settings = dict(
        nlist=nlist,
        nprobe=int(nlist * 0.2),
        M=M,
        n_bits=n_bits,
        usePrecomputedTables=usePrecomputedTables,
    )

    X, y = make_blobs(n_samples=nrows, centers=5,
                      n_features=ncols, random_state=0)

    model = cuKNN(algorithm="ivfpq", algo_params=ivfpq_settings)
    model.fit(X)
    neighbor_ids = model.kneighbors(X, n_neighbors=n_neighbors,
                                    return_distance=False)

    # Release the model before post-processing the result.
    del model
    gc.collect()

    predicted, _ = predict(neighbor_ids, y, n_neighbors)

    assert array_equal(predicted, y)
def test_cuml_against_sklearn(input_type, nrows, n_feats, k):
    """Compare cuML k-NN distances and indices against scikit-learn.

    Both libraries index and query the same blob data with ``k``
    neighbors. Distances must agree within the tolerance passed to
    ``array_equal`` and the neighbor indices must agree element-wise,
    except at ranks where the reference distances are nearly tied.

    Arguments are expected to come from pytest parametrization that is not
    visible in this chunk.
    """
    tol = 1e-2

    X, _ = make_blobs(n_samples=nrows,
                      n_features=n_feats, random_state=0)

    knn_sk = skKNN(metric="euclidean")
    knn_sk.fit(X)
    D_sk, I_sk = knn_sk.kneighbors(X, k)

    if input_type == "dataframe":
        X = cudf.DataFrame.from_gpu_matrix(rmm.to_device(X))

    knn_cu = cuKNN()
    knn_cu.fit(X)
    D_cuml, I_cuml = knn_cu.kneighbors(X, k)

    if input_type == "dataframe":
        assert isinstance(D_cuml, cudf.DataFrame)
        assert isinstance(I_cuml, cudf.DataFrame)
        D_cuml_arr = D_cuml.as_gpu_matrix().copy_to_host()
        I_cuml_arr = I_cuml.as_gpu_matrix().copy_to_host()
    else:
        assert isinstance(D_cuml, np.ndarray)
        assert isinstance(I_cuml, np.ndarray)
        D_cuml_arr = D_cuml
        I_cuml_arr = I_cuml

    assert array_equal(D_cuml_arr, D_sk, tol, with_sign=True)

    # ``I_cuml_arr.all() == I_sk.all()`` only compared two scalar booleans
    # and therefore never looked at the indices. Compare element-wise, but
    # accept a different index where the ordering is ambiguous: the
    # reference distance is within ``tol`` of an adjacent rank, or the rank
    # is the last one (it may tie with the unseen (k+1)-th neighbor).
    mismatch = I_cuml_arr != I_sk
    near_tie = np.abs(np.diff(D_sk, axis=1)) <= tol
    ambiguous = np.zeros(I_sk.shape, dtype=bool)
    ambiguous[:, :-1] |= near_tie
    ambiguous[:, 1:] |= near_tie
    ambiguous[:, -1] = True
    assert not np.any(mismatch & ~ambiguous)
def test_neighborhood_predictions(nrows, ncols, n_neighbors, n_clusters,
                                  datatype):
    """Check that neighbor voting recovers the generated blob labels.

    The float32 blob data is indexed and queried back; the labels derived
    from the returned indices (``predict`` helper) must equal the
    generated labels. With ``datatype == "dataframe"`` the data goes in as
    a cuDF DataFrame and the result must come back as one.
    """
    use_dataframe = datatype == "dataframe"

    features, targets = make_blobs(n_samples=nrows, centers=n_clusters,
                                   n_features=ncols, random_state=0)
    features = features.astype(np.float32)

    if use_dataframe:
        features = cudf.DataFrame.from_gpu_matrix(rmm.to_device(features))

    model = cuKNN()
    model.fit(features)
    neighbor_ids = model.kneighbors(features, n_neighbors=n_neighbors,
                                    return_distance=False)

    if not use_dataframe:
        assert isinstance(neighbor_ids, np.ndarray)
    else:
        assert isinstance(neighbor_ids, cudf.DataFrame)
        neighbor_ids = neighbor_ids.as_gpu_matrix().copy_to_host()

    predicted, _ = predict(neighbor_ids, targets, n_neighbors)

    assert array_equal(predicted, targets)
def test_ivfsq_pred(qtype, encodeResidual, nrows, ncols, n_neighbors, nlist):
    """Fit an IVFSQ index on blob data and check neighbor-vote labels.

    The training points are queried against the fitted index and the
    labels derived from the returned indices (``predict`` helper) must
    equal the generated blob labels.

    Arguments are expected to come from pytest parametrization that is not
    visible in this chunk.
    """
    algo_params = {
        'nlist': nlist,
        # A probe count is a whole number; cast explicitly (as the IVFPQ
        # tests in this file do) instead of passing a float.
        'nprobe': int(nlist * 0.25),
        'qtype': qtype,
        'encodeResidual': encodeResidual
    }

    X, y = make_blobs(n_samples=nrows, centers=5,
                      n_features=ncols, random_state=0)

    # Note: this raises the logger level for the rest of the session.
    logger.set_level(logger.level_debug)
    knn_cu = cuKNN(algorithm="ivfsq", algo_params=algo_params)
    knn_cu.fit(X)
    neigh_ind = knn_cu.kneighbors(X, n_neighbors=n_neighbors,
                                  return_distance=False)

    # Release the model before post-processing the result.
    del knn_cu
    gc.collect()

    labels, _ = predict(neigh_ind, y, n_neighbors)

    assert array_equal(labels, y)
def test_ivfpq_pred(nrows, ncols, n_neighbors, nlist, M, n_bits,
                    usePrecomputedTables):
    """IVFPQ label-recovery test, currently short-circuited by ``xfail``.

    ``pytest.xfail`` is called unconditionally as the first statement; the
    remaining body is kept so the test can be re-enabled by removing that
    call.
    """
    pytest.xfail("Warning: IVFPQ might be unstable in this "
                 "version of cuML. This is due to a known issue "
                 "in the FAISS release that this cuML version "
                 "is linked to. (see FAISS issue #1421)")

    ivfpq_settings = dict(
        nlist=nlist,
        nprobe=int(nlist * 0.2),
        M=M,
        n_bits=n_bits,
        usePrecomputedTables=usePrecomputedTables,
    )

    points, targets = make_blobs(n_samples=nrows, centers=5,
                                 n_features=ncols, random_state=0)

    index = cuKNN(algorithm="ivfpq", algo_params=ivfpq_settings)
    index.fit(points)
    neighbor_ids = index.kneighbors(points, n_neighbors=n_neighbors,
                                    return_distance=False)

    # Release the model before post-processing the result.
    del index
    gc.collect()

    predicted, _ = predict(neighbor_ids, targets, n_neighbors)

    assert array_equal(predicted, targets)
def test_neighborhood_predictions(nrows, ncols, n_neighbors, n_clusters,
                                  datatype):
    """Check that neighbor voting recovers the generated blob labels.

    Skipped when ``has_scipy()`` reports that SciPy is unavailable.
    Otherwise the float32 blob data is indexed and queried back, and the
    labels derived from the returned indices (``predict`` helper) must
    equal the generated labels.
    """
    if not has_scipy():
        pytest.skip('Skipping test_neighborhood_predictions because ' +
                    'Scipy is missing')

    as_frame = datatype == "dataframe"

    samples, targets = make_blobs(n_samples=nrows, centers=n_clusters,
                                  n_features=ncols, random_state=0)
    samples = samples.astype(np.float32)
    if as_frame:
        samples = cudf.DataFrame(samples)

    estimator = cuKNN()
    estimator.fit(samples)
    neighbor_ids = estimator.kneighbors(samples, n_neighbors=n_neighbors,
                                        return_distance=False)

    if not as_frame:
        assert isinstance(neighbor_ids, np.ndarray)
    else:
        assert isinstance(neighbor_ids, cudf.DataFrame)
        neighbor_ids = neighbor_ids.as_gpu_matrix().copy_to_host()

    predicted, _ = predict(neighbor_ids, targets, n_neighbors)

    assert array_equal(predicted, targets)
def test_haversine(n_neighbors):
    """Compare haversine k-NN distances with ``pairwise_distances``.

    Six city coordinates (degrees, converted to radians) are indexed and
    queried back with the brute-force haversine metric. For every row the
    returned distances must match the ``n_neighbors`` smallest reference
    distances within 1e-4.
    """
    # (latitude, longitude) in degrees.
    cities_deg = np.array([
        [40.745255, -74.034775],     # Hoboken, NJ
        [34.155834, -119.202789],    # Port Hueneme, CA
        [42.933334, -76.566666],     # Auburn, NY
        [29.499722, -95.089722],     # League City, TX
        [30.455000, -84.253334],     # Tallahassee, FL
        [41.763889, -88.29001],      # Aurora, IL
    ])
    data = cities_deg * math.pi / 180

    reference = pairwise_distances(data, metric='haversine')

    model = cuKNN(metric='haversine',
                  n_neighbors=n_neighbors,
                  algorithm='brute')
    dists, _ = model.fit(data).kneighbors(data)

    # Ascending reference distances per query row.
    reference_sorted = np.sort(reference, axis=1)

    for row, expected in enumerate(reference_sorted):
        cp.testing.assert_allclose(expected[:n_neighbors], dists[row],
                                   atol=1e-4, rtol=1e-4)
def test_nn_downcast_fails(input_type, nrows, n_feats):
    """Check that ``fit`` raises when ``convert_dtype=False`` is passed.

    For ``input_type == 'dataframe'`` the model is first fitted on a cuDF
    copy of the data with ``convert_dtype=True``. Fitting the blob output
    with ``convert_dtype=False`` must then raise, and so must fitting a
    float64 array holding the largest float32 value.
    """
    features, _ = make_blobs(n_samples=nrows,
                             n_features=n_feats, random_state=0)

    model = cuKNN()
    if input_type == 'dataframe':
        columns = {}
        for col in range(features.shape[1]):
            columns['fea%d' % col] = features[:, col]
        frame = cudf.DataFrame.from_pandas(pd.DataFrame(columns))
        model.fit(frame, convert_dtype=True)

    with pytest.raises(Exception):
        model.fit(features, convert_dtype=False)

    # Test fit() fails when downcast corrupted data
    overflow = np.array([[np.finfo(np.float32).max]], dtype=np.float64)

    fresh_model = cuKNN()
    with pytest.raises(Exception):
        fresh_model.fit(overflow, convert_dtype=False)
def test_nn_downcast_fails(input_type):
    """Check that ``fit`` raises for two float64 down-cast scenarios.

    Scenario 1: double-precision data with ``should_downcast=False``.
    Scenario 2: a value equal to the float32 maximum with
    ``should_downcast=True``. With ``input_type == 'dataframe'`` the data
    is wrapped in a cuDF DataFrame first.
    """
    scenarios = (
        # fit() fails with double precision when should_downcast is False
        (np.array([[1.0], [50.0], [51.0]], dtype=np.float64), False),
        # fit() fails when downcast corrupted data
        (np.array([[np.finfo(np.float32).max]], dtype=np.float64), True),
    )

    for host_data, downcast in scenarios:
        model = cuKNN()

        data = host_data
        if input_type == 'dataframe':
            data = cudf.DataFrame.from_pandas(pd.DataFrame(host_data))

        with pytest.raises(Exception):
            model.fit(data, should_downcast=downcast)
def test_nonmonotonic_labels():
    """Check 1-NN prediction with labels that are not 0..n-1 in order.

    Two points labelled 15 and 5 are fitted; predicting the same points
    with a single neighbor must return those labels.
    """
    points = np.array([[0, 0, 1],
                       [1, 0, 1]]).astype(np.float32)
    targets = np.array([15, 5]).astype(np.int32)

    model = cuKNN(n_neighbors=1)
    model.fit(points, targets)

    predicted = model.predict(points)

    assert array_equal(predicted.astype(np.int32), targets)
def test_self_neighboring(datatype, metric_p, nrows):
    """Test that searches using an indexed vector itself return sensible
    results for that vector

    For L2-derived metrics, this specifically exercises the slow
    high-precision mode used to correct for approximation errors in L2
    computation during NN searches.

    ``metric_p`` is unpacked into ``(metric, p)``; only ``metric`` is passed
    to the estimator. Every point's first neighbor must be the point itself
    at a distance of zero (within 1e-4).
    """
    ncols = 1000
    n_clusters = 10
    n_neighbors = 3

    metric, p = metric_p

    if not has_scipy():
        # The message used to name test_neighborhood_predictions, which
        # made skip reports point at the wrong test.
        pytest.skip('Skipping test_self_neighboring because ' +
                    'Scipy is missing')

    X, y = make_blobs(n_samples=nrows, centers=n_clusters,
                      n_features=ncols, random_state=0)

    if datatype == "dataframe":
        X = cudf.DataFrame(X)

    knn_cu = cuKNN(metric=metric, n_neighbors=n_neighbors)
    knn_cu.fit(X)
    neigh_dist, neigh_ind = knn_cu.kneighbors(X, n_neighbors=n_neighbors,
                                              return_distance=True,
                                              two_pass_precision=True)

    if datatype == 'dataframe':
        assert isinstance(neigh_ind, cudf.DataFrame)
        neigh_ind = neigh_ind.to_numpy()
        neigh_dist = neigh_dist.to_numpy()
    else:
        assert isinstance(neigh_ind, cp.ndarray)
        neigh_ind = neigh_ind.get()
        neigh_dist = neigh_dist.get()

    # Only the closest neighbor of each query is inspected.
    neigh_ind = neigh_ind[:, 0]
    neigh_dist = neigh_dist[:, 0]

    assert_array_equal(
        neigh_ind,
        np.arange(0, neigh_dist.shape[0]),
    )
    assert_allclose(
        neigh_dist,
        np.zeros(neigh_dist.shape, dtype=neigh_dist.dtype),
        atol=1e-4
    )
def test_score(nrows, ncols, n_neighbors, n_clusters, datatype):
    """Check that ``score`` on held-out blob data is at least 0.996.

    Tight blobs (``cluster_std=0.01``) are split by the
    ``_build_train_test_data`` helper; the model is fitted on the training
    part and scored on the test part.
    """
    min_score = 1.0 - 0.004

    features, targets = make_blobs(n_samples=nrows, centers=n_clusters,
                                   n_features=ncols, random_state=0,
                                   cluster_std=0.01)
    features = features.astype(np.float32)

    split = _build_train_test_data(features, targets, datatype)
    X_train, X_test, y_train, y_test = split

    model = cuKNN(n_neighbors=n_neighbors)
    model.fit(X_train, y_train)

    assert model.score(X_test, y_test) >= min_score
def test_tsne_knn_graph_used(dataset, type_knn_graph, method):
    """Check that TSNE actually uses a precomputed k-NN graph.

    An embedding built from the real k-NN graph of ``dataset.data`` is
    compared with embeddings built from a graph queried with an all-ones
    matrix; the trustworthiness must drop by more than 0.15 for each of
    three consecutive fits on the garbage graph.
    """
    X = dataset.data
    tsne_options = dict(random_state=1,
                        n_neighbors=DEFAULT_N_NEIGHBORS,
                        method=method,
                        perplexity=DEFAULT_PERPLEXITY,
                        learning_rate_method='none',
                        min_grad_norm=1e-12)
    to_device_graph = type_knn_graph == 'cuml'

    neigh = cuKNN(n_neighbors=DEFAULT_N_NEIGHBORS,
                  metric="euclidean").fit(X)

    good_graph = neigh.kneighbors_graph(X, mode="distance")
    good_graph = good_graph.astype('float32')
    if to_device_graph:
        good_graph = cupyx.scipy.sparse.csr_matrix(good_graph)

    # Perform tsne with normal knn_graph
    tsne = TSNE(**tsne_options)
    embedding = tsne.fit_transform(X, True, good_graph)
    trust_normal = trustworthiness(X, embedding,
                                   n_neighbors=DEFAULT_N_NEIGHBORS)

    # Graph obtained by querying the fitted model with constant rows.
    X_garbage = np.ones(X.shape)
    bad_graph = neigh.kneighbors_graph(X_garbage, mode="distance")
    bad_graph = bad_graph.astype('float32')
    if to_device_graph:
        bad_graph = cupyx.scipy.sparse.csr_matrix(bad_graph)

    # Perform tsne with garbage knn_graph, three times on one estimator
    tsne = TSNE(**tsne_options)
    for _ in range(3):
        embedding = tsne.fit_transform(X, True, bad_graph)
        trust_garbage = trustworthiness(X, embedding,
                                        n_neighbors=DEFAULT_N_NEIGHBORS)
        assert (trust_normal - trust_garbage) > 0.15
def test_nearest_neighbors_rbc(distance, n_neighbors, nrows):
    """Compare the "rbc" algorithm with a brute-force reference.

    The first half of the blob data is queried against an index built on
    all of it. For ``euclidean`` the reference comes from sorting a full
    ``cuPW(..., metric="l2")`` matrix, otherwise from a brute-force
    ``cuKNN``. At most one element may differ in the distances and in the
    row-sorted indices.
    """
    X, y = make_blobs(n_samples=nrows, centers=25, shuffle=True,
                      n_features=2, cluster_std=3.0, random_state=42)

    ball_cover = cuKNN(metric=distance, algorithm="rbc")
    ball_cover.fit(X)

    query_rows = int(nrows / 2)
    queries = X[:query_rows, :]

    rbc_d, rbc_i = ball_cover.kneighbors(queries, n_neighbors=n_neighbors)

    if distance != 'euclidean':
        reference = cuKNN(metric=distance, algorithm="brute")
        reference.fit(X)
        brute_d, brute_i = reference.kneighbors(queries,
                                                n_neighbors=n_neighbors)
    else:
        # Need to use unexpanded euclidean distance
        pw_dists = cuPW(X, metric="l2")
        brute_i = cp.argsort(pw_dists, axis=1)[:query_rows, :n_neighbors]
        brute_d = cp.sort(pw_dists, axis=1)[:query_rows, :n_neighbors]

    # Compare neighbor sets per row, independent of their order.
    rbc_i = cp.sort(rbc_i, axis=1)
    brute_i = cp.sort(brute_i, axis=1)

    # TODO: These are failing with 1 or 2 mismatched elements
    # for very small values of k:
    # https://github.com/rapidsai/cuml/issues/4262
    differing_distances = brute_d[brute_d != rbc_d]
    differing_indices = brute_i[brute_i != rbc_i]
    assert len(differing_distances) <= 1
    assert len(differing_indices) <= 1
def test_knn_separate_index_search(input_type, nrows, n_feats, k, metric):
    """Compare cuML and scikit-learn when index and query sets differ.

    The first 100 blob rows are indexed and rows from 101 onward are used
    as queries (row 100 belongs to neither set). Distances must agree
    within tolerance (a looser, fraction-based check for ``braycurtis``),
    the fitted data exposed as ``X_m`` must equal the indexed rows, and the
    neighbor indices must agree element-wise except where the reference
    distances are nearly tied.

    Arguments are expected to come from pytest parametrization that is not
    visible in this chunk.
    """
    tol = 1e-3

    X, _ = make_blobs(n_samples=nrows, n_features=n_feats, random_state=0)

    X_index = X[:100]
    X_search = X[101:]

    p = 5  # Testing 5-norm of the minkowski metric only
    knn_sk = skKNN(metric=metric, p=p)  # Testing
    knn_sk.fit(X_index.get())
    D_sk, I_sk = knn_sk.kneighbors(X_search.get(), k)

    X_orig = X_index

    if input_type == "dataframe":
        X_index = cudf.DataFrame(X_index)
        X_search = cudf.DataFrame(X_search)

    knn_cu = cuKNN(metric=metric, p=p)
    knn_cu.fit(X_index)
    D_cuml, I_cuml = knn_cu.kneighbors(X_search, k)

    if input_type == "dataframe":
        assert isinstance(D_cuml, cudf.DataFrame)
        assert isinstance(I_cuml, cudf.DataFrame)
        D_cuml_np = D_cuml.to_numpy()
        I_cuml_np = I_cuml.to_numpy()
    else:
        assert isinstance(D_cuml, cp.ndarray)
        assert isinstance(I_cuml, cp.ndarray)
        D_cuml_np = D_cuml.get()
        I_cuml_np = I_cuml.get()

    with cuml.using_output_type("numpy"):
        # Assert the cuml model was properly reverted
        np.testing.assert_allclose(knn_cu.X_m, X_orig.get(),
                                   atol=tol, rtol=tol)

    if metric == 'braycurtis':
        diff = D_cuml_np - D_sk
        # Braycurtis has a few differences, but this is computed by FAISS.
        # So long as the indices all match below, the small discrepancy
        # should be okay.
        assert len(diff[diff > 1e-2]) / X_search.shape[0] < 0.06
    else:
        np.testing.assert_allclose(D_cuml_np, D_sk, atol=tol, rtol=tol)

    # ``I_cuml_np.all() == I_sk.all()`` only compared two scalar booleans
    # and therefore never looked at the indices. Compare element-wise, but
    # accept a different index where the ordering is ambiguous: the
    # reference distance is within ``tol`` of an adjacent rank, or the rank
    # is the last one (it may tie with the unseen (k+1)-th neighbor).
    mismatch = I_cuml_np != I_sk
    near_tie = np.abs(np.diff(D_sk, axis=1)) <= tol
    ambiguous = np.zeros(I_sk.shape, dtype=bool)
    ambiguous[:, :-1] |= near_tie
    ambiguous[:, 1:] |= near_tie
    ambiguous[:, -1] = True
    assert not np.any(mismatch & ~ambiguous)
def test_score(nrows, ncols, n_neighbors, n_clusters, datatype):
    """Check that ``score`` on the training data is at least 0.996.

    Tight blobs (``cluster_std=0.01``) are fitted and scored on the same
    data. With ``datatype == "dataframe"`` both features and a one-column
    label frame are passed as cuDF DataFrames.
    """
    min_score = 1.0 - 0.004

    features, targets = make_blobs(n_samples=nrows, centers=n_clusters,
                                   n_features=ncols, random_state=0,
                                   cluster_std=0.01)
    features = features.astype(np.float32)

    if datatype == "dataframe":
        label_column = targets.reshape(nrows, 1)
        features = cudf.DataFrame.from_gpu_matrix(rmm.to_device(features))
        targets = cudf.DataFrame.from_gpu_matrix(
            rmm.to_device(label_column))

    model = cuKNN(n_neighbors=n_neighbors)
    model.fit(features, targets)

    assert model.score(features, targets) >= min_score
def test_score_dtype(dtype):
    """Check that predictions keep ``dtype`` and the score is >= 0.9999.

    Both features and targets of a two-blob dataset are cast to ``dtype``
    before fitting; the model is evaluated on its own training data.
    """
    # Using make_blobs here to check averages and neighborhoods
    features, targets = make_blobs(n_samples=1000, centers=2,
                                   cluster_std=0.01,
                                   n_features=50, random_state=0)
    features = features.astype(dtype)
    targets = targets.astype(dtype)

    model = cuKNN(n_neighbors=5)
    model.fit(features, targets)

    predictions = model.predict(features)
    assert predictions.dtype == dtype

    assert model.score(features, targets) >= 0.9999
def test_knn_return_cumlarray(input_type):
    """Check that ``_kneighbors(..., _output_cumlarray=True)`` returns
    two ``CumlArray`` objects, for array and cuDF DataFrame input alike.
    """
    n_samples, n_feats, k = 50, 50, 5

    data, _ = make_blobs(n_samples=n_samples,
                         n_features=n_feats, random_state=0)
    if input_type == "dataframe":
        data = cudf.DataFrame(data)

    model = cuKNN()
    model.fit(data)

    # Unpacking also verifies that exactly two values are returned.
    first, second = model._kneighbors(data, k, _output_cumlarray=True)

    assert isinstance(first, CumlArray)
    assert isinstance(second, CumlArray)
def test_nearest_neighbors_sparse(shape,
                                  metric,
                                  n_neighbors,
                                  batch_size_index,
                                  batch_size_query):
    """Compare sparse brute-force k-NN with scikit-learn.

    Two random CSR matrices of the given ``shape`` (``(nrows, ncols,
    density)``) serve as index and query. Distances must match within
    1e-3; indices are compared too, except for metrics noted below as
    prone to duplicate distances. Returns early when a single-row index is
    combined with more than one neighbor.

    Arguments are expected to come from pytest parametrization that is not
    visible in this chunk.
    """
    nrows, ncols, density = shape

    if nrows == 1 and n_neighbors > 1:
        return

    # Use ``cupyx.scipy.sparse`` (as the rest of this file does) rather
    # than the legacy ``cp.sparse`` alias.
    a = cupyx.scipy.sparse.random(nrows, ncols, format='csr',
                                  density=density, random_state=35)
    b = cupyx.scipy.sparse.random(nrows, ncols, format='csr',
                                  density=density, random_state=38)

    if metric == 'jaccard':
        # Binarize the data while keeping a float32 payload.
        a = a.astype('bool').astype('float32')
        b = b.astype('bool').astype('float32')

    # Note: this raises the logger level for the rest of the session.
    logger.set_level(logger.level_debug)
    nn = cuKNN(metric=metric, p=2.0, n_neighbors=n_neighbors,
               algorithm="brute", output_type="numpy",
               verbose=logger.level_debug,
               algo_params={"batch_size_index": batch_size_index,
                            "batch_size_query": batch_size_query})
    nn.fit(a)

    cuD, cuI = nn.kneighbors(b)

    # Fall back to dense input for metrics sklearn cannot run on sparse
    # data with its brute-force algorithm.
    if metric not in sklearn.neighbors.VALID_METRICS_SPARSE['brute']:
        a = a.todense()
        b = b.todense()

    sknn = skKNN(metric=metric, p=2.0, n_neighbors=n_neighbors,
                 algorithm="brute", n_jobs=-1)
    sk_X = a.get()
    sknn.fit(sk_X)

    skD, skI = sknn.kneighbors(b.get())

    cp.testing.assert_allclose(cuD, skD, atol=1e-3, rtol=1e-3)

    # Jaccard & Chebyshev have a high potential for mismatched indices
    # due to duplicate distances. We can ignore the indices in this case.
    if metric not in ['jaccard', 'chebyshev']:
        cp.testing.assert_allclose(cuI, skI, atol=1e-4, rtol=1e-4)
def test_return_dists():
    """Check the return shape of ``kneighbors`` for both values of
    ``return_distance``.

    Without distances a single ``(n_samples, k)`` object is returned;
    with distances the result is a 2-tuple.
    """
    n_samples, n_feats, k = 50, 50, 5

    data, _ = make_blobs(n_samples=n_samples,
                         n_features=n_feats, random_state=0)

    model = cuKNN()
    model.fit(data)

    indices_only = model.kneighbors(data, k, return_distance=False)
    assert not isinstance(indices_only, tuple)
    assert indices_only.shape == (n_samples, k)

    with_distances = model.kneighbors(data, k, return_distance=True)
    assert isinstance(with_distances, tuple)
    assert len(with_distances) == 2
def test_score(nrows, ncols, n_neighbors, n_clusters, datatype):
    """Check that ``score`` on the training data is at least 0.9999.

    Features and targets of tight blobs (``cluster_std=0.01``) are cast to
    float32. With ``datatype == "dataframe"`` both are passed as cuDF
    DataFrames, the targets as a single column.
    """
    # Using make_blobs here to check averages and neighborhoods
    features, targets = make_blobs(n_samples=nrows, centers=n_clusters,
                                   cluster_std=0.01,
                                   n_features=ncols, random_state=0)
    features = features.astype(np.float32)
    targets = targets.astype(np.float32)

    if datatype == "dataframe":
        target_column = targets.reshape(nrows, 1)
        features = cudf.DataFrame.from_gpu_matrix(rmm.to_device(features))
        targets = cudf.DataFrame.from_gpu_matrix(
            rmm.to_device(target_column))

    model = cuKNN(n_neighbors=n_neighbors)
    model.fit(features, targets)

    assert model.score(features, targets) >= 0.9999
def test_kneighbors_regressor(n_samples=40,
                              n_features=5,
                              n_test_pts=10,
                              n_neighbors=3,
                              random_state=0):
    """Test k-neighbors regression on normalized vector norms.

    Targets are the Euclidean norms of random points in [-1, 1), scaled so
    the largest is 1. Predictions for slightly perturbed copies of the
    first ``n_test_pts`` points must be within 0.3 of their targets.
    """
    rng = np.random.RandomState(random_state)

    # The two rng draws (data first, perturbation later) must stay in
    # this order to keep the generated values unchanged.
    points = 2 * rng.rand(n_samples, n_features) - 1
    norms = np.sqrt((points ** 2).sum(1))
    norms /= norms.max()

    expected = norms[:n_test_pts]

    regressor = cuKNN(n_neighbors=n_neighbors)
    regressor.fit(points, norms)

    perturbation = 1E-5 * (2 * rng.rand(1, n_features) - 1)
    predicted = regressor.predict(points[:n_test_pts] + perturbation)

    assert np.all(abs(predicted - expected) < 0.3)
def test_tsne_knn_parameters_sparse(type_knn_graph, input_type):
    """Check TSNE on sparse input with a precomputed k-NN graph.

    A pseudo-random subset of the digits data is embedded three times,
    passing the k-NN graph as returned by ``kneighbors_graph`` and then
    converted with ``tocoo()`` and ``tocsc()``. Every embedding must pass
    ``check_embedding`` with a threshold of 0.85.

    Arguments are expected to come from pytest parametrization that is not
    visible in this chunk.
    """
    # A stray bare ``datasets`` expression statement (a no-op) used to
    # precede this line.
    digits = datasets.load_digits()

    if type_knn_graph == 'sklearn':
        neigh = skKNN(n_neighbors=90)
    else:
        neigh = cuKNN(n_neighbors=90)

    digits_selection = np.random.RandomState(42).choice(
        [True, False], 1797, replace=True, p=[0.60, 0.40])

    # Rows where the mask is False are the ones kept.
    selected_digits = digits.data[~digits_selection]

    neigh.fit(selected_digits)
    knn_graph = neigh.kneighbors_graph(selected_digits, mode="distance")

    if input_type == 'cupy':
        sp_prefix = cupyx.scipy.sparse
    else:
        sp_prefix = scipy.sparse

    tsne = TSNE(2,
                n_neighbors=15,
                random_state=1,
                learning_rate=500,
                angle=0.8)

    new_data = sp_prefix.csr_matrix(
        scipy.sparse.csr_matrix(selected_digits))

    # Convert lazily so each format is built right before its fit, as in
    # the original straight-line code.
    for to_format in (None, "tocoo", "tocsc"):
        if to_format is None:
            graph = knn_graph
        else:
            graph = getattr(knn_graph, to_format)()

        Y = tsne.fit_transform(new_data, True, graph)
        if input_type == 'cupy':
            Y = Y.get()
        check_embedding(selected_digits, Y, 0.85)
def test_ivfflat_pred(nrows, ncols, n_neighbors, nlist):
    """Fit an IVFFlat index on blob data and check neighbor-vote labels.

    The training points are queried against the fitted index and the
    labels derived from the returned indices (``predict`` helper) must
    equal the generated blob labels.

    Arguments are expected to come from pytest parametrization that is not
    visible in this chunk.
    """
    algo_params = {
        'nlist': nlist,
        # A probe count is a whole number; cast explicitly (as the IVFPQ
        # tests in this file do) instead of passing a float.
        'nprobe': int(nlist * 0.25)
    }

    X, y = make_blobs(n_samples=nrows, centers=5,
                      n_features=ncols, random_state=0)

    knn_cu = cuKNN(algorithm="ivfflat", algo_params=algo_params)
    knn_cu.fit(X)
    neigh_ind = knn_cu.kneighbors(X, n_neighbors=n_neighbors,
                                  return_distance=False)

    # Release the model before post-processing the result.
    del knn_cu
    gc.collect()

    labels, _ = predict(neigh_ind, y, n_neighbors)

    assert array_equal(labels, y)
def test_ann_distances_metrics(algo, metric):
    """Compare k-NN distances of ``algo`` with scikit-learn for ``metric``.

    The same 500-point blob data is indexed and queried (10 neighbors) by
    both libraries and the distance matrices are compared with
    ``array_equal``.

    Arguments are expected to come from pytest parametrization that is not
    visible in this chunk.
    """
    X, y = make_blobs(n_samples=500, centers=2,
                      n_features=128, random_state=0)

    cu_knn = cuKNN(algorithm=algo, metric=metric)
    cu_knn.fit(X)
    cu_dist, cu_ind = cu_knn.kneighbors(X, n_neighbors=10,
                                        return_distance=True)

    # Release the model before running the CPU reference.
    del cu_knn
    gc.collect()

    X = X.get()
    sk_knn = skKNN(metric=metric)
    sk_knn.fit(X)
    sk_dist, sk_ind = sk_knn.kneighbors(X, n_neighbors=10,
                                        return_distance=True)

    # The comparison used to be *returned*; pytest ignores a test's return
    # value, so the test could never fail. Assert it instead.
    assert array_equal(sk_dist, cu_dist)
def test_predict_multioutput(datatype):
    """Check 1-NN prediction with two target columns per sample.

    Predicting the training points must return the training targets; the
    result type must follow the input type (cuDF DataFrame or NumPy
    array).
    """
    as_frame = datatype == "dataframe"

    points = np.array([[0, 0, 1],
                       [1, 0, 1]]).astype(np.float32)
    targets = np.array([[15, 2],
                        [5, 4]]).astype(np.int32)

    if as_frame:
        points = cudf.DataFrame.from_gpu_matrix(rmm.to_device(points))
        targets = cudf.DataFrame.from_gpu_matrix(rmm.to_device(targets))

    model = cuKNN(n_neighbors=1)
    model.fit(points, targets)

    predicted = model.predict(points)

    expected_type = cudf.DataFrame if as_frame else np.ndarray
    assert isinstance(predicted, expected_type)

    assert array_equal(predicted.astype(np.int32), targets)
def test_tsne_knn_parameters_sparse(type_knn_graph, input_type, method):
    """Check TSNE on sparse digits data with a precomputed k-NN graph.

    The float32 distance graph is passed as produced (optionally wrapped
    in a cupyx CSR matrix) and then converted with ``tocoo()`` and
    ``tocsc()``. Every embedding must pass ``validate_embedding`` with a
    threshold of 0.85.
    """
    digits = test_datasets["digits"].data
    on_device = input_type == 'cupy'

    neigh = cuKNN(n_neighbors=DEFAULT_N_NEIGHBORS,
                  metric="euclidean").fit(digits)
    knn_graph = neigh.kneighbors_graph(digits, mode="distance")
    knn_graph = knn_graph.astype('float32')

    if type_knn_graph == 'cuml':
        knn_graph = cupyx.scipy.sparse.csr_matrix(knn_graph)

    sp_prefix = cupyx.scipy.sparse if on_device else scipy.sparse

    tsne = TSNE(n_components=2,
                n_neighbors=DEFAULT_N_NEIGHBORS,
                random_state=1,
                learning_rate_method='none',
                method=method,
                min_grad_norm=1e-12,
                perplexity=DEFAULT_PERPLEXITY)

    new_data = sp_prefix.csr_matrix(scipy.sparse.csr_matrix(digits))

    # Convert lazily so each format is built right before its fit.
    for to_format in (None, "tocoo", "tocsc"):
        if to_format is None:
            graph = knn_graph
        else:
            graph = getattr(knn_graph, to_format)()

        embedding = tsne.fit_transform(new_data, True, graph)
        if on_device:
            embedding = embedding.get()
        validate_embedding(digits, embedding, 0.85)
def test_knn_graph(input_type, mode, output_type, as_instance,
                   nrows, n_feats, p, k, metric):
    """Compare the structure of cuML and scikit-learn k-NN graphs.

    When ``as_instance`` is true the module-level ``kneighbors_graph``
    functions are used, otherwise the estimator methods. The shapes of
    ``data``, ``indices``, ``indptr`` and the dense form must match, and
    the cuML result must be a CSR matrix of the kind that ``output_type``
    selects.
    """
    X, _ = make_blobs(n_samples=nrows,
                      n_features=n_feats, random_state=0)
    X_host = X.get()

    if not as_instance:
        knn_sk = skKNN(metric=metric, p=p)
        knn_sk.fit(X_host)
        sparse_sk = knn_sk.kneighbors_graph(X.get(), k, mode=mode)
    else:
        sparse_sk = sklearn.neighbors.kneighbors_graph(
            X_host, k, mode=mode, metric=metric, p=p, include_self='auto')

    if input_type == "dataframe":
        X = cudf.DataFrame(X)

    with cuml.using_output_type(output_type):
        if not as_instance:
            knn_cu = cuKNN(metric=metric, p=p)
            knn_cu.fit(X)
            sparse_cu = knn_cu.kneighbors_graph(X, k, mode=mode)
        else:
            sparse_cu = cuml.neighbors.kneighbors_graph(
                X, k, mode=mode, metric=metric, p=p, include_self='auto')

    # Only the structure is compared, not the stored values.
    assert np.array_equal(sparse_sk.data.shape, sparse_cu.data.shape)
    assert np.array_equal(sparse_sk.indices.shape, sparse_cu.indices.shape)
    assert np.array_equal(sparse_sk.indptr.shape, sparse_cu.indptr.shape)
    assert np.array_equal(sparse_sk.toarray().shape,
                          sparse_cu.toarray().shape)

    expects_device_matrix = output_type == 'cupy' or output_type is None
    if expects_device_matrix:
        assert cupyx.scipy.sparse.isspmatrix_csr(sparse_cu)
    else:
        assert isspmatrix_csr(sparse_cu)