def _load_dataset(dataset, n_rows):
    """Build a CuPy feature matrix and label vector for the named dataset.

    ``"make_blobs"`` generates ``n_rows`` synthetic samples directly. Any
    other name loads a reference dataset (``"digits"``, otherwise iris),
    repeats every row ``ceil(n_rows / len(data))`` times -- so the result
    can hold more than ``n_rows`` rows -- and adds standard-normal noise to
    the features.

    Returns
    -------
    tuple
        ``(X, y)`` as CuPy arrays.
    """
    if dataset == "make_blobs":
        blob_feats, blob_labels = make_blobs(n_samples=n_rows,
                                             n_features=10,
                                             centers=200,
                                             cluster_std=0.8,
                                             random_state=42)
        return cp.asarray(blob_feats), cp.asarray(blob_labels)

    if dataset == "digits":
        feats, labels = load_digits(return_X_y=True)
    else:
        # Every other name is treated as "iris".
        feats, labels = load_iris(return_X_y=True)

    feats = cp.asarray(feats)
    labels = cp.asarray(labels)

    # Row-wise repetition inflates the small dataset to at least n_rows.
    feats = feats.repeat(math.ceil(n_rows / len(feats)), axis=0)
    labels = labels.repeat(math.ceil(n_rows / len(labels)), axis=0)

    # Perturb the (now duplicated) rows with gaussian noise, in place.
    feats += cp.random.standard_normal(feats.shape, dtype=cp.float32)

    return feats, labels
def test_score(nrows, ncols, nclusters):
    """Check ``KMeans.score`` against a hand-computed value.

    The expected score is the negated sum of squared Euclidean distances
    between each sample and the center of the cluster it was assigned to.
    It must agree with ``score(X)`` to within ``SCORE_EPS``.
    """
    # The generated labels are not needed here.
    X, _ = make_blobs(nrows, ncols, nclusters,
                      cluster_std=0.01, random_state=10)

    cuml_kmeans = cuml.KMeans(verbose=1, init="k-means||",
                              n_clusters=nclusters,
                              random_state=10)
    cuml_kmeans.fit(X)

    actual_score = cuml_kmeans.score(X)
    predictions = cuml_kmeans.predict(X)
    centers = cp.array(cuml_kmeans.cluster_centers_.as_gpu_matrix())

    expected_score = 0
    for idx, label in enumerate(predictions):
        # Accumulate the squared distance directly: taking the square root
        # only to square it again adds work and rounding error.
        expected_score += np.sum((X[idx] - centers[label]) ** 2)

    assert actual_score + SCORE_EPS \
        >= (-1 * expected_score) \
        >= actual_score - SCORE_EPS
def test_single_linkage_sklearn_compare(nrows, ncols, nclusters,
                                        k, connectivity):
    """Compare cuML single-linkage clustering with scikit-learn's result.

    Both estimators are fitted on the same blobs; the partitions must be
    identical (adjusted Rand score of exactly 1.0) and the fitted summary
    attributes must match.
    """
    X, _ = make_blobs(int(nrows),
                      ncols,
                      nclusters,
                      cluster_std=1.0,
                      shuffle=False,
                      random_state=42)

    cuml_agg = AgglomerativeClustering(n_clusters=nclusters,
                                       affinity='euclidean',
                                       linkage='single',
                                       n_neighbors=k,
                                       connectivity=connectivity)

    # A failing first fit is retried once; a second failure propagates.
    try:
        cuml_agg.fit(X)
    except Exception:
        cuml_agg.fit(X)

    sk_agg = cluster.AgglomerativeClustering(n_clusters=nclusters,
                                             affinity='euclidean',
                                             linkage='single')
    sk_agg.fit(cp.asnumpy(X))

    # The grouping has to be exact even if the label ids are permuted.
    assert adjusted_rand_score(cuml_agg.labels_, sk_agg.labels_) == 1.0

    for attr in ("n_connected_components_", "n_leaves_", "n_clusters_"):
        assert getattr(cuml_agg, attr) == getattr(sk_agg, attr)
def test_neighborhood_predictions(nrows, ncols, n_neighbors, n_clusters,
                                  datatype, algo):
    """Check that labels derived from kNN indices match the blob labels.

    The neighbour indices returned by ``cuKNN.kneighbors`` are passed to
    the ``predict`` helper, whose labels must equal the generated ones.
    The output container type is checked for both input datatypes.
    """
    if not has_scipy():
        pytest.skip('Skipping test_neighborhood_predictions because ' +
                    'Scipy is missing')

    X, y = make_blobs(n_samples=nrows, centers=n_clusters,
                      n_features=ncols, random_state=0)

    use_dataframe = datatype == "dataframe"
    if use_dataframe:
        X = cudf.DataFrame(X)

    knn_cu = cuKNN(algorithm=algo)
    knn_cu.fit(X)
    neigh_ind = knn_cu.kneighbors(X, n_neighbors=n_neighbors,
                                  return_distance=False)

    # Drop the model before the prediction step.
    del knn_cu
    gc.collect()

    if use_dataframe:
        assert isinstance(neigh_ind, cudf.DataFrame)
        neigh_ind = neigh_ind.to_numpy()
    else:
        assert isinstance(neigh_ind, cp.ndarray)

    labels, _ = predict(neigh_ind, y, n_neighbors)

    assert array_equal(labels, y)
def test_weighted_kmeans(nrows, ncols, nclusters, max_weight, random_state):
    """Compare weighted cuML KMeans with scikit-learn via their scores.

    Both models are fitted with the same integer sample weights; their
    ``score(X)`` values may differ by at most ``1.5 * cluster_std``.
    """
    # Fairly high variance between points in clusters.
    cluster_std = 1.0
    tolerance = cluster_std * 1.5

    np.random.seed(random_state)

    # One integer weight per sample, drawn from [1, max_weight).
    wt = np.random.randint(1, high=max_weight, size=nrows)

    X, _ = make_blobs(nrows,
                      ncols,
                      nclusters,
                      cluster_std=cluster_std,
                      shuffle=False,
                      random_state=0)

    cuml_kmeans = cuml.KMeans(init="k-means++",
                              n_clusters=nclusters,
                              n_init=10,
                              random_state=random_state,
                              output_type='numpy')
    cuml_kmeans.fit(X, sample_weight=wt)
    cu_score = cuml_kmeans.score(X)

    sk_kmeans = cluster.KMeans(random_state=random_state,
                               n_clusters=nclusters)
    sk_kmeans.fit(cp.asnumpy(X), sample_weight=wt)
    sk_score = sk_kmeans.score(cp.asnumpy(X))

    assert abs(cu_score - sk_score) <= tolerance
def test_neighborhood_predictions(nrows, ncols, n_neighbors, n_clusters,
                                  datatype, algo):
    """Check that labels derived from kNN indices match the blob labels.

    The ``"ivfpq"`` algorithm is marked as an expected failure, and the
    test is skipped when SciPy is unavailable. Otherwise the neighbour
    indices from ``cuKNN.kneighbors`` are passed to the ``predict`` helper,
    whose labels must equal the generated ones.
    """
    if algo == "ivfpq":
        pytest.xfail("""See Memory access error in IVFPQ : https://github.com/rapidsai/cuml/issues/3318""")

    if not has_scipy():
        pytest.skip('Skipping test_neighborhood_predictions because ' +
                    'Scipy is missing')

    X, y = make_blobs(n_samples=nrows, centers=n_clusters,
                      n_features=ncols, random_state=0)

    if datatype == "dataframe":
        X = cudf.DataFrame(X)

    knn_cu = cuKNN(algorithm=algo)
    knn_cu.fit(X)
    neigh_ind = knn_cu.kneighbors(X, n_neighbors=n_neighbors,
                                  return_distance=False)

    # Drop the model before the prediction step.
    del knn_cu
    gc.collect()

    if datatype == "dataframe":
        assert isinstance(neigh_ind, cudf.DataFrame)
        neigh_ind = neigh_ind.as_gpu_matrix().copy_to_host()
    else:
        # Use the public alias; ``cp.core.core`` is a private module path
        # that newer CuPy releases no longer expose.
        assert isinstance(neigh_ind, cp.ndarray)

    labels, probs = predict(neigh_ind, y, n_neighbors)

    assert array_equal(labels, y)
def test_score(nrows, ncols, nclusters):
    """Check ``KMeans.score`` against a hand-computed value.

    The expected score is the negated sum of squared Euclidean distances
    between each sample and the center of the cluster it was assigned to.
    It must agree with ``score(X)`` to within ``SCORE_EPS``.
    """
    # The generated labels are not needed here.
    X, _ = make_blobs(int(nrows), ncols, nclusters,
                      cluster_std=0.01,
                      shuffle=False,
                      random_state=10)

    cuml_kmeans = cuml.KMeans(init="k-means||",
                              n_clusters=nclusters,
                              random_state=10,
                              output_type='numpy')
    cuml_kmeans.fit(X)

    actual_score = cuml_kmeans.score(X)
    predictions = cuml_kmeans.predict(X)
    centers = cuml_kmeans.cluster_centers_

    expected_score = 0
    for idx, label in enumerate(predictions):
        center = cp.array(centers[label])
        # Accumulate the squared distance directly: taking the square root
        # only to square it again adds work and rounding error.
        expected_score += cp.sum((X[idx] - center) ** 2)

    assert actual_score + SCORE_EPS \
        >= (-1 * expected_score) \
        >= actual_score - SCORE_EPS
def test_partial_fit(nrows, ncols, n_components, density,
                     batch_size_divider, whiten):
    """Compare batch-wise ``partial_fit`` of cuML and sklearn IncrementalPCA.

    Both estimators see the same consecutive row batches of size
    ``int(nrows / batch_size_divider)``; the round-tripped data
    (transform followed by inverse_transform) must agree. ``density`` is
    not used by this test.
    """
    X, _ = make_blobs(n_samples=nrows, n_features=ncols, random_state=10)

    sample_size = int(nrows / batch_size_divider)
    batch_starts = range(0, nrows, sample_size)

    cu_ipca = cuIPCA(n_components=n_components, whiten=whiten)
    for start in batch_starts:
        cu_ipca.partial_fit(X[start:start + sample_size].copy())

    cu_inv = cu_ipca.inverse_transform(cu_ipca.transform(X))

    sk_ipca = skIPCA(n_components=n_components, whiten=whiten)

    # scikit-learn gets a host copy of the same data.
    X = cp.asnumpy(X)
    for start in batch_starts:
        sk_ipca.partial_fit(X[start:start + sample_size].copy())

    sk_inv = sk_ipca.inverse_transform(sk_ipca.transform(X))

    assert array_equal(cu_inv, sk_inv, 5e-5, with_sign=True)
def test_ivfpq_pred(nrows, ncols, n_neighbors, nlist, M, n_bits,
                    usePrecomputedTables):
    """IVFPQ neighbour search followed by a label check.

    The test currently starts with an unconditional ``pytest.xfail``, so
    the body below it documents the intended check: labels derived from
    the neighbour indices by ``predict`` should equal the blob labels.
    """
    xfail_reason = ("Warning: IVFPQ might be unstable in this "
                    "version of cuML. This is due to a known issue "
                    "in the FAISS release that this cuML version "
                    "is linked to. (see FAISS issue #1421)")
    pytest.xfail(xfail_reason)

    algo_params = dict(nlist=nlist,
                       nprobe=int(nlist * 0.2),
                       M=M,
                       n_bits=n_bits,
                       usePrecomputedTables=usePrecomputedTables)

    X, y = make_blobs(n_samples=nrows, centers=5,
                      n_features=ncols, random_state=0)

    knn_cu = cuKNN(algorithm="ivfpq", algo_params=algo_params)
    knn_cu.fit(X)
    neigh_ind = knn_cu.kneighbors(X, n_neighbors=n_neighbors,
                                  return_distance=False)

    # Drop the model before the prediction step.
    del knn_cu
    gc.collect()

    labels, _ = predict(neigh_ind, y, n_neighbors)

    assert array_equal(labels, y)
def create_rand_blobs(random_state):
    """Return the sample matrix of a 500-sample, 20-feature blob dataset.

    The data is generated around 20 centers in C order with the given
    ``random_state``; the labels are discarded.
    """
    samples, _labels = make_blobs(n_samples=500,
                                  n_features=20,
                                  centers=20,
                                  order='C',
                                  random_state=random_state)
    return samples
def test_fit(nrows, ncols, n_components, sparse_input, density,
             sparse_format, batch_size_divider, whiten):
    """Compare ``fit`` of cuML and sklearn IncrementalPCA.

    The input is either a random sparse matrix or dense blobs. Both
    estimators use a batch size of ``int(nrows / batch_size_divider)`` and
    their round-tripped data (transform followed by inverse_transform)
    must agree. CSC input is skipped.
    """
    if sparse_format == 'csc':
        pytest.skip("cupyx.scipy.sparse.csc.csc_matrix does not support"
                    " indexing as of cupy 7.6.0")

    if sparse_input:
        X = cupyx.scipy.sparse.random(nrows, ncols,
                                      density=density,
                                      random_state=10,
                                      format=sparse_format)
    else:
        X, _ = make_blobs(n_samples=nrows, n_features=ncols,
                          random_state=10)

    batch_size = int(nrows / batch_size_divider)

    cu_ipca = cuIPCA(n_components=n_components, whiten=whiten,
                     batch_size=batch_size)
    cu_ipca.fit(X)
    cu_inv = cu_ipca.inverse_transform(cu_ipca.transform(X))

    sk_ipca = skIPCA(n_components=n_components, whiten=whiten,
                     batch_size=batch_size)

    # Move the data to the host for scikit-learn.
    X = X.get() if sparse_input else cp.asnumpy(X)

    sk_ipca.fit(X)
    sk_inv = sk_ipca.inverse_transform(sk_ipca.transform(X))

    assert array_equal(cu_inv, sk_inv, 5e-5, with_sign=True)
def test_ivfsq_pred(qtype, encodeResidual, nrows, ncols, n_neighbors, nlist):
    """IVFSQ neighbour search followed by a label check.

    Labels derived from the neighbour indices by the ``predict`` helper
    must equal the labels of the generated blobs.
    """
    algo_params = {
        'nlist': nlist,
        # nprobe is a count, so pass an integer as the IVFPQ tests do
        # instead of the float that ``nlist * 0.25`` produces.
        'nprobe': int(nlist * 0.25),
        'qtype': qtype,
        'encodeResidual': encodeResidual
    }

    X, y = make_blobs(n_samples=nrows, centers=5,
                      n_features=ncols, random_state=0)

    # NOTE(review): this switches the logger to debug level and never
    # restores it -- confirm that is intended.
    logger.set_level(logger.level_debug)

    knn_cu = cuKNN(algorithm="ivfsq", algo_params=algo_params)
    knn_cu.fit(X)
    neigh_ind = knn_cu.kneighbors(X, n_neighbors=n_neighbors,
                                  return_distance=False)

    # Drop the model before the prediction step.
    del knn_cu
    gc.collect()

    labels, probs = predict(neigh_ind, y, n_neighbors)

    assert array_equal(labels, y)
def test_traditional_kmeans_plus_plus_init(nrows, ncols, nclusters,
                                           random_state):
    """Compare k-means++ initialised cuML KMeans with scikit-learn.

    The two ``score(X)`` values may differ by at most
    ``1.5 * cluster_std``.
    """
    # Fairly high variance between points in clusters.
    cluster_std = 1.0
    tolerance = cluster_std * 1.5

    X, _ = make_blobs(int(nrows),
                      ncols,
                      nclusters,
                      cluster_std=cluster_std,
                      shuffle=False,
                      random_state=0)

    cuml_kmeans = cuml.KMeans(init="k-means++",
                              n_clusters=nclusters,
                              n_init=10,
                              random_state=random_state,
                              output_type='numpy')
    cuml_kmeans.fit(X)
    cu_score = cuml_kmeans.score(X)

    sk_kmeans = cluster.KMeans(random_state=random_state,
                               n_clusters=nclusters)
    sk_kmeans.fit(cp.asnumpy(X))
    sk_score = sk_kmeans.score(cp.asnumpy(X))

    assert abs(cu_score - sk_score) <= tolerance
def test_knn_x_none(input_type, nrows, n_feats, k, metric):
    """Query ``kneighbors(X=None)`` on cuML and scikit-learn and compare.

    Also checks that the cuML model's stored ``X_m`` still matches the
    data it was fitted on after the query.
    """
    X, _ = make_blobs(n_samples=nrows, n_features=n_feats, random_state=0)

    # Only the 5-norm is exercised for the minkowski metric.
    p = 5

    sk_model = skKNN(metric=metric, p=p)
    sk_model.fit(X.get())
    D_sk, I_sk = sk_model.kneighbors(X=None, n_neighbors=k)

    # Keep the array form for the comparison against X_m below.
    X_orig = X

    if input_type == "dataframe":
        X = cudf.DataFrame(X)

    cu_model = cuKNN(metric=metric, p=p, output_type="numpy")
    cu_model.fit(X)
    D_cuml, I_cuml = cu_model.kneighbors(X=None, n_neighbors=k)

    # Assert the cuml model was properly reverted
    cp.testing.assert_allclose(cu_model.X_m, X_orig, atol=1e-5, rtol=1e-4)

    # Distances: relative tolerance 1e-1, absolute tolerance 5e-2.
    cp.testing.assert_allclose(D_cuml, D_sk, atol=5e-2, rtol=1e-1)

    # NOTE(review): this compares two scalars (each array's ``.all()``),
    # not the index arrays element by element -- confirm whether a
    # stricter comparison is wanted.
    assert I_cuml.all() == I_sk.all()
def test_score(nrows, ncols, nclusters, random_state):
    """Check ``KMeans.score`` against a hand-computed value.

    The expected score is the negated sum of squared Euclidean distances
    between each sample and its assigned cluster center; it is compared
    with ``score(X)`` using ``atol=0.1`` and ``rtol=1e-5``.
    """
    X, _ = make_blobs(int(nrows),
                      ncols,
                      nclusters,
                      cluster_std=1.0,
                      shuffle=False,
                      random_state=0)

    cuml_kmeans = cuml.KMeans(init="k-means||",
                              n_clusters=nclusters,
                              random_state=random_state,
                              output_type='numpy')
    cuml_kmeans.fit(X)

    actual_score = cuml_kmeans.score(X)
    predictions = cuml_kmeans.predict(X)
    centers = cuml_kmeans.cluster_centers_

    total_sq_dist = 0.0
    for row, label in enumerate(predictions):
        center = cp.array(centers[label, :], dtype=cp.float32)
        total_sq_dist += cp.sum(cp.square(X[row, :] - center))

    # The score is the negative of the accumulated squared distance.
    expected_score = total_sq_dist * -1

    cp.testing.assert_allclose(
        actual_score, expected_score, atol=0.1, rtol=1e-5)
def test_ivfpq_pred(nrows, ncols, n_neighbors, nlist, M, n_bits,
                    usePrecomputedTables):
    """IVFPQ neighbour search followed by a label check.

    Labels derived from the neighbour indices by the ``predict`` helper
    must equal the labels of the generated blobs.
    """
    algo_params = dict(nlist=nlist,
                       nprobe=int(nlist * 0.2),
                       M=M,
                       n_bits=n_bits,
                       usePrecomputedTables=usePrecomputedTables)

    X, y = make_blobs(n_samples=nrows, centers=5,
                      n_features=ncols, random_state=0)

    knn_cu = cuKNN(algorithm="ivfpq", algo_params=algo_params)
    knn_cu.fit(X)
    neigh_ind = knn_cu.kneighbors(X, n_neighbors=n_neighbors,
                                  return_distance=False)

    # Drop the model before the prediction step.
    del knn_cu
    gc.collect()

    labels, _ = predict(neigh_ind, y, n_neighbors)

    assert array_equal(labels, y)
def test_kmeans_sequential_plus_plus_init(nrows, ncols, nclusters,
                                          random_state):
    """Compare k-means++ initialised cuML KMeans with scikit-learn.

    The two ``score(X)`` values may differ by at most
    ``1.5 * cluster_std``.
    """
    # Fairly high variance between points in clusters.
    cluster_std = 1.0
    tolerance = cluster_std * 1.5

    X, _ = make_blobs(nrows,
                      ncols,
                      nclusters,
                      cluster_std=cluster_std,
                      shuffle=False,
                      random_state=0)

    cuml_kmeans = cuml.KMeans(verbose=0,
                              init="k-means++",
                              n_clusters=nclusters,
                              n_init=10,
                              random_state=random_state,
                              output_type='numpy')
    cuml_kmeans.fit(X)
    cu_score = cuml_kmeans.score(X)

    sk_kmeans = cluster.KMeans(random_state=random_state,
                               n_clusters=nclusters)
    sk_kmeans.fit(X.copy_to_host())
    sk_score = sk_kmeans.score(X.copy_to_host())

    assert abs(cu_score - sk_score) <= tolerance
def test_n_init_cluster_consistency(random_state):
    """Two identically configured KMeans fits must yield equal centers."""
    cluster_std = 1.0
    nrows = 100000
    ncols = 100
    nclusters = 8

    X, _ = make_blobs(nrows,
                      ncols,
                      nclusters,
                      cluster_std=cluster_std,
                      shuffle=False,
                      random_state=0)

    kmeans_kwargs = dict(verbose=0,
                         init="k-means++",
                         n_clusters=nclusters,
                         n_init=10,
                         random_state=random_state,
                         output_type='numpy')

    # Fit two fresh models with the same settings on the same data.
    centers_per_run = []
    for _ in range(2):
        cuml_kmeans = cuml.KMeans(**kmeans_kwargs)
        cuml_kmeans.fit(X)
        centers_per_run.append(cuml_kmeans.cluster_centers_)

    initial_clusters, repeated_clusters = centers_per_run
    assert array_equal(initial_clusters, repeated_clusters)
def test_lasso_attributes():
    """A fitted Lasso model must expose the expected attributes."""
    X, y = make_blobs()
    clf = cumlLasso()
    clf.fit(X, y)

    # "solver_model" used to be listed twice; each name is checked once.
    attrs = ["dtype", "solver_model", "coef_", "intercept_",
             "l1_ratio", "n_cols"]
    for attr in attrs:
        assert hasattr(clf, attr)
def test_logistic_regression_attributes():
    """A fitted LogisticRegression must expose the expected attributes."""
    X, y = make_blobs()
    clf = cuLog().fit(X, y, convert_dtype=True)

    expected_attrs = (
        "dtype", "solver_model", "coef_", "intercept_", "l1_ratio",
        "n_cols", "C", "penalty", "fit_intercept", "solver",
    )
    for name in expected_attrs:
        assert hasattr(clf, name)
def test_elastic_net_attributes():
    """A fitted ElasticNet must expose the expected attributes."""
    X, y = make_blobs()
    clf = cumlElastic(fit_intercept=False)
    clf.fit(X, y)

    expected_attrs = (
        "dtype", "solver_model", "coef_", "intercept_", "l1_ratio",
        "n_cols", "alpha", "max_iter", "fit_intercept",
    )
    for name in expected_attrs:
        assert hasattr(clf, name)
def test_mbsgd_classifier_attributes():
    """A fitted MBSGDClassifier must expose the expected attributes."""
    X, y = make_blobs()
    clf = cumlMBSGClassifier()
    clf.fit(X, y)

    expected_attrs = (
        "dtype", "solver_model", "coef_", "intercept_", "l1_ratio",
        "n_cols", "eta0", "batch_size", "fit_intercept", "penalty",
    )
    for name in expected_attrs:
        assert hasattr(clf, name)
def test_mbsgd_regressor_attributes():
    """A fitted MBSGDRegressor must expose the expected attributes."""
    X, y = make_blobs()
    clf = cumlMBSGRegressor()
    clf.fit(X, y)

    expected_attrs = (
        "dtype", "solver_model", "coef_", "intercept_", "l1_ratio",
        "n_cols", "loss", "eta0", "batch_size", "epochs",
    )
    for name in expected_attrs:
        assert hasattr(clf, name)
def test_self_neighboring(datatype, metric_p, nrows):
    """Test that searches using an indexed vector itself return sensible
    results for that vector

    For L2-derived metrics, this specifically exercises the slow
    high-precision mode used to correct for approximation errors in L2
    computation during NN searches.

    Every row is queried against an index built from the same rows; its
    first neighbour must be the row itself, at a distance of zero within
    ``atol=1e-4``.
    """
    ncols = 1000
    n_clusters = 10
    n_neighbors = 3

    # NOTE(review): ``p`` is unpacked but not forwarded to cuKNN below --
    # confirm whether it should be.
    metric, p = metric_p

    if not has_scipy():
        # The skip reason names this test (it previously carried the name
        # of test_neighborhood_predictions).
        pytest.skip('Skipping test_self_neighboring because ' +
                    'Scipy is missing')

    X, y = make_blobs(n_samples=nrows, centers=n_clusters,
                      n_features=ncols, random_state=0)

    if datatype == "dataframe":
        X = cudf.DataFrame(X)

    knn_cu = cuKNN(metric=metric, n_neighbors=n_neighbors)
    knn_cu.fit(X)
    neigh_dist, neigh_ind = knn_cu.kneighbors(
        X,
        n_neighbors=n_neighbors,
        return_distance=True,
        two_pass_precision=True
    )

    if datatype == 'dataframe':
        assert isinstance(neigh_ind, cudf.DataFrame)
        neigh_ind = neigh_ind.to_numpy()
        neigh_dist = neigh_dist.to_numpy()
    else:
        assert isinstance(neigh_ind, cp.ndarray)
        neigh_ind = neigh_ind.get()
        neigh_dist = neigh_dist.get()

    # Only the first neighbour of each row is checked.
    neigh_ind = neigh_ind[:, 0]
    neigh_dist = neigh_dist[:, 0]

    assert_array_equal(
        neigh_ind,
        np.arange(0, neigh_dist.shape[0]),
    )
    assert_allclose(
        neigh_dist,
        np.zeros(neigh_dist.shape, dtype=neigh_dist.dtype),
        atol=1e-4
    )
def get_data_consistency_test():
    """Return ``(X, y)`` blobs: 1000 rows, 50 columns, 8 clusters.

    The data is generated unshuffled with ``cluster_std=1.0`` and
    ``random_state=0``.
    """
    nrows, ncols, nclusters = 1000, 50, 8
    cluster_std = 1.0

    return make_blobs(nrows,
                      ncols,
                      nclusters,
                      cluster_std=cluster_std,
                      shuffle=False,
                      random_state=0)
def test_knn_separate_index_search(input_type, nrows, n_feats, k, metric):
    """Index one slice of the data, search with another, compare to sklearn.

    Checks the output container types, that the model's stored ``X_m``
    still matches the indexed data, and that the distances agree with
    scikit-learn (with a looser criterion for ``braycurtis``).
    """
    X, _ = make_blobs(n_samples=nrows, n_features=n_feats, random_state=0)

    # NOTE(review): row 100 lands in neither slice -- confirm intended.
    X_index = X[:100]
    X_search = X[101:]

    # Only the 5-norm is exercised for the minkowski metric.
    p = 5

    sk_model = skKNN(metric=metric, p=p)
    sk_model.fit(X_index.get())
    D_sk, I_sk = sk_model.kneighbors(X_search.get(), k)

    # Keep the array form for the comparison against X_m below.
    X_orig = X_index

    as_dataframe = input_type == "dataframe"
    if as_dataframe:
        X_index = cudf.DataFrame(X_index)
        X_search = cudf.DataFrame(X_search)

    cu_model = cuKNN(metric=metric, p=p)
    cu_model.fit(X_index)
    D_cuml, I_cuml = cu_model.kneighbors(X_search, k)

    # The outputs must mirror the input container type.
    if as_dataframe:
        assert isinstance(D_cuml, cudf.DataFrame)
        assert isinstance(I_cuml, cudf.DataFrame)
        D_cuml_np, I_cuml_np = D_cuml.to_numpy(), I_cuml.to_numpy()
    else:
        assert isinstance(D_cuml, cp.ndarray)
        assert isinstance(I_cuml, cp.ndarray)
        D_cuml_np, I_cuml_np = D_cuml.get(), I_cuml.get()

    with cuml.using_output_type("numpy"):
        # Assert the cuml model was properly reverted
        np.testing.assert_allclose(cu_model.X_m, X_orig.get(),
                                   atol=1e-3, rtol=1e-3)

    if metric == 'braycurtis':
        # Braycurtis has a few differences, but this is computed by FAISS.
        # Fewer than 6% of the search rows' worth of entries may exceed
        # the sklearn distance by more than 1e-2.
        diff = D_cuml_np - D_sk
        assert len(diff[diff > 1e-2]) / X_search.shape[0] < 0.06
    else:
        np.testing.assert_allclose(D_cuml_np, D_sk, atol=1e-3, rtol=1e-3)

    # NOTE(review): this compares two scalars (each array's ``.all()``),
    # not the index arrays element by element -- confirm whether a
    # stricter comparison is wanted.
    assert I_cuml_np.all() == I_sk.all()
def test_fuzzy_simplicial_set(n_rows, n_features, n_neighbors,
                              precomputed_nearest_neighbors):
    """Compare the cuML fuzzy simplicial set graph with the reference one.

    The graph is built either from precomputed nearest neighbours or from
    the raw data, then both graphs are densified and compared with
    ``correctness_sparse``.
    """
    n_clusters = 30
    random_state = 42
    metric = 'euclidean'

    X, _ = make_blobs(n_samples=n_rows, centers=n_clusters,
                      n_features=n_features, random_state=random_state)

    if not precomputed_nearest_neighbors:
        cu_fss_graph = cu_fuzzy_simplicial_set(X,
                                               n_neighbors,
                                               random_state,
                                               metric)

        # The reference implementation is given the host copy.
        X = X.get()
        ref_result = ref_fuzzy_simplicial_set(X,
                                              n_neighbors,
                                              random_state,
                                              metric)
    else:
        nn = NearestNeighbors(n_neighbors=n_neighbors, metric=metric)
        nn.fit(X)
        knn_dists, knn_indices = nn.kneighbors(X,
                                               n_neighbors,
                                               return_distance=True)

        cu_fss_graph = cu_fuzzy_simplicial_set(X,
                                               n_neighbors,
                                               random_state,
                                               metric,
                                               knn_indices=knn_indices,
                                               knn_dists=knn_dists)

        # The reference call receives host copies of the neighbour data.
        knn_indices = knn_indices.get()
        knn_dists = knn_dists.get()
        ref_result = ref_fuzzy_simplicial_set(X,
                                              n_neighbors,
                                              random_state,
                                              metric,
                                              knn_indices=knn_indices,
                                              knn_dists=knn_dists)

    # The graph is the first item of the reference result.
    ref_fss_graph = ref_result[0].tocoo()

    cu_dense = cu_fss_graph.todense()
    ref_dense = cp.sparse.coo_matrix(ref_fss_graph).todense()

    assert correctness_sparse(ref_dense,
                              cu_dense,
                              atol=0.1,
                              rtol=0.2,
                              threshold=0.95)
def test_kmeans_clusters_blobs(nrows, ncols, nclusters,
                               random_state, cluster_std):
    """KMeans must recover the blob labels (adjusted Rand score >= 0.99)."""
    X, y = make_blobs(int(nrows),
                      ncols,
                      nclusters,
                      cluster_std=cluster_std,
                      shuffle=False,
                      random_state=random_state)

    cuml_kmeans = cuml.KMeans(init="k-means||",
                              n_clusters=nclusters,
                              random_state=random_state,
                              output_type='numpy')
    preds = cuml_kmeans.fit_predict(X)

    # Compare predicted and generated labels on the host.
    ari = adjusted_rand_score(cp.asnumpy(preds), cp.asnumpy(y))
    assert ari >= 0.99
def test_return_dists():
    """Check the return shape of ``kneighbors`` for both flag values.

    With ``return_distance=False`` a single ``(n_samples, k)`` object is
    returned; with ``return_distance=True`` a 2-tuple is returned.
    """
    n_samples = 50
    n_feats = 50
    k = 5

    X, _ = make_blobs(n_samples=n_samples,
                      n_features=n_feats, random_state=0)

    knn_cu = cuKNN()
    knn_cu.fit(X)

    indices_only = knn_cu.kneighbors(X, k, return_distance=False)
    assert not isinstance(indices_only, tuple)
    assert indices_only.shape == (n_samples, k)

    with_distances = knn_cu.kneighbors(X, k, return_distance=True)
    assert isinstance(with_distances, tuple)
    assert len(with_distances) == 2
def test_ann_distances_metrics(algo, metric):
    """Compare cuML neighbour distances with scikit-learn's.

    Both models are fitted on the same blobs and queried for the 10
    nearest neighbours of every row; the distance matrices must agree
    according to ``array_equal``.
    """
    X, _ = make_blobs(n_samples=500, centers=2,
                      n_features=128, random_state=0)

    cu_knn = cuKNN(algorithm=algo, metric=metric)
    cu_knn.fit(X)
    cu_dist, cu_ind = cu_knn.kneighbors(X, n_neighbors=10,
                                        return_distance=True)

    # Drop the model before running the scikit-learn side.
    del cu_knn
    gc.collect()

    X = X.get()
    sk_knn = skKNN(metric=metric)
    sk_knn.fit(X)
    sk_dist, sk_ind = sk_knn.kneighbors(X, n_neighbors=10,
                                        return_distance=True)

    # Assert instead of returning: pytest does not act on a test's return
    # value, so the comparison was previously never enforced.
    assert array_equal(sk_dist, cu_dist)