def test_targetencoder_multi_column():
    """Test jointly encoding multiple columns"""
    train = cudf.DataFrame({'cat_1': ['a', 'b', 'b', 'a', 'a', 'b'],
                            'cat_2': [1, 1, 2, 2, 1, 2],
                            'label': [1, 0, 1, 1, 0, 1]})
    test = cudf.DataFrame({'cat_1': ['b', 'b', 'a', 'b'],
                           'cat_2': [1, 2, 1, 2]})
    encoder = TargetEncoder()
    train_encoded = encoder.fit_transform(train[['cat_1', 'cat_2']],
                                          train.label)
    test_encoded = encoder.transform(test[['cat_1', 'cat_2']])
    train_answer = np.array([2. / 3, 2. / 3, 1., 2. / 3, 2. / 3, 1.])
    test_answer = np.array([0., 1., 0.5, 1.])
    assert array_equal(train_encoded, train_answer)
    assert array_equal(test_encoded, test_answer)

    encoder = TargetEncoder()
    encoder.fit(train[['cat_1', 'cat_2']], train.label)
    train_encoded = encoder.transform(train[['cat_1', 'cat_2']])
    test_encoded = encoder.transform(test[['cat_1', 'cat_2']])
    assert array_equal(train_encoded, train_answer)
    assert array_equal(test_encoded, test_answer)
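
# The test-time expectations above can be derived without cuML: at
# transform time each (cat_1, cat_2) pair is encoded with its mean
# training label, i.e. (a, 1) -> 0.5, (a, 2) -> 1.0, (b, 1) -> 0.0,
# (b, 2) -> 1.0, giving [0., 1., 0.5, 1.] for the test rows. A hedged
# pandas sketch of that derivation (the helper name is illustrative and
# not part of the test suite):
def _multi_column_test_means_sketch():
    import pandas as pd
    train = pd.DataFrame({'cat_1': ['a', 'b', 'b', 'a', 'a', 'b'],
                          'cat_2': [1, 1, 2, 2, 1, 2],
                          'label': [1, 0, 1, 1, 0, 1]})
    # per-pair mean labels used to encode unseen-at-train-time rows
    return train.groupby(['cat_1', 'cat_2']).label.mean()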
def test_predict_proba(nrows, ncols, n_neighbors, n_clusters, datatype):
    X, y = make_blobs(n_samples=nrows, centers=n_clusters,
                      n_features=ncols, cluster_std=0.01,
                      random_state=0)
    X = X.astype(np.float32)
    X_train, X_test, y_train, y_test = _build_train_test_data(X, y, datatype)

    knn_cu = cuKNN(n_neighbors=n_neighbors)
    knn_cu.fit(X_train, y_train)
    predictions = knn_cu.predict_proba(X_test)

    if datatype == "dataframe":
        assert isinstance(predictions, cudf.DataFrame)
        predictions = predictions.to_numpy()
        y_test = y_test.to_numpy().reshape(y_test.shape[0])
    else:
        assert isinstance(predictions, np.ndarray)

    y_hat = np.argmax(predictions, axis=1)

    assert array_equal(y_hat.astype(np.int32), y_test.astype(np.int32))
    assert array_equal(predictions.sum(axis=1), np.ones(y_test.shape[0]))
def test_predict_proba_multioutput(input_type, output_type):
    X = np.array([[0, 0, 1, 0], [1, 0, 1, 0]]).astype(np.float32)
    y = np.array([[15, 2], [5, 4]]).astype(np.int32)

    if input_type == "cudf":
        X = cudf.DataFrame(X)
        y = cudf.DataFrame(y)
    elif input_type == "cupy":
        X = cp.asarray(X)
        y = cp.asarray(y)

    expected = (np.array([[0., 1.], [1., 0.]]).astype(np.float32),
                np.array([[1., 0.], [0., 1.]]).astype(np.float32))

    knn_cu = cuKNN(n_neighbors=1, output_type=output_type)
    knn_cu.fit(X, y)

    p = knn_cu.predict_proba(X)

    assert isinstance(p, tuple)

    for i in p:
        if output_type == "cudf":
            assert isinstance(i, cudf.DataFrame)
        elif output_type == "numpy":
            assert isinstance(i, np.ndarray)
        elif output_type == "cupy":
            assert isinstance(i, cp.ndarray)

    assert array_equal(p[0].astype(np.float32), expected[0])
    assert array_equal(p[1].astype(np.float32), expected[1])
def test_neighborhood_predictions(nrows, ncols, n_neighbors,
                                  n_clusters, datatype):
    X, y = make_blobs(n_samples=nrows, centers=n_clusters,
                      n_features=ncols, cluster_std=0.01,
                      random_state=0)
    X = X.astype(np.float32)
    X_train, X_test, y_train, y_test = _build_train_test_data(X, y, datatype)

    knn_cu = cuKNN(n_neighbors=n_neighbors)
    knn_cu.fit(X_train, y_train)
    predictions = knn_cu.predict(X_test)

    if datatype == "dataframe":
        assert isinstance(predictions, cudf.Series)
        assert array_equal(predictions.to_frame().astype(np.int32),
                           y_test.astype(np.int32))
    else:
        assert isinstance(predictions, np.ndarray)
        assert array_equal(predictions.astype(np.int32),
                           y_test.astype(np.int32))
def test_basic_functions(labels, multipart, client):
    fit_labels, xform_labels = labels

    s = cp.asarray(fit_labels, dtype=np.int32)
    df = dask.array.from_array(s)

    s2 = cp.asarray(xform_labels, dtype=np.int32)
    df2 = dask.array.from_array(s2)

    if multipart:
        df = df.rechunk((1,))
        df2 = df2.rechunk((1,))

    binarizer = LabelBinarizer(client=client, sparse_output=False)
    binarizer.fit(df)

    assert array_equal(cp.asnumpy(binarizer.classes_),
                       np.unique(cp.asnumpy(s)))

    xformed = binarizer.transform(df2)
    xformed = xformed.map_blocks(lambda x: x.get(), dtype=cp.float32)
    xformed.compute_chunk_sizes()

    assert xformed.compute().shape[1] == binarizer.classes_.shape[0]

    original = binarizer.inverse_transform(xformed)
    test = original.compute()

    assert array_equal(cp.asnumpy(test), xform_labels)
def test_svd_flip():
    x = cp.array(range(-10, 80)).reshape((9, 10))
    u, s, v = cp.linalg.svd(x, full_matrices=False)

    u_true, v_true = _svd_flip(u, v, u_based_decision=True)
    reco_true = cp.dot(u_true * s, v_true)

    u_false, v_false = _svd_flip(u, v, u_based_decision=False)
    reco_false = cp.dot(u_false * s, v_false)

    assert array_equal(reco_true, x)
    assert array_equal(reco_false, x)
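
# For context, a minimal sketch of the sign-flip convention a
# `_svd_flip`-style helper typically follows (mirroring sklearn's
# `svd_flip` in the u_based_decision=True case; this is an assumption
# about the helper, not its actual implementation). The property the
# assertions above rely on is that flipping the sign of column j of u
# together with row j of v leaves u @ diag(s) @ v unchanged.
def _svd_flip_sketch(u, v):
    # sign of the largest-magnitude entry in each column of u
    max_abs_rows = cp.argmax(cp.abs(u), axis=0)
    signs = cp.sign(u[max_abs_rows, cp.arange(u.shape[1])])
    return u * signs, v * signs[:, None]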
def test_pca_fit(data_info, input_type, client):
    nrows, ncols, n_parts = data_info
    if nrows == int(9e6) and pytest.max_gpu_memory < 48:
        if pytest.adapt_stress_test:
            nrows = nrows * pytest.max_gpu_memory // 256
            ncols = ncols * pytest.max_gpu_memory // 256
        else:
            pytest.skip("Insufficient GPU memory for this test. "
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    from cuml.dask.decomposition import TruncatedSVD as daskTPCA
    from sklearn.decomposition import TruncatedSVD
    from cuml.dask.datasets import make_blobs

    X, _ = make_blobs(n_samples=nrows,
                      n_features=ncols,
                      centers=1,
                      n_parts=n_parts,
                      cluster_std=0.5,
                      random_state=10,
                      dtype=np.float32)

    if input_type == "dataframe":
        X_train = to_dask_cudf(X)
        X_cpu = X_train.compute().to_pandas().values
    elif input_type == "array":
        X_train = X
        X_cpu = cp.asnumpy(X_train.compute())

    cutsvd = daskTPCA(n_components=5)
    cutsvd.fit(X_train)

    sktsvd = TruncatedSVD(n_components=5, algorithm="arpack")
    sktsvd.fit(X_cpu)

    all_attr = ['singular_values_', 'components_',
                'explained_variance_', 'explained_variance_ratio_']

    for attr in all_attr:
        with_sign = False if attr in ['components_'] else True
        cuml_res = getattr(cutsvd, attr)
        # Convert device-side results (cupy/cudf) to NumPy before
        # comparing; a plain np.ndarray needs no conversion (and has no
        # to_numpy(), so the original type check would have crashed).
        if isinstance(cuml_res, cp.ndarray):
            cuml_res = cp.asnumpy(cuml_res)
        elif hasattr(cuml_res, 'to_numpy'):
            cuml_res = cuml_res.to_numpy()
        skl_res = getattr(sktsvd, attr)
        if attr == 'singular_values_':
            assert array_equal(cuml_res, skl_res, 1, with_sign=with_sign)
        else:
            assert array_equal(cuml_res, skl_res, 1e-1, with_sign=with_sign)
def test_compare_skl(nrows, ncols, nclusters, n_parts, n_neighbors,
                     streams_per_handle, reverse_worker_order, client):

    from cuml.dask.neighbors import NearestNeighbors as daskNN
    from sklearn.datasets import make_blobs

    nrows = _scale_rows(client, nrows)

    X, y = make_blobs(n_samples=int(nrows),
                      n_features=ncols,
                      centers=nclusters,
                      random_state=0)
    X = X.astype(np.float32)

    X_cudf = _prep_training_data(client, X, n_parts, reverse_worker_order)

    from dask.distributed import wait
    wait(X_cudf)

    # Each worker should hold the same number of partitions
    dist = np.array([len(v) for v in client.has_what().values()])
    assert np.all(dist == dist[0])

    cumlModel = daskNN(n_neighbors=n_neighbors,
                       streams_per_handle=streams_per_handle)
    cumlModel.fit(X_cudf)

    out_d, out_i = cumlModel.kneighbors(X_cudf)

    local_i = np.array(out_i.compute().to_numpy(), dtype="int64")

    sklModel = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X, y)
    skl_y_hat = sklModel.predict(X)
    y_hat, _ = predict(local_i, y, n_neighbors)

    sk_d, sk_i = sklModel.kneighbors(X)

    sk_i = sk_i.astype("int64")

    # Each point should be its own nearest neighbor
    assert array_equal(local_i[:, 0], np.arange(nrows))

    diff = sk_i - local_i
    n_diff = len(diff[diff > 0])
    perc_diff = n_diff / (nrows * n_neighbors)

    assert perc_diff <= 3e-3
    assert array_equal(y_hat, skl_y_hat)
def test_targetencoder_fit_transform():
    train = cudf.DataFrame({'category': ['a', 'b', 'b', 'a'],
                            'label': [1, 0, 1, 1]})
    encoder = TargetEncoder()
    train_encoded = encoder.fit_transform(train.category, train.label)
    answer = np.array([1., 1., 0., 1.])
    assert array_equal(train_encoded, answer)

    encoder = TargetEncoder()
    encoder.fit(train.category, train.label)
    train_encoded = encoder.transform(train.category)
    assert array_equal(train_encoded, answer)
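
# The expected values above are consistent with leave-one-out target
# encoding: each training row is encoded with the mean label of the
# *other* rows in its category (a reading of the expected numbers, not
# a claim about cuML internals):
#   row 0 ('a', 1) -> mean{row 3} = 1    row 1 ('b', 0) -> mean{row 2} = 1
#   row 2 ('b', 1) -> mean{row 1} = 0    row 3 ('a', 1) -> mean{row 0} = 1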
def test_one_category():
    train = cudf.DataFrame({'category': ['a', 'a', 'a', 'a'],
                            'label': [3, 0, 0, 3]})
    test = cudf.DataFrame({'category': ['c', 'b', 'a', 'd']})
    encoder = TargetEncoder()
    train_encoded = encoder.fit_transform(train.category, train.label)
    answer = np.array([1., 2., 2., 1.])
    assert array_equal(train_encoded, answer)

    test_encoded = encoder.transform(test.category)
    answer = np.array([1.5, 1.5, 1.5, 1.5])
    assert array_equal(test_encoded, answer)
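
# Why [1.5, 1.5, 1.5, 1.5] at test time: the global mean of the training
# labels [3, 0, 0, 3] is 1.5. Unseen categories ('c', 'b', 'd') fall back
# to that global mean, and 'a' (the only fitted category) has a category
# mean that coincides with it. The train-time values follow the same
# leave-one-out pattern as above, e.g. row 0 -> mean(0, 0, 3) = 1 and
# row 1 -> mean(3, 0, 3) = 2.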
def test_targetencoder_var():
    train = cudf.DataFrame({'category': ['a', 'b', 'b', 'b'],
                            'label': [1, 0, 1, 1]})
    encoder = TargetEncoder(stat='var')
    train_encoded = encoder.fit_transform(train.category, train.label)
    answer = np.array([.25, 0., .5, .5])
    assert array_equal(train_encoded, answer)

    encoder = TargetEncoder(stat='var')
    encoder.fit(train.category, train.label)
    train_encoded = encoder.transform(train.category)
    assert array_equal(train_encoded, answer)
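
# Derivation of the expected variances (read off the numbers assuming
# leave-one-out sample variance with ddof=1 and a global fallback):
#   row 0 ('a') has no other 'a' rows, so it falls back to the variance
#   of all labels [1, 0, 1, 1] = 0.25; row 1 ('b') -> var({1, 1}) = 0;
#   rows 2 and 3 ('b') -> var({0, 1}) = 0.5.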
def test_logistic_regression_weighting(regression_dataset,
                                       option, test_status):
    regression_type, data, coef, output = regression_dataset[test_status]

    class_weight = None
    sample_weight = None
    if option == 'sample_weight':
        n_samples = data.shape[0]
        sample_weight = np.abs(np.random.rand(n_samples))
    elif option == 'class_weight':
        class_weight = np.random.rand(2)
        class_weight = {0: class_weight[0], 1: class_weight[1]}
    elif option == 'balanced':
        class_weight = 'balanced'

    culog = cuLog(fit_intercept=False, class_weight=class_weight)
    culog.fit(data, output, sample_weight=sample_weight)

    sklog = skLog(fit_intercept=False, class_weight=class_weight)
    sklog.fit(data, output, sample_weight=sample_weight)

    skcoef = np.squeeze(sklog.coef_)
    cucoef = np.squeeze(culog.coef_)
    if regression_type == 'binary':
        skcoef /= np.linalg.norm(skcoef)
        cucoef /= np.linalg.norm(cucoef)
        unit_tol = 0.04
        total_tol = 0.08
    elif regression_type.startswith('multiclass'):
        skcoef = skcoef.T
        skcoef /= np.linalg.norm(skcoef, axis=1)[:, None]
        cucoef /= np.linalg.norm(cucoef, axis=1)[:, None]
        unit_tol = 0.2
        total_tol = 0.3

    equality = array_equal(skcoef, cucoef, unit_tol=unit_tol,
                           total_tol=total_tol)
    if not equality:
        print('\ncoef.shape: ', coef.shape)
        print('coef:\n', coef)
        print('cucoef.shape: ', cucoef.shape)
        print('cucoef:\n', cucoef)
    assert equality

    cuOut = culog.predict(data)
    skOut = sklog.predict(data)
    assert array_equal(skOut, cuOut, unit_tol=unit_tol,
                       total_tol=total_tol)
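
# For reference, sklearn's 'balanced' convention (which the comparison
# above relies on) weights class c as n_samples / (n_classes * count_c).
# A hedged NumPy sketch of that formula (helper name is illustrative):
def _balanced_class_weights(y):
    counts = np.bincount(np.asarray(y, dtype=int))
    # one weight per class; rarer classes get larger weights
    return len(y) / (len(counts) * counts)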
def test_dbscan_default(name, client):
    from cuml.dask.cluster.dbscan import DBSCAN as cuDBSCAN

    eps = 0.5
    default_base = {'quantile': .3,
                    'eps': eps,
                    'damping': .9,
                    'preference': -200,
                    'n_neighbors': 10,
                    'n_clusters': 2}

    n_samples = 500
    pat = get_pattern(name, n_samples)
    params = default_base.copy()
    params.update(pat[1])
    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    # Configure both estimators with the same eps/min_samples so the
    # comparison is apples-to-apples
    cuml_dbscan = cuDBSCAN(eps=params['eps'], min_samples=5,
                           output_type='numpy')
    cu_labels = cuml_dbscan.fit_predict(X)

    sk_dbscan = skDBSCAN(eps=params['eps'], min_samples=5)
    sk_labels = sk_dbscan.fit_predict(X)

    # Check the core points are equal
    assert array_equal(cuml_dbscan.core_sample_indices_,
                       sk_dbscan.core_sample_indices_)

    # Check the labels are correct
    assert_dbscan_equal(sk_labels, cu_labels, X,
                        cuml_dbscan.core_sample_indices_, params['eps'])
def test_predict_multioutput(input_type, output_type):
    X = np.array([[0, 0, 1, 0], [1, 0, 1, 0]]).astype(np.float32)
    y = np.array([[15, 2], [5, 4]]).astype(np.int32)

    if input_type == "cudf":
        X = cudf.DataFrame(X)
        y = cudf.DataFrame(y)
    elif input_type == "cupy":
        X = cp.asarray(X)
        y = cp.asarray(y)

    knn_cu = cuKNN(n_neighbors=1, output_type=output_type)
    knn_cu.fit(X, y)

    p = knn_cu.predict(X)

    if output_type == "cudf":
        assert isinstance(p, cudf.DataFrame)
    elif output_type == "numpy":
        assert isinstance(p, np.ndarray)
    elif output_type == "cupy":
        assert isinstance(p, cp.ndarray)

    assert array_equal(p.astype(np.int32), y)
def test_nonmonotonic_labels(n_classes, n_rows, n_cols,
                             datatype, n_neighbors):
    X, y = make_blobs(n_samples=n_rows, centers=n_classes,
                      n_features=n_cols, cluster_std=0.01,
                      random_state=0)
    X = X.astype(np.float32)

    # Draw labels from a non-monotonically increasing set
    classes = np.arange(0, n_classes * 5, 5)
    for i in range(n_classes):
        y[y == i] = classes[i]

    X_train, X_test, y_train, y_test = _build_train_test_data(X, y, datatype)

    knn_cu = cuKNN(n_neighbors=n_neighbors)
    knn_cu.fit(X_train, y_train)

    p = knn_cu.predict(X_test)

    if datatype == "dataframe":
        assert isinstance(p, cudf.Series)
        p = p.to_frame().to_numpy().reshape(p.shape[0])
        y_test = y_test.to_numpy().reshape(y_test.shape[0])

    assert array_equal(p.astype(np.int32), y_test.astype(np.int32))
def test_umap_fit_transform_score(nrows, n_feats):

    n_samples = nrows
    n_features = n_feats

    data, labels = make_blobs(n_samples=n_samples, n_features=n_features,
                              centers=10, random_state=42)

    model = umap.UMAP(n_neighbors=10, min_dist=0.1)
    cuml_model = cuUMAP(n_neighbors=10, min_dist=0.01)

    embedding = model.fit_transform(data)
    cuml_embedding = cuml_model.fit_transform(data, convert_dtype=True)

    assert not np.isnan(embedding).any()
    assert not np.isnan(cuml_embedding).any()

    if nrows < 500000:
        cuml_score = adjusted_rand_score(labels,
                                         KMeans(10).fit_predict(
                                             cuml_embedding))
        score = adjusted_rand_score(labels,
                                    KMeans(10).fit_predict(embedding))

        assert array_equal(score, cuml_score, 1e-2, with_sign=True)
def test_batch_size(nrows, ncols, n_parts, batch_size, client):

    n_neighbors = 10
    n_clusters = 5

    from cuml.dask.neighbors import NearestNeighbors as daskNN
    from sklearn.datasets import make_blobs

    nrows = _scale_rows(client, nrows)

    X, y = make_blobs(n_samples=int(nrows),
                      n_features=ncols,
                      centers=n_clusters,
                      random_state=0)
    X = X.astype(np.float32)

    X_cudf = _prep_training_data(client, X, n_parts)

    cumlModel = daskNN(n_neighbors=n_neighbors,
                       batch_size=batch_size,
                       streams_per_handle=5)
    cumlModel.fit(X_cudf)

    out_d, out_i = cumlModel.kneighbors(X_cudf)

    local_i = out_i.compute().to_numpy()

    y_hat, _ = predict(local_i, y, n_neighbors)

    assert array_equal(y_hat, y)
def test_core_point_prop3():
    params = {'eps': 1.1, 'min_samples': 4}

    # The input looks like a two-barred (orthodox) cross or
    # two stars sharing a link:
    #   .   .
    # . . . . .
    #   .   .
    # There are 2 core points, but they are not reachable from each
    # other, so there should be two clusters. However, the link that is
    # shared between the stars actually has an ambiguous label (to the
    # best of my knowledge) as it will depend on the order in which we
    # process the core points. So we exclude that point from the
    # comparison with sklearn.
    # TODO: the above text does not correspond to the actual test!

    X = np.array([[0, 0], [1, 0], [1, 1], [1, -1], [3, 0],
                  [4, 0], [4, 1], [4, -1], [5, 0], [2, 0]],
                 dtype=np.float32)

    cuml_dbscan = cuDBSCAN(**params)
    cu_labels = cuml_dbscan.fit_predict(X)

    sk_dbscan = skDBSCAN(**params)
    sk_labels = sk_dbscan.fit_predict(X)

    # Check the core points are equal
    assert array_equal(cuml_dbscan.core_sample_indices_,
                       sk_dbscan.core_sample_indices_)

    # Check the labels are correct
    assert_dbscan_equal(sk_labels, cu_labels, X,
                        cuml_dbscan.core_sample_indices_, params['eps'])
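
# The "2 core points" claim above is easy to check by hand: with eps=1.1
# and min_samples=4, only [1, 0] and [4, 0] have at least 4 points
# (self included, following sklearn's convention) within distance 1.1.
# A hedged brute-force sketch of that count (helper name illustrative):
def _core_point_indices(X, eps, min_samples):
    # full pairwise Euclidean distance matrix
    d = np.linalg.norm(X[:, None, :] - X[None, :, :], axis=-1)
    return np.where((d <= eps).sum(axis=1) >= min_samples)[0]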
def test_pca_defaults(n_samples, n_features, sparse):
    # FIXME: Disable the case True-300-200 due to flaky test
    if sparse and n_features == 300 and n_samples == 200:
        pytest.xfail('Skipping the case True-300-200 due to flaky test')

    if sparse:
        X = cupyx.scipy.sparse.random(n_samples, n_features,
                                      density=0.03, dtype=cp.float32,
                                      random_state=10)
    else:
        X, Y = make_multilabel_classification(n_samples=n_samples,
                                              n_features=n_features,
                                              n_classes=2,
                                              n_labels=1,
                                              random_state=1)
    cupca = cuPCA()
    cupca.fit(X)
    curesult = cupca.transform(X)
    cupca.handle.sync()

    if sparse:
        X = X.toarray().get()

    skpca = skPCA()
    skpca.fit(X)
    skresult = skpca.transform(X)

    assert skpca.svd_solver == cupca.svd_solver
    assert cupca.components_.shape[0] == skpca.components_.shape[0]
    assert curesult.shape == skresult.shape
    assert array_equal(curesult, skresult, 1e-3, with_sign=False)
def test_ridge_regression_model(datatype, algorithm, nrows, column_info):

    if algorithm == "svd" and nrows > 46340:
        pytest.skip("svd solver is not supported for data that has more "
                    "than 46340 rows or columns if you are using CUDA "
                    "version 10.x")

    ncols, n_info = column_info
    X_train, X_test, y_train, y_test = make_regression_dataset(
        datatype, nrows, ncols, n_info
    )

    # Initialization of cuML's ridge regression model
    curidge = cuRidge(fit_intercept=False, normalize=False,
                      solver=algorithm)

    # fit and predict cuml ridge regression model
    curidge.fit(X_train, y_train)
    curidge_predict = curidge.predict(X_test)

    if nrows < 500000:
        # sklearn ridge regression model initialization, fit and predict
        skridge = skRidge(fit_intercept=False, normalize=False)
        skridge.fit(X_train, y_train)

        skridge_predict = skridge.predict(X_test)

        assert array_equal(skridge_predict, curidge_predict,
                           1e-1, with_sign=True)
def test_partial_fit(nrows, ncols, n_components, density,
                     batch_size_divider, whiten):
    X, _ = make_blobs(n_samples=nrows, n_features=ncols, random_state=10)

    cu_ipca = cuIPCA(n_components=n_components, whiten=whiten)

    sample_size = int(nrows / batch_size_divider)
    for i in range(0, nrows, sample_size):
        cu_ipca.partial_fit(X[i:i + sample_size].copy())

    cu_t = cu_ipca.transform(X)
    cu_inv = cu_ipca.inverse_transform(cu_t)

    sk_ipca = skIPCA(n_components=n_components, whiten=whiten)

    X = cp.asnumpy(X)

    for i in range(0, nrows, sample_size):
        sk_ipca.partial_fit(X[i:i + sample_size].copy())

    sk_t = sk_ipca.transform(X)
    sk_inv = sk_ipca.inverse_transform(sk_t)

    assert array_equal(cu_inv, sk_inv, 5e-5, with_sign=True)
def test_fit(nrows, ncols, n_components, sparse_input, density,
             sparse_format, batch_size_divider, whiten):
    if sparse_format == 'csc':
        pytest.skip("cupyx.scipy.sparse.csc.csc_matrix does not support"
                    " indexing as of cupy 7.6.0")

    if sparse_input:
        X = cupyx.scipy.sparse.random(nrows, ncols,
                                      density=density,
                                      random_state=10,
                                      format=sparse_format)
    else:
        X, _ = make_blobs(n_samples=nrows, n_features=ncols,
                          random_state=10)

    cu_ipca = cuIPCA(n_components=n_components, whiten=whiten,
                     batch_size=int(nrows / batch_size_divider))
    cu_ipca.fit(X)
    cu_t = cu_ipca.transform(X)
    cu_inv = cu_ipca.inverse_transform(cu_t)

    sk_ipca = skIPCA(n_components=n_components, whiten=whiten,
                     batch_size=int(nrows / batch_size_divider))
    if sparse_input:
        X = X.get()
    else:
        X = cp.asnumpy(X)
    sk_ipca.fit(X)
    sk_t = sk_ipca.transform(X)
    sk_inv = sk_ipca.inverse_transform(sk_t)

    assert array_equal(cu_inv, sk_inv, 5e-5, with_sign=True)
def test_dbscan_propagation(datatype, use_handle, out_dtype, n_samples):
    X, y = make_blobs(n_samples, centers=1, cluster_std=8.0,
                      center_box=(-100.0, 100.0), random_state=8)
    X = X.astype(datatype)

    handle, stream = get_handle(use_handle)
    eps = 0.5
    cuml_dbscan = cuDBSCAN(handle=handle, eps=eps, min_samples=5,
                           output_type='numpy')
    cu_labels = cuml_dbscan.fit_predict(X, out_dtype=out_dtype)

    sk_dbscan = skDBSCAN(eps=eps, min_samples=5)
    sk_labels = sk_dbscan.fit_predict(X)

    # Check the core points are equal
    assert array_equal(cuml_dbscan.core_sample_indices_,
                       sk_dbscan.core_sample_indices_)

    # Check the labels are correct
    assert_dbscan_equal(sk_labels, cu_labels, X,
                        cuml_dbscan.core_sample_indices_, eps)
def test_dbscan(datatype, nrows, ncols, max_mbytes_per_batch, out_dtype,
                client):
    from cuml.dask.cluster.dbscan import DBSCAN as cuDBSCAN

    n_samples = nrows
    n_feats = ncols
    X, y = make_blobs(n_samples=n_samples, cluster_std=0.01,
                      n_features=n_feats, random_state=0)

    eps = 1
    cuml_dbscan = cuDBSCAN(eps=eps, min_samples=2,
                           max_mbytes_per_batch=max_mbytes_per_batch,
                           output_type='numpy')

    cu_labels = cuml_dbscan.fit_predict(X, out_dtype=out_dtype)

    if nrows < 500000:
        sk_dbscan = skDBSCAN(eps=eps, min_samples=2, algorithm="brute")
        sk_labels = sk_dbscan.fit_predict(X)

        # Check the core points are equal
        assert array_equal(cuml_dbscan.core_sample_indices_,
                           sk_dbscan.core_sample_indices_)

        # Check the labels are correct
        assert_dbscan_equal(sk_labels, cu_labels, X,
                            cuml_dbscan.core_sample_indices_, eps)

    if out_dtype == "int32" or out_dtype == np.int32:
        assert cu_labels.dtype == np.int32
    elif out_dtype == "int64" or out_dtype == np.int64:
        assert cu_labels.dtype == np.int64
def test_dbscan_precomputed(datatype, nrows, max_mbytes_per_batch,
                            out_dtype, client):
    from cuml.dask.cluster.dbscan import DBSCAN as cuDBSCAN

    # 2-dimensional dataset for easy distance matrix computation
    X, y = make_blobs(n_samples=nrows, cluster_std=0.01,
                      n_features=2, random_state=0)

    # Precompute distances
    X_dist = pairwise_distances(X).astype(datatype)

    eps = 1
    cuml_dbscan = cuDBSCAN(eps=eps, min_samples=2, metric='precomputed',
                           max_mbytes_per_batch=max_mbytes_per_batch,
                           output_type='numpy')
    cu_labels = cuml_dbscan.fit_predict(X_dist, out_dtype=out_dtype)

    sk_dbscan = skDBSCAN(eps=eps, min_samples=2, metric='precomputed',
                         algorithm="brute")
    sk_labels = sk_dbscan.fit_predict(X_dist)

    # Check the core points are equal
    assert array_equal(cuml_dbscan.core_sample_indices_,
                       sk_dbscan.core_sample_indices_)

    # Check the labels are correct
    assert_dbscan_equal(sk_labels, cu_labels, X,
                        cuml_dbscan.core_sample_indices_, eps)
def test_predict_large_n_classes(datatype):
    nrows = 10000
    ncols = 100
    n_neighbors = 2
    n_clusters = 1000
    X, y = make_blobs(n_samples=nrows, centers=n_clusters,
                      n_features=ncols, cluster_std=0.01,
                      random_state=0)
    X = X.astype(np.float32)

    X_train, X_test, y_train, y_test = _build_train_test_data(X, y, datatype)

    knn_cu = cuKNN(n_neighbors=n_neighbors)
    knn_cu.fit(X_train, y_train)

    y_hat = knn_cu.predict(X_test)

    if datatype == "dataframe":
        y_hat = y_hat.to_numpy()
        y_test = y_test.to_numpy().ravel()

    assert array_equal(y_hat.astype(np.int32), y_test.astype(np.int32))
def test_weighted_ridge(datatype, algorithm, fit_intercept,
                        normalize, distribution):
    nrows, ncols, n_info = 1000, 20, 10
    max_weight = 10
    noise = 20
    X_train, X_test, y_train, y_test = make_regression_dataset(
        datatype, nrows, ncols, n_info, noise=noise
    )

    # set per-sample weights: integers in [1, max_weight) for 'uniform',
    # otherwise drawn from the named distribution
    if distribution == "uniform":
        wt = np.random.randint(1, high=max_weight, size=len(X_train))
    elif distribution == "exponential":
        wt = np.random.exponential(size=len(X_train))
    else:
        wt = np.random.lognormal(size=len(X_train))

    # Initialization of cuML's ridge regression model
    curidge = cuRidge(fit_intercept=fit_intercept, normalize=normalize,
                      solver=algorithm)

    # fit and predict cuml ridge regression model
    curidge.fit(X_train, y_train, sample_weight=wt)
    curidge_predict = curidge.predict(X_test)

    # sklearn ridge regression model initialization, fit and predict
    skridge = skRidge(fit_intercept=fit_intercept, normalize=normalize)
    skridge.fit(X_train, y_train, sample_weight=wt)

    skridge_predict = skridge.predict(X_test)

    assert array_equal(skridge_predict, curidge_predict,
                       1e-1, with_sign=True)
def test_umap_fit_transform_trust(name, target_metric):
    if name == 'iris':
        iris = datasets.load_iris()
        data = iris.data
        labels = iris.target
    elif name == 'digits':
        digits = datasets.load_digits(n_class=5)
        data = digits.data
        labels = digits.target
    elif name == 'wine':
        wine = datasets.load_wine()
        data = wine.data
        labels = wine.target
    else:
        data, labels = make_blobs(n_samples=500, n_features=10,
                                  centers=10, random_state=42)

    model = umap.UMAP(n_neighbors=10, min_dist=0.01,
                      target_metric=target_metric)
    cuml_model = cuUMAP(n_neighbors=10, min_dist=0.01,
                        target_metric=target_metric)

    embedding = model.fit_transform(data)
    cuml_embedding = cuml_model.fit_transform(data, convert_dtype=True)

    trust = trustworthiness(data, embedding, n_neighbors=10)
    cuml_trust = trustworthiness(data, cuml_embedding, n_neighbors=10)

    assert array_equal(trust, cuml_trust, 1e-1, with_sign=True)
def test_tsvd_fit_transform(datatype, name, use_handle):
    if name == 'blobs':
        X, y = make_blobs(n_samples=500000, n_features=1000,
                          random_state=0)
    elif name == 'random':
        pytest.skip('fails when using random dataset '
                    'used by sklearn for testing')
        shape = 5000, 100
        rng = check_random_state(42)
        X = rng.randint(-100, 20, np.prod(shape)).reshape(shape)
    else:
        n, p = 500, 5
        rng = np.random.RandomState(0)
        X = rng.randn(n, p) * .1 + np.array([3, 4, 2, 3, 5])

    if name != 'blobs':
        skpca = skTSVD(n_components=1)
        Xsktsvd = skpca.fit_transform(X)

    handle, stream = get_handle(use_handle)
    cutsvd = cuTSVD(n_components=1, handle=handle)

    Xcutsvd = cutsvd.fit_transform(X)
    cutsvd.handle.sync()

    if name != 'blobs':
        assert array_equal(Xcutsvd, Xsktsvd, 1e-3, with_sign=True)
def test_logistic_regression_decision_function(
    dtype, nrows, column_info, num_classes, fit_intercept, sparse_input
):
    ncols, n_info = column_info
    X_train, X_test, y_train, y_test = make_classification_dataset(
        datatype=dtype, nrows=nrows, ncols=ncols,
        n_info=n_info, num_classes=num_classes
    )

    X_train = csr_matrix(X_train) if sparse_input else X_train
    X_test = csr_matrix(X_test) if sparse_input else X_test

    y_train = y_train.astype(dtype)
    y_test = y_test.astype(dtype)

    culog = cuLog(fit_intercept=fit_intercept, output_type="numpy")
    culog.fit(X_train, y_train)

    # Build an sklearn model that reuses cuML's fitted parameters
    sklog = skLog(fit_intercept=fit_intercept)
    sklog.coef_ = culog.coef_.T
    if fit_intercept:
        sklog.intercept_ = culog.intercept_
    else:
        # set the intercept on the instance, not on the skLog class
        sklog.intercept_ = 0
    sklog.classes_ = np.arange(num_classes)

    cu_dec_func = culog.decision_function(X_test)
    if num_classes > 2:
        cu_dec_func = cu_dec_func.T
    sk_dec_func = sklog.decision_function(X_test)

    assert array_equal(cu_dec_func, sk_dec_func)
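
# What makes this comparison work: for a linear model,
# decision_function(X) = X @ coef_.T + intercept_ (using sklearn's
# (n_classes, n_features) coef layout), so transplanting the fitted
# coefficients reproduces the scores exactly. A minimal dense-input
# sketch of that identity (helper name is illustrative):
def _decision_function_sketch(X, coef, intercept):
    scores = np.asarray(X) @ coef.T + intercept
    # binary problems return a flat score vector
    return scores.ravel() if scores.shape[1] == 1 else scores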