def test_basic_functions(labels, dtype, sparse_output):
    """LabelBinarizer fit/transform/inverse round-trip vs scikit-learn."""
    fit_labels, xform_labels = labels

    # Fit the sklearn reference on host data before moving to device.
    skl_bin = skLB(sparse_output=sparse_output)
    skl_bin.fit(fit_labels)

    fit_labels = cp.asarray(fit_labels, dtype=dtype)
    xform_labels = cp.asarray(xform_labels, dtype=dtype)

    binarizer = LabelBinarizer(sparse_output=sparse_output)
    binarizer.fit(fit_labels)

    assert array_equal(binarizer.classes_.get(), np.unique(fit_labels.get()))

    xformed = binarizer.transform(xform_labels)

    if sparse_output:
        # Scipy is needed to build the reference CSR; skip before doing any
        # scipy-dependent work (the old code skipped only after transform).
        if not has_scipy():
            pytest.skip('Skipping test_basic_functions(sparse_output=True) ' +
                        'because Scipy is missing')
        import scipy.sparse

        skl_bin_xformed = skl_bin.transform(xform_labels.get())
        skl_csr = scipy.sparse.coo_matrix(skl_bin_xformed).tocsr()
        cuml_csr = xformed

        # BUG FIX: the comparison result was previously discarded; assert it
        # so a data mismatch actually fails the test.
        assert array_equal(skl_csr.data, cuml_csr.data.get())

        # #todo: Support sparse inputs
        # xformed = xformed.todense().astype(dtype)

    assert xformed.shape[1] == binarizer.classes_.shape[0]

    original = binarizer.inverse_transform(xformed)

    assert array_equal(original.get(), xform_labels.get())
def test_lars_attributes(datatype, params):
    """Compare cuML Lars fitted attributes against scikit-learn's Lars."""
    X, y = load_boston(return_X_y=True)
    X = X.astype(datatype)
    y = y.astype(datatype)

    culars = cuLars(**params)
    culars.fit(X, y)

    sklars = skLars(**params)
    sklars.fit(X, y)

    assert culars.score(X, y) >= sklars.score(X, y) - 0.01

    # With n_nonzero_coefs the iteration counts must match exactly; otherwise
    # allow a small difference due to solver numeric differences.
    limit_max_iter = "n_nonzero_coefs" in params
    if limit_max_iter:
        n_iter_tol = 0
    else:
        n_iter_tol = 2

    assert abs(culars.n_iter_ - sklars.n_iter_) <= n_iter_tol

    # BUG FIX: params.pop() mutated the (potentially shared/parametrized)
    # params dict; use a non-destructive lookup instead.
    tol = 1e-4 if params.get("fit_intercept", True) else 1e-1
    n = min(culars.n_iter_, sklars.n_iter_)
    assert array_equal(culars.alphas_[:n], sklars.alphas_[:n],
                       unit_tol=tol, total_tol=1e-4)
    assert array_equal(culars.active_[:n], sklars.active_[:n])

    if limit_max_iter:
        assert array_equal(culars.coef_, sklars.coef_)

        if hasattr(sklars, 'coef_path_'):
            assert array_equal(culars.coef_path_,
                               sklars.coef_path_[sklars.active_],
                               unit_tol=1e-3)

        intercept_diff = abs(culars.intercept_ - sklars.intercept_)
        if abs(sklars.intercept_) > 1e-6:
            intercept_diff /= sklars.intercept_
        assert intercept_diff <= 1e-3
def test_pca_fit(nrows, ncols, n_parts, input_type, cluster):
    """Dask PCA fit attributes should match sklearn PCA on the same blobs."""
    client = Client(cluster)

    try:
        from cuml.dask.decomposition import PCA as daskPCA
        from sklearn.decomposition import PCA
        from cuml.dask.datasets import make_blobs

        X, _ = make_blobs(n_samples=nrows, n_features=ncols, centers=1,
                          n_parts=n_parts, cluster_std=0.5,
                          random_state=10, dtype=np.float32)
        wait(X)

        if input_type == "dataframe":
            X_train = to_dask_cudf(X)
            X_cpu = X_train.compute().to_pandas().values
        elif input_type == "array":
            X_train = X
            X_cpu = cp.asnumpy(X_train.compute())

        # BUG FIX: the fit was wrapped in a try/except that printed and
        # swallowed any error, letting the test continue and fail later with
        # a confusing AttributeError on the unfitted estimator.
        cupca = daskPCA(n_components=5, whiten=True)
        cupca.fit(X_train)

        skpca = PCA(n_components=5, whiten=True, svd_solver="full")
        skpca.fit(X_cpu)

        from cuml.test.utils import array_equal

        all_attr = ['singular_values_', 'components_',
                    'explained_variance_', 'explained_variance_ratio_']

        for attr in all_attr:
            # Component sign is arbitrary in SVD; ignore sign for components_.
            with_sign = False if attr in ['components_'] else True
            cuml_res = getattr(cupca, attr)
            skl_res = getattr(skpca, attr)
            # BUG FIX: np.ndarray has no .as_matrix(); the old conversion
            # branch raised AttributeError whenever it was taken. Numpy
            # arrays can be compared directly.
            assert array_equal(cuml_res, skl_res, 1e-1, with_sign=with_sign)
    finally:
        client.close()
def test_lasso(input_type, selection):
    """cuML Lasso MSE should match sklearn Lasso on a small regression."""
    n_samples = 20
    n_feats = 5
    dtype = np.float64
    train_rows = np.int32(n_samples * 0.8)

    X, y = make_regression(n_samples=n_samples, n_features=n_feats,
                           n_informative=n_feats, random_state=0)

    # 80/20 split by row index, cast to float64.
    X_test = np.array(X[train_rows:, 0:]).astype(dtype)
    y_train = np.array(y[0:train_rows, ]).astype(dtype)
    y_test = np.array(y[train_rows:, ]).astype(dtype)
    X_train = np.array(X[0:train_rows, :]).astype(dtype)

    sklas = Lasso(alpha=np.array([0.01]), fit_intercept=True,
                  normalize=False, max_iter=1000,
                  selection=selection, tol=1e-10)
    sklas.fit(X_train, y_train)
    sk_predict = sklas.predict(X_test)

    cu_lasso = cuLasso(alpha=np.array([0.01]), fit_intercept=True,
                       normalize=False, max_iter=1000,
                       selection=selection, tol=1e-10)

    if input_type == 'dataframe':
        X_train = pd.DataFrame(
            {'fea%d' % i: X_train[0:, i] for i in range(X_train.shape[1])})
        y_train = pd.DataFrame({'fea0': y[0:train_rows, ]})
        X_test = pd.DataFrame(
            {'fea%d' % i: X_test[0:, i] for i in range(X_test.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X_train)
        y_cudf = y_train.values
        y_cudf = y_cudf[:, 0]
        y_cudf = cudf.Series(y_cudf)
        X_cudf_test = cudf.DataFrame.from_pandas(X_test)
        cu_lasso.fit(X_cudf, y_cudf)
        cu_predict = cu_lasso.predict(X_cudf_test).to_array()
    else:
        # BUG FIX: the ndarray path fit on the full (X, y) instead of the
        # training split, making the comparison with sklearn inconsistent
        # (sklearn above trains on X_train/y_train only).
        cu_lasso.fit(X_train, y_train)
        cu_predict = cu_lasso.predict(X_test).to_array()

    error_sk = mean_squared_error(y_test, sk_predict)
    error_cu = mean_squared_error(y_test, cu_predict)
    assert array_equal(error_sk, error_cu, 1e-2, with_sign=True)
def test_nonmonotonic_labels():
    """1-NN must reproduce labels that are not monotonically increasing."""
    features = np.array([[0, 0, 1], [1, 0, 1]]).astype(np.float32)
    targets = np.array([15, 5]).astype(np.int32)

    model = cuKNN(n_neighbors=1)
    model.fit(features, targets)

    predictions = model.predict(features)
    assert array_equal(predictions.astype(np.int32), targets)
def test_output_args(small_classifier_and_preds):
    """FIL predictions on a plain ndarray input must match XGBoost's."""
    model_path, model_type, X, xgb_preds = small_classifier_and_preds

    forest = ForestInference.load(model_path,
                                  model_type=model_type,
                                  algo='TREE_REORG',
                                  output_class=False,
                                  threshold=0.50)

    dense_input = np.asarray(X)
    fil_preds = np.reshape(forest.predict(dense_input), np.shape(xgb_preds))

    assert array_equal(fil_preds, xgb_preds, 1e-3)
def test_fil_skl_regression(n_rows, n_columns, n_estimators, max_depth,
                            storage_type, model_class):
    """FIL loaded from an sklearn regressor must match its predictions."""
    # BUG FIX: a bare `return` reported the unsupported combination as a
    # *passing* test; skip it explicitly so the report reflects reality.
    if max_depth == 20 and storage_type == 'DENSE':
        pytest.skip("skipping max_depth 20 for dense storage")

    # settings
    n_categories = 1
    random_state = np.random.RandomState(43210)

    X, y = simulate_data(n_rows, n_columns, n_categories,
                         random_state=random_state,
                         classification=False)
    # identify shape and indices
    train_size = 0.80

    X_train, X_validation, y_train, y_validation = train_test_split(
        X, y, train_size=train_size, random_state=0)

    init_kwargs = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
    }
    if model_class == RandomForestRegressor:
        init_kwargs['max_features'] = 0.3
        init_kwargs['n_jobs'] = -1
    else:
        # model_class == GradientBoostingRegressor
        init_kwargs['init'] = 'zero'

    skl_model = model_class(**init_kwargs)
    skl_model.fit(X_train, y_train)

    skl_preds = skl_model.predict(X_validation)
    skl_mse = mean_squared_error(y_validation, skl_preds)

    # Sparse storage is exercised with NAIVE, dense with BATCH_TREE_REORG.
    algo = 'NAIVE' if storage_type == 'SPARSE' else 'BATCH_TREE_REORG'

    fm = ForestInference.load_from_sklearn(skl_model,
                                           algo=algo,
                                           output_class=False,
                                           storage_type=storage_type)
    fil_preds = np.asarray(fm.predict(X_validation))
    fil_preds = np.reshape(fil_preds, np.shape(skl_preds))

    fil_mse = mean_squared_error(y_validation, fil_preds)

    assert fil_mse == pytest.approx(skl_mse, 1e-4)
    assert array_equal(fil_preds, skl_preds)
def test_logistic_regression_predict_proba(dtype, nrows, column_info,
                                           num_classes, fit_intercept,
                                           sparse_input):
    """predict_proba / predict_log_proba should match an sklearn model that
    is given cuML's fitted coefficients."""
    ncols, n_info = column_info
    X_train, X_test, y_train, y_test = make_classification_dataset(
        datatype=dtype, nrows=nrows, ncols=ncols, n_info=n_info,
        num_classes=num_classes)
    X_train = csr_matrix(X_train) if sparse_input else X_train
    X_test = csr_matrix(X_test) if sparse_input else X_test
    y_train = y_train.astype(dtype)
    y_test = y_test.astype(dtype)

    culog = cuLog(fit_intercept=fit_intercept, output_type="numpy")
    culog.fit(X_train, y_train)

    # Build an sklearn model and transplant cuML's parameters into it so
    # both models evaluate the exact same decision function.
    if num_classes > 2:
        sklog = skLog(fit_intercept=fit_intercept, solver="lbfgs",
                      multi_class="multinomial")
    else:
        sklog = skLog(fit_intercept=fit_intercept)

    sklog.coef_ = culog.coef_.T
    if fit_intercept:
        sklog.intercept_ = culog.intercept_
    else:
        # BUG FIX: this previously assigned to the *class* (skLog.intercept_),
        # leaking state across tests and leaving this instance without an
        # instance-level intercept.
        sklog.intercept_ = 0
    sklog.classes_ = np.arange(num_classes)

    cu_proba = culog.predict_proba(X_test)
    sk_proba = sklog.predict_proba(X_test)
    cu_log_proba = culog.predict_log_proba(X_test)
    sk_log_proba = sklog.predict_log_proba(X_test)

    assert array_equal(cu_proba, sk_proba)
    assert array_equal(cu_log_proba, sk_log_proba)
def test_sparse_pca_inputs(nrows, ncols, whiten, return_sparse, cupy_input):
    """PCA fit/transform/inverse round-trip on sparse input matrices."""
    if ncols == 20000 and pytest.max_gpu_memory < 48:
        if pytest.adapt_stress_test:
            ncols = int(ncols * pytest.max_gpu_memory / 48)
        else:
            pytest.skip("Insufficient GPU memory for this test."
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")
    if return_sparse:
        pytest.skip("Loss of information in converting to cupy sparse csr")

    X = cupyx.scipy.sparse.random(nrows, ncols, density=0.07,
                                  dtype=cp.float32, random_state=10)
    # Move to host (scipy sparse) when the cupy-input variant is disabled.
    if not cupy_input:
        X = X.get()

    model = cuPCA(n_components=ncols, whiten=whiten)
    model.fit(X)
    reduced = model.transform(X)
    recovered = model.inverse_transform(reduced,
                                        return_sparse=return_sparse)

    if return_sparse:
        assert isinstance(recovered, cupyx.scipy.sparse.csr_matrix)
        assert array_equal(recovered.todense(), X.todense(),
                           1e-1, with_sign=True)
    else:
        if cupy_input:
            assert isinstance(recovered, cp.ndarray)
        assert array_equal(recovered, X.todense(), 1e-1, with_sign=True)
def test_fil_classification(n_rows, n_columns, num_rounds, n_classes,
                            tmp_path):
    """FIL must reproduce an XGBoost classifier's labels and probabilities."""
    classification = True  # change this to false to use regression
    random_state = np.random.RandomState(43210)

    X, y = simulate_data(n_rows, n_columns, n_classes,
                         random_state=random_state,
                         classification=classification)
    n_rows, n_columns = X.shape
    train_size = 0.80
    X_train, X_validation, y_train, y_validation = train_test_split(
        X, y, train_size=train_size, random_state=0)

    model_path = os.path.join(tmp_path, 'xgb_class.model')
    booster = _build_and_save_xgboost(model_path, X_train, y_train,
                                      num_rounds=num_rounds,
                                      classification=classification,
                                      n_classes=n_classes)

    dvalidation = xgb.DMatrix(X_validation, label=y_validation)
    if n_classes == 2:
        # Binary: the booster emits P(class 1); derive integer labels and
        # build the two-column probability matrix [P(0), P(1)].
        xgb_preds = booster.predict(dvalidation)
        xgb_preds_int = np.around(xgb_preds)
        xgb_proba = np.stack([1 - xgb_preds, xgb_preds], axis=1)
    else:
        xgb_proba = booster.predict(dvalidation)
        xgb_preds_int = xgb_proba.argmax(axis=1)
    xgb_acc = accuracy_score(y_validation, xgb_preds_int)

    fil_model = ForestInference.load(model_path, algo='auto',
                                     output_class=True, threshold=0.50)
    fil_preds = np.asarray(fil_model.predict(X_validation))
    fil_proba = np.asarray(fil_model.predict_proba(X_validation))
    fil_acc = accuracy_score(y_validation, fil_preds)

    assert fil_acc == pytest.approx(xgb_acc, abs=0.01)
    assert array_equal(fil_preds, xgb_preds_int)
    np.testing.assert_allclose(fil_proba, xgb_proba,
                               atol=proba_atol[n_classes > 2])
def test_elastic_net(datatype, X_type, alpha, algorithm,
                     nrows, ncols, n_info):
    """cuML ElasticNet predictions should track sklearn's on the same split."""
    train_rows = np.int32(nrows * 0.8)
    X, y = make_regression(n_samples=nrows, n_features=ncols,
                           n_informative=n_info, random_state=0)

    # 80/20 split, cast to the requested dtype.
    X_test = np.asarray(X[train_rows:, 0:], dtype=datatype)
    X_train = np.asarray(X[0:train_rows, :], dtype=datatype)
    y_train = np.asarray(y[0:train_rows, ], dtype=datatype)

    elastic_cu = cuElasticNet(alpha=np.array([alpha]), fit_intercept=True,
                              normalize=False, max_iter=1000,
                              selection=algorithm, tol=1e-10)

    if X_type == 'dataframe':
        y_train = pd.DataFrame({'fea0': y_train[0:, ]})
        X_train = pd.DataFrame(
            {'fea%d' % i: X_train[0:, i] for i in range(X_train.shape[1])})
        X_test = pd.DataFrame(
            {'fea%d' % i: X_test[0:, i] for i in range(X_test.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X_train)
        X_cudf_test = cudf.DataFrame.from_pandas(X_test)
        y_cudf = cudf.Series(y_train.values[:, 0])

        elastic_cu.fit(X_cudf, y_cudf)
        cu_predict = elastic_cu.predict(X_cudf_test)
    elif X_type == 'ndarray':
        elastic_cu.fit(X_train, y_train)
        cu_predict = elastic_cu.predict(X_test)

    # The sklearn reference is skipped for the very large stress inputs.
    if nrows < 500000:
        elastic_sk = ElasticNet(alpha=np.array([alpha]), fit_intercept=True,
                                normalize=False, max_iter=1000,
                                selection=algorithm, tol=1e-10)
        elastic_sk.fit(X_train, y_train)
        sk_predict = elastic_sk.predict(X_test)

        assert array_equal(sk_predict, cu_predict, 1e-1, with_sign=True)
def test_pca_fit(nrows, ncols, n_parts, client=None):
    """Dask PCA attributes vs sklearn PCA.

    If no client is supplied, a LocalCUDACluster is created and torn down by
    this test itself.
    """
    owns_cluster = False
    if client is None:
        owns_cluster = True
        cluster = LocalCUDACluster(threads_per_worker=1)
        client = Client(cluster)

    from cuml.dask.decomposition import PCA as daskPCA
    from sklearn.decomposition import PCA
    from cuml.dask.datasets import make_blobs

    X_cudf, _ = make_blobs(nrows, ncols, 1, n_parts,
                           cluster_std=0.5, verbose=False,
                           random_state=10, dtype=np.float32)
    wait(X_cudf)

    X = X_cudf.compute().to_pandas().values

    cupca = daskPCA(n_components=5, whiten=True)
    cupca.fit(X_cudf)

    skpca = PCA(n_components=5, whiten=True, svd_solver="full")
    skpca.fit(X)

    from cuml.test.utils import array_equal

    all_attr = ['singular_values_', 'components_',
                'explained_variance_', 'explained_variance_ratio_']

    # The fitted attributes are already materialized, so the self-owned
    # cluster can be shut down before the comparisons.
    if owns_cluster:
        client.close()
        cluster.close()

    for attr in all_attr:
        # Component sign is arbitrary in SVD; ignore sign for components_.
        with_sign = False if attr in ['components_'] else True
        cuml_res = getattr(cupca, attr)
        skl_res = getattr(skpca, attr)
        # BUG FIX: np.ndarray has no .as_matrix(); the old conversion branch
        # raised AttributeError whenever cuml_res was already a numpy array.
        assert array_equal(cuml_res, skl_res, 1e-3, with_sign=with_sign)
def test_basic_functions(labels, cluster):
    """Distributed LabelBinarizer fit/transform/inverse round-trip."""
    client = None
    try:
        client = Client(cluster)
        fit_labels, xform_labels = labels

        # Stage both label sets on device and wrap them as dask arrays.
        fit_device = cp.asarray(fit_labels, dtype=np.int32)
        fit_dask = dask.array.from_array(fit_device)
        xform_device = cp.asarray(xform_labels, dtype=np.int32)
        xform_dask = dask.array.from_array(xform_device)

        binarizer = LabelBinarizer(client=client, sparse_output=False)
        binarizer.fit(fit_dask)

        assert array_equal(cp.asnumpy(binarizer.classes_),
                           np.unique(cp.asnumpy(fit_device)))

        xformed = binarizer.transform(xform_dask)
        # Pull each chunk back to host so chunk sizes can be computed.
        xformed = xformed.map_blocks(lambda x: x.get(), dtype=cp.float32)
        xformed.compute_chunk_sizes()
        assert xformed.compute().shape[1] == binarizer.classes_.shape[0]

        original = binarizer.inverse_transform(xformed)
        roundtrip = original.compute()

        assert array_equal(cp.asnumpy(roundtrip), xform_labels)
    finally:
        if client is not None:
            print("Closing client")
            client.close()
def test_linear_models_set_params(algo):
    """set_params must have the same effect as passing kwargs at init."""
    x = np.linspace(0, 1, 50)
    y = 2 * x

    # Baseline fit with default parameters.
    baseline = algo()
    baseline.fit(x, y)
    coef_before = baseline.coef_

    # Fit with non-default parameters passed at construction time.
    if algo == cuLog:
        params = {'penalty': "none", 'C': 1, 'max_iter': 30}
        tuned = algo(penalty='none', C=1, max_iter=30)
    else:
        tuned = algo(solver='svd', alpha=0.1)
        params = {'solver': "svd", 'alpha': 0.1}
    tuned.fit(x, y)
    coef_after = tuned.coef_

    # Same parameters applied through set_params() instead.
    configured = algo()
    configured.set_params(**params)
    configured.fit(x, y)
    coef_test = configured.coef_

    assert not array_equal(coef_before, coef_after)
    assert array_equal(coef_after, coef_test)
def test_targetencoder_cupy():
    """
    Note that there are newly-encountered values in x_test,
    namely, 3 and 4. They should be encoded with the global target mean.
    """
    x_train = cp.array([1, 2, 2, 1])
    y_train = cp.array([1, 0, 1, 1])
    x_test = cp.array([1, 2, 3, 4])

    encoder = TargetEncoder()
    encoder.fit_transform(x_train, y_train)
    encoded = encoder.transform(x_test)

    expected = np.array([1., 0.5, 0.75, 0.75])
    assert array_equal(encoded, expected)
    print(type(encoded))
    assert isinstance(encoded, cp.ndarray)
def test_tsvd_inverse_transform(datatype, input_type):
    """inverse_transform(fit_transform(X)) should approximately recover X."""
    gdf = cudf.DataFrame()
    gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
    gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)

    cutsvd = cuTSVD(n_components=1)
    if input_type == 'dataframe':
        reduced = cutsvd.fit_transform(gdf)
    else:
        dense = np.array([[-1, -1], [-2, -1], [-3, -2],
                          [1, 1], [2, 1], [3, 2]], dtype=datatype)
        reduced = cutsvd.fit_transform(dense)

    recovered = cutsvd.inverse_transform(reduced)
    # A single component cannot capture everything; hence loose tolerance.
    assert array_equal(recovered, gdf, 0.4, with_sign=True)
def test_ridge_regression_model_default(datatype):
    """Default-parameter cuML Ridge should match sklearn's predictions."""
    X_train, X_test, y_train, y_test = small_regression_dataset(datatype)

    cu_model = cuRidge()
    cu_model.fit(X_train, y_train)
    cu_pred = cu_model.predict(X_test)

    sk_model = skRidge()
    sk_model.fit(X_train, y_train)
    sk_pred = sk_model.predict(X_test)

    assert array_equal(sk_pred, cu_pred, 1e-1, with_sign=True)
def test_lightgbm(tmp_path, num_classes):
    """FIL should reproduce LightGBM predictions (binary and one-vs-all)."""
    import lightgbm as lgb

    X, y = simulate_data(500,
                         10 if num_classes == 2 else 50,
                         num_classes,
                         random_state=43210,
                         classification=True)
    train_data = lgb.Dataset(X, label=y)

    if num_classes == 2:
        param = {'objective': 'binary',
                 'metric': 'binary_logloss',
                 'num_class': 1}
    else:
        param = {'objective': 'ova',  # 'multiclass', would use softmax
                 'metric': 'multi_logloss',
                 'num_class': num_classes}

    num_round = 5
    booster = lgb.train(param, train_data, num_round)
    gbm_preds = booster.predict(X)
    if num_classes > 2:
        gbm_preds = gbm_preds.argmax(axis=1)

    model_path = str(os.path.join(tmp_path, 'lgb.model'))
    booster.save_model(model_path)

    fil_model = ForestInference.load(model_path,
                                     algo='TREE_REORG',
                                     output_class=True,
                                     model_type="lightgbm")
    assert array_equal(np.round(gbm_preds), fil_model.predict(X))

    if num_classes == 2:
        # Also exercise predict_proba via the sklearn-style wrapper.
        classifier = lgb.LGBMClassifier().set_params(**param)
        classifier.fit(X, y)
        gbm_proba = classifier.predict_proba(X)

        classifier.booster_.save_model(model_path)
        fil_model = ForestInference.load(model_path,
                                         algo='TREE_REORG',
                                         output_class=True,
                                         model_type="lightgbm")
        fil_proba = fil_model.predict_proba(X)
        assert np.allclose(gbm_proba, fil_proba, 1e-2)
def test_monotonic_validate_invert_labels(arr_type, dtype, copy):
    """Round-trip make_monotonic -> check_labels -> invert_labels.

    Also verifies the copy/in-place contract for device (cupy) inputs: with
    copy=True the input array must be left untouched; with copy=False the
    mapping is written back into the input array itself.
    """
    arr = np.array([0, 15, 10, 50, 20, 50], dtype=dtype)
    original = arr.copy()

    if arr_type == "cp":
        arr = cp.asarray(arr, dtype=dtype)
        # Snapshot taken before make_monotonic to test in-place behavior.
        arr_orig = arr.copy()

    monotonic, mapped_classes = make_monotonic(arr, copy=copy)

    cp.cuda.Stream.null.synchronize()

    # Each label is replaced by its rank in the sorted unique label set:
    # [0, 15, 10, 50, 20, 50] -> [0, 2, 1, 4, 3, 4]
    assert array_equal(monotonic.get(), np.array([0, 2, 1, 4, 3, 4]))

    # We only care about in-place updating if data is on device
    if arr_type == "cp":
        if copy:
            assert array_equal(arr_orig.get(), arr.get())
        else:
            assert array_equal(arr.get(), monotonic.get())

    # check_labels must reject a class set that does not cover the labels...
    wrong_classes = cp.asarray([0, 1, 2], dtype=dtype)
    val_labels = check_labels(monotonic.get(), classes=wrong_classes)

    cp.cuda.Stream.null.synchronize()

    assert not val_labels

    # ...and accept the full monotonic class set.
    correct_classes = cp.asarray([0, 1, 2, 3, 4], dtype=dtype)
    val_labels = check_labels(monotonic.get(), classes=correct_classes)

    cp.cuda.Stream.null.synchronize()

    assert val_labels

    if arr_type == "cp":
        monotonic_copy = monotonic.copy()

    # Inverting with the original class values should recover the input.
    inverted = invert_labels(monotonic,
                             classes=cp.asarray([0, 10, 15, 20, 50],
                                                dtype=dtype),
                             copy=copy)

    cp.cuda.Stream.null.synchronize()

    if arr_type == "cp":
        if copy:
            # copy=True: monotonic must be untouched by invert_labels.
            assert array_equal(monotonic_copy.get(), monotonic.get())
        else:
            # copy=False: monotonic was inverted in place back to the input.
            assert array_equal(monotonic.get(), arr_orig.get())

    assert array_equal(inverted.get(), original)
def test_targetencoder_pandas():
    """
    Note that there are newly-encountered values in test,
    namely, 'c' and 'd'. They should be encoded with the global target mean.
    """
    train = pandas.DataFrame({
        'category': ['a', 'b', 'b', 'a'],
        'label': [1, 0, 1, 1]
    })
    test = pandas.DataFrame({'category': ['c', 'b', 'a', 'd']})

    encoder = TargetEncoder()
    encoder.fit_transform(train.category, train.label)
    encoded = encoder.transform(test.category)

    expected = np.array([0.75, 0.5, 1., 0.75])
    assert array_equal(encoded, expected)
    print(type(encoded))
    assert isinstance(encoded, np.ndarray)
def test_pca_inverse_transform(datatype, input_type):
    """PCA with all components must reconstruct the input almost exactly."""
    gdf = cudf.DataFrame()
    gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
    gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)

    cupca = cuPCA(n_components=2)
    if input_type == 'dataframe':
        reduced = cupca.fit_transform(gdf)
    else:
        dense = np.array([[-1, -1], [-2, -1], [-3, -2],
                          [1, 1], [2, 1], [3, 2]], dtype=datatype)
        reduced = cupca.fit_transform(dense)

    recovered = cupca.inverse_transform(reduced)
    # Both components are kept, so reconstruction should be near-exact.
    assert array_equal(recovered, gdf, 1e-3, with_sign=True)
def test_pca_fit(datatype, input_type, name, use_handle):
    """cuML PCA fitted attributes should closely match sklearn's PCA."""
    if name == 'blobs':
        # NOTE: the original code had an unreachable make_blobs() call after
        # this skip; the dead statement has been removed.
        pytest.skip('fails when using blobs dataset')
    elif name == 'iris':
        iris = datasets.load_iris()
        X = iris.data
    else:
        X = np.array([[-1, -1], [-2, -1], [-3, -2],
                      [1, 1], [2, 1], [3, 2]], dtype=datatype)

    skpca = skPCA(n_components=2)
    skpca.fit(X)

    handle, stream = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)

    if input_type == 'dataframe':
        X = pd.DataFrame({'fea%d' % i: X[0:, i] for i in range(X.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X)
        cupca.fit(X_cudf)
    else:
        cupca.fit(X)

    cupca.handle.sync()

    for attr in [
            'singular_values_', 'components_', 'explained_variance_',
            'explained_variance_ratio_', 'noise_variance_'
    ]:
        # Component sign is arbitrary in SVD; ignore sign for components_.
        with_sign = False if attr in ['components_'] else True
        print(attr)
        print(getattr(cupca, attr))
        print(getattr(skpca, attr))
        cuml_res = getattr(cupca, attr)
        if isinstance(cuml_res, cudf.Series):
            cuml_res = cuml_res.to_array()
        elif hasattr(cuml_res, 'as_matrix'):
            # BUG FIX: the old code called .as_matrix() unconditionally in
            # the else branch, raising AttributeError whenever the attribute
            # was already a plain array; guard it instead.
            cuml_res = cuml_res.as_matrix()
        skl_res = getattr(skpca, attr)
        assert array_equal(cuml_res, skl_res, 1e-1, with_sign=with_sign)
def test_pca_fit(nrows, ncols, n_parts, cluster):
    """Dask PCA attributes should match sklearn PCA on the same blobs."""
    client = Client(cluster)

    try:
        from cuml.dask.decomposition import PCA as daskPCA
        from sklearn.decomposition import PCA
        from cuml.dask.datasets import make_blobs

        X_cudf, _ = make_blobs(nrows, ncols, 1, n_parts,
                               cluster_std=0.5, verbose=False,
                               random_state=10, dtype=np.float32)
        wait(X_cudf)
        print(str(X_cudf.head(3)))

        # BUG FIX: fit errors were caught and merely printed, letting the
        # test continue and fail later with a confusing AttributeError on
        # the unfitted estimator. Let fit failures surface here.
        cupca = daskPCA(n_components=5, whiten=True)
        cupca.fit(X_cudf)

        X = X_cudf.compute().to_pandas().values
        skpca = PCA(n_components=5, whiten=True, svd_solver="full")
        skpca.fit(X)

        from cuml.test.utils import array_equal

        all_attr = ['singular_values_', 'components_',
                    'explained_variance_', 'explained_variance_ratio_']

        for attr in all_attr:
            # Component sign is arbitrary in SVD; ignore it for components_.
            with_sign = False if attr in ['components_'] else True
            cuml_res = getattr(cupca, attr)
            skl_res = getattr(skpca, attr)
            # BUG FIX: np.ndarray has no .as_matrix(); the old conversion
            # branch raised AttributeError whenever it was taken.
            assert array_equal(cuml_res, skl_res, 1e-3, with_sign=with_sign)
    finally:
        client.close()
def test_tsvd_fit_transform(datatype, input_type):
    """cuML TSVD fit_transform should match sklearn's TruncatedSVD."""
    dense = np.array([[-1, -1], [-2, -1], [-3, -2],
                      [1, 1], [2, 1], [3, 2]], dtype=datatype)

    reference = skTSVD(n_components=1)
    expected = reference.fit_transform(dense)

    cutsvd = cuTSVD(n_components=1)
    if input_type == 'dataframe':
        # Same data expressed column-wise as a cudf DataFrame.
        gdf = cudf.DataFrame()
        gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
        gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)
        actual = cutsvd.fit_transform(gdf)
    else:
        actual = cutsvd.fit_transform(dense)

    assert array_equal(actual, expected, 1e-3, with_sign=True)
def test_linear_regression_model_default(datatype):
    """Default cuML LinearRegression should match sklearn's predictions."""
    X_train, X_test, y_train, y_test = small_regression_dataset(datatype)

    cu_model = cuLinearRegression()
    cu_model.fit(X_train, y_train)
    cu_pred = cu_model.predict(X_test)

    sk_model = skLinearRegression()
    sk_model.fit(X_train, y_train)
    sk_pred = sk_model.predict(X_test)

    assert array_equal(sk_pred, cu_pred, 1e-1, with_sign=True)
def test_targetencoder_random(n_samples, dtype):
    """Encoder output must equal a groupby-mean with global-mean fallback."""
    x = cp.random.randint(0, 1000, n_samples).astype(dtype)
    y = cp.random.randint(0, 2, n_samples).astype(dtype)
    xt = cp.random.randint(0, 1000, n_samples).astype(dtype)

    encoder = TargetEncoder()
    encoder.fit_transform(x, y)
    encoded = encoder.transform(xt)

    # Expected values: per-category mean of y in original row order, with
    # categories unseen during fit taking the global target mean.
    train_df = cudf.DataFrame({'x': x, 'y': y})
    means = train_df.groupby('x', as_index=False).agg({'y': 'mean'})

    test_df = cudf.DataFrame({'x': xt})
    test_df['row_id'] = cp.arange(len(test_df))
    test_df = test_df.merge(means, on='x', how='left')
    test_df = test_df.sort_values('row_id')
    expected = test_df['y'].fillna(cp.mean(y).item()).values

    assert array_equal(encoded, expected)
def test_ann_distances_metrics(algo, metric):
    """ANN kneighbors distances should match sklearn's exact distances."""
    X, y = make_blobs(n_samples=500, centers=2,
                      n_features=128, random_state=0)

    cu_knn = cuKNN(algorithm=algo, metric=metric)
    cu_knn.fit(X)
    cu_dist, cu_ind = cu_knn.kneighbors(X, n_neighbors=10,
                                        return_distance=True)
    # Free the device index before running the CPU reference.
    del cu_knn
    gc.collect()

    X = X.get()
    sk_knn = skKNN(metric=metric)
    sk_knn.fit(X)
    sk_dist, sk_ind = sk_knn.kneighbors(X, n_neighbors=10,
                                        return_distance=True)

    # BUG FIX: the comparison was `return`ed instead of asserted, so pytest
    # ignored the result and this test could never fail.
    assert array_equal(sk_dist, cu_dist)
def test_ivfflat_pred(nrows, ncols, n_neighbors, nlist):
    """kNN classification through an IVF-Flat index should recover labels."""
    algo_params = {'nlist': nlist, 'nprobe': nlist * 0.25}
    X, y = make_blobs(n_samples=nrows, centers=5,
                      n_features=ncols, random_state=0)

    index = cuKNN(algorithm="ivfflat", algo_params=algo_params)
    index.fit(X)
    neigh_ind = index.kneighbors(X, n_neighbors=n_neighbors,
                                 return_distance=False)
    # Release the index (and its device memory) before prediction.
    del index
    gc.collect()

    labels, probs = predict(neigh_ind, y, n_neighbors)

    assert array_equal(labels, y)
def test_cov(nrows, ncols, sparse, dtype):
    """cov() must match cupy's covariance computed with ddof=0."""
    if sparse:
        data = cupyx.scipy.sparse.random(nrows, ncols, density=0.07,
                                         format='csr', dtype=dtype)
    else:
        data = cp.random.random((nrows, ncols), dtype=dtype)

    result = cov(data, data)
    assert result.shape == (ncols, ncols)

    # Densify before computing the reference with cupy.
    dense = data.todense() if sparse else data
    expected = cp.cov(dense, rowvar=False, ddof=0)

    assert array_equal(result, expected, 1e-6, with_sign=True)
def test_dbscan(datatype, use_handle, nrows, ncols, max_mbytes_per_batch,
                out_dtype):
    """cuML DBSCAN vs sklearn: core samples, labels, and output dtype."""
    if nrows == 500000 and pytest.max_gpu_memory < 32:
        if pytest.adapt_stress_test:
            nrows = nrows * pytest.max_gpu_memory // 32
        else:
            pytest.skip("Insufficient GPU memory for this test. "
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    X, y = make_blobs(n_samples=nrows,
                      cluster_std=0.01,
                      n_features=ncols,
                      random_state=0)

    handle, stream = get_handle(use_handle)

    eps = 1
    cuml_dbscan = cuDBSCAN(handle=handle,
                           eps=eps,
                           min_samples=2,
                           max_mbytes_per_batch=max_mbytes_per_batch,
                           output_type='numpy')
    cu_labels = cuml_dbscan.fit_predict(X, out_dtype=out_dtype)

    # The sklearn reference is skipped for the stress-sized inputs.
    if nrows < 500000:
        sk_dbscan = skDBSCAN(eps=1, min_samples=2, algorithm="brute")
        sk_labels = sk_dbscan.fit_predict(X)

        # Check the core points are equal
        assert array_equal(cuml_dbscan.core_sample_indices_,
                           sk_dbscan.core_sample_indices_)

        # Check the labels are correct
        assert_dbscan_equal(sk_labels, cu_labels, X,
                            cuml_dbscan.core_sample_indices_, eps)

    # The requested output dtype must be honored.
    if out_dtype == "int32" or out_dtype == np.int32:
        assert cu_labels.dtype == np.int32
    elif out_dtype == "int64" or out_dtype == np.int64:
        assert cu_labels.dtype == np.int64