def test_dbscan_predict(datatype, input_type, use_handle,
                        max_bytes_per_batch):
    # max_bytes_per_batch sizes: 10=6 batches, 200=2 batches, 2e6=1 batch
    X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]],
                 dtype=datatype)
    skdbscan = skDBSCAN(eps=3, min_samples=2)
    sk_labels = skdbscan.fit_predict(X)

    handle, stream = get_handle(use_handle)
    cudbscan = cuDBSCAN(handle=handle, eps=3, min_samples=2,
                        max_bytes_per_batch=max_bytes_per_batch)

    if input_type == 'dataframe':
        gdf = cudf.DataFrame()
        gdf['0'] = np.asarray([1, 2, 2, 8, 8, 25], dtype=datatype)
        gdf['1'] = np.asarray([2, 2, 3, 7, 8, 80], dtype=datatype)
        cu_labels = cudbscan.fit_predict(gdf)
    else:
        cu_labels = cudbscan.fit_predict(X)
    cudbscan.handle.sync()

    for i in range(X.shape[0]):
        assert cu_labels[i] == sk_labels[i]
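# ---------------------------------------------------------------------
# Assumed preamble (a sketch). These tests depend on imports and small
# helpers defined elsewhere in the suite; everything below is inferred
# from how the names are used in this file, not copied from the source.
# A few metric imports (adjusted_rand_score, accuracy_score,
# mean_squared_error, entropy) are omitted because their exact home
# (cuml.metrics vs sklearn.metrics) varies by version.
# ---------------------------------------------------------------------
import io
from contextlib import redirect_stdout

import numpy as np
import pandas as pd
import pytest
from numba import cuda
from numpy.testing import assert_almost_equal

import cudf
import cuml
from cuml import DBSCAN as cuDBSCAN, PCA as cuPCA, TruncatedSVD as cuTSVD
from cuml.ensemble import (RandomForestClassifier as curfc,
                           RandomForestRegressor as curfr)
from cuml.metrics import accuracy_score as cu_acc_score, r2_score

from scipy.stats import entropy as sp_entropy
from sklearn import datasets
from sklearn.cluster import DBSCAN as skDBSCAN
from sklearn.datasets import (fetch_california_housing, make_blobs,
                              make_classification,
                              make_multilabel_classification,
                              make_regression)
from sklearn.decomposition import PCA as skPCA, TruncatedSVD as skTSVD
from sklearn.ensemble import (RandomForestClassifier as skrfc,
                              RandomForestRegressor as skrfr)
from sklearn.metrics import accuracy_score as sk_acc_score
from sklearn.model_selection import train_test_split
from sklearn.utils import check_random_state


def get_handle(use_handle, n_streams=0):
    # Sketch of the suite's handle helper: build a cuml.Handle on
    # request. The real helper also returns the handle's CUDA stream;
    # these tests unpack that value but never use it, so None is a
    # safe stand-in here.
    if not use_handle:
        return None, None
    return cuml.Handle(n_streams=n_streams), None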
def test_dbscan_propagation(datatype, use_handle, out_dtype, n_samples):
    X, y = make_blobs(n_samples, centers=1, cluster_std=8.0,
                      center_box=(-100.0, 100.0), random_state=8)
    X = X.astype(datatype)

    handle, stream = get_handle(use_handle)
    eps = 0.5
    cuml_dbscan = cuDBSCAN(handle=handle, eps=eps, min_samples=5,
                           output_type='numpy')
    cu_labels = cuml_dbscan.fit_predict(X, out_dtype=out_dtype)

    sk_dbscan = skDBSCAN(eps=eps, min_samples=5)
    sk_labels = sk_dbscan.fit_predict(X)

    # Check the core points are equal
    assert array_equal(cuml_dbscan.core_sample_indices_,
                       sk_dbscan.core_sample_indices_)

    # Check the labels are correct
    assert_dbscan_equal(sk_labels, cu_labels, X,
                        cuml_dbscan.core_sample_indices_, eps)
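# assert_dbscan_equal is a suite helper not shown here. A minimal
# sketch of its semantics, assuming the usual DBSCAN ambiguities:
# noise status is deterministic, core points must map between the two
# labelings one-to-one, and a border point may legitimately attach to
# any cluster that has a core point within eps of it.
def assert_dbscan_equal(ref_labels, labels, X, core_indices, eps):
    core = np.zeros(len(X), dtype=bool)
    core[core_indices] = True
    core_idx = np.flatnonzero(core)
    mapping = {}
    for i in range(len(X)):
        a, b = ref_labels[i], labels[i]
        # A point is noise iff it is not density-reachable from any
        # core point, so noise must agree exactly.
        assert (a == -1) == (b == -1), f"noise mismatch at point {i}"
        if a == -1:
            continue
        if core[i]:
            # Core labels must correspond under one consistent relabeling
            assert mapping.setdefault(a, b) == b, f"core mismatch at {i}"
        else:
            # Border point: any cluster with a core point within eps
            # of it is a valid assignment.
            d = np.linalg.norm(X[core_idx] - X[i], axis=1)
            assert b in set(labels[core_idx[d <= eps]])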
def test_tsvd_fit(datatype, name, use_handle):
    if name == 'blobs':
        X, y = make_blobs(n_samples=500000, n_features=1000,
                          random_state=0)
    elif name == 'random':
        pytest.skip('fails when using random dataset '
                    'used by sklearn for testing')
        shape = 5000, 100
        rng = check_random_state(42)
        X = rng.randint(-100, 20, np.prod(shape)).reshape(shape)
    else:
        n, p = 500, 5
        rng = np.random.RandomState(0)
        X = rng.randn(n, p) * .1 + np.array([3, 4, 2, 3, 5])

    if name != 'blobs':
        sktsvd = skTSVD(n_components=1)
        sktsvd.fit(X)

    handle, stream = get_handle(use_handle)
    cutsvd = cuTSVD(n_components=1, handle=handle)
    cutsvd.fit(X)
    cutsvd.handle.sync()

    if name != 'blobs':
        for attr in ['singular_values_', 'components_',
                     'explained_variance_ratio_']:
            with_sign = False if attr in ['components_'] else True
            assert array_equal(getattr(cutsvd, attr),
                               getattr(sktsvd, attr), 0.4,
                               with_sign=with_sign)
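# array_equal is the suite's loose comparator, used throughout this
# file. A sketch of its behavior as exercised above (the real helper
# also accepts cudf/cupy inputs and tolerates a small fraction of
# outliers): with_sign=False compares magnitudes only, which matters
# for components_ since singular vectors are only defined up to sign.
def array_equal(a, b, unit_tol=1e-4, with_sign=True):
    a = np.asarray(a, dtype=np.float64).ravel()
    b = np.asarray(b, dtype=np.float64).ravel()
    if not with_sign:
        a, b = np.abs(a), np.abs(b)
    return bool(np.all(np.abs(a - b) <= unit_tol))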
def test_tsvd_fit_transform(datatype, name, use_handle):
    if name == 'blobs':
        X, y = make_blobs(n_samples=500000, n_features=1000,
                          random_state=0)
    elif name == 'random':
        pytest.skip('fails when using random dataset '
                    'used by sklearn for testing')
        shape = 5000, 100
        rng = check_random_state(42)
        X = rng.randint(-100, 20, np.prod(shape)).reshape(shape)
    else:
        n, p = 500, 5
        rng = np.random.RandomState(0)
        X = rng.randn(n, p) * .1 + np.array([3, 4, 2, 3, 5])

    if name != 'blobs':
        sktsvd = skTSVD(n_components=1)
        Xsktsvd = sktsvd.fit_transform(X)

    handle, stream = get_handle(use_handle)
    cutsvd = cuTSVD(n_components=1, handle=handle)
    Xcutsvd = cutsvd.fit_transform(X)
    cutsvd.handle.sync()

    if name != 'blobs':
        assert array_equal(Xcutsvd, Xsktsvd, 1e-3, with_sign=True)
def test_dbscan(datatype, input_type, use_handle, nrows, ncols,
                max_mbytes_per_batch, out_dtype):
    n_samples = nrows
    n_feats = ncols
    X, y = make_blobs(n_samples=n_samples, cluster_std=0.01,
                      n_features=n_feats, random_state=0)

    handle, stream = get_handle(use_handle)
    cudbscan = cuDBSCAN(handle=handle, eps=1, min_samples=2,
                        max_mbytes_per_batch=max_mbytes_per_batch)

    if input_type == 'dataframe':
        X = pd.DataFrame(
            {'fea%d' % i: X[0:, i] for i in range(X.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X)
        cu_labels = cudbscan.fit_predict(X_cudf, out_dtype=out_dtype)
    else:
        cu_labels = cudbscan.fit_predict(X, out_dtype=out_dtype)

    if nrows < 500000:
        skdbscan = skDBSCAN(eps=1, min_samples=2, algorithm="brute")
        sk_labels = skdbscan.fit_predict(X)
        score = adjusted_rand_score(cu_labels, sk_labels)
        assert score == 1

    if out_dtype == "int32" or out_dtype == np.int32:
        assert cu_labels.dtype == np.int32
    elif out_dtype == "int64" or out_dtype == np.int64:
        assert cu_labels.dtype == np.int64
def test_accuracy(nrows, ncols, n_info, datatype):
    use_handle = True
    train_rows = np.int32(nrows * 0.8)
    X, y = make_classification(n_samples=nrows, n_features=ncols,
                               n_clusters_per_class=1, n_informative=n_info,
                               random_state=123, n_classes=5)

    X_test = np.asarray(X[train_rows:, 0:]).astype(datatype)
    y_test = np.asarray(y[train_rows:, ]).astype(np.int32)
    X_train = np.asarray(X[0:train_rows, :]).astype(datatype)
    y_train = np.asarray(y[0:train_rows, ]).astype(np.int32)

    # Create a handle for the cuml model
    handle, stream = get_handle(use_handle)

    # Initialize, fit and predict using cuML's
    # random forest classification model
    cuml_model = curfc(max_features=1.0,
                       n_bins=8, split_algo=0, split_criterion=0,
                       min_rows_per_node=2,
                       n_estimators=40, handle=handle, max_leaves=-1,
                       max_depth=-1)
    cuml_model.fit(X_train, y_train)
    cu_predict = cuml_model.predict(X_test)
    cu_acc = cu_acc_score(y_test, cu_predict)
    cu_acc_using_sk = sk_acc_score(y_test, cu_predict)
    # compare the accuracy of the two models
    assert array_equal(cu_acc, cu_acc_using_sk)
def test_pca_fit_transform(datatype, input_type, name, use_handle):
    if name == 'blobs':
        X, y = make_blobs(n_samples=500000, n_features=1000,
                          random_state=0)
    elif name == 'iris':
        iris = datasets.load_iris()
        X = iris.data
    else:
        X, Y = make_multilabel_classification(n_samples=500,
                                              n_classes=2,
                                              n_labels=1,
                                              allow_unlabeled=False,
                                              random_state=1)

    if name != 'blobs':
        skpca = skPCA(n_components=2)
        Xskpca = skpca.fit_transform(X)

    handle, stream = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)
    X_cupca = cupca.fit_transform(X)
    cupca.handle.sync()

    if name != 'blobs':
        assert array_equal(X_cupca, Xskpca, 1e-3, with_sign=True)
        assert Xskpca.shape[0] == X_cupca.shape[0]
        assert Xskpca.shape[1] == X_cupca.shape[1]
def test_dbscan(datatype, use_handle, nrows, ncols,
                max_mbytes_per_batch, out_dtype):
    n_samples = nrows
    n_feats = ncols
    X, y = make_blobs(n_samples=n_samples, cluster_std=0.01,
                      n_features=n_feats, random_state=0)

    handle, stream = get_handle(use_handle)
    cudbscan = cuDBSCAN(handle=handle, eps=1, min_samples=2,
                        max_mbytes_per_batch=max_mbytes_per_batch,
                        output_type='numpy')
    cu_labels = cudbscan.fit_predict(X, out_dtype=out_dtype)

    if nrows < 500000:
        skdbscan = skDBSCAN(eps=1, min_samples=2, algorithm="brute")
        sk_labels = skdbscan.fit_predict(X)
        score = adjusted_rand_score(cu_labels, sk_labels)
        assert score == 1

    if out_dtype == "int32" or out_dtype == np.int32:
        assert cu_labels.dtype == np.int32
    elif out_dtype == "int64" or out_dtype == np.int64:
        assert cu_labels.dtype == np.int64
def test_dbscan(datatype, use_handle, nrows, ncols,
                max_mbytes_per_batch, out_dtype):
    n_samples = nrows
    n_feats = ncols
    X, y = make_blobs(n_samples=n_samples, cluster_std=0.01,
                      n_features=n_feats, random_state=0)

    handle, stream = get_handle(use_handle)

    eps = 1
    cuml_dbscan = cuDBSCAN(handle=handle, eps=eps, min_samples=2,
                           max_mbytes_per_batch=max_mbytes_per_batch,
                           output_type='numpy')
    cu_labels = cuml_dbscan.fit_predict(X, out_dtype=out_dtype)

    if nrows < 500000:
        sk_dbscan = skDBSCAN(eps=1, min_samples=2, algorithm="brute")
        sk_labels = sk_dbscan.fit_predict(X)

        # Check the core points are equal
        assert array_equal(cuml_dbscan.core_sample_indices_,
                           sk_dbscan.core_sample_indices_)

        # Check the labels are correct
        assert_dbscan_equal(sk_labels, cu_labels, X,
                            cuml_dbscan.core_sample_indices_, eps)

    if out_dtype == "int32" or out_dtype == np.int32:
        assert cu_labels.dtype == np.int32
    elif out_dtype == "int64" or out_dtype == np.int64:
        assert cu_labels.dtype == np.int64
def test_pca_fit(datatype, input_type, name, use_handle):
    if name == 'blobs':
        pytest.skip('fails when using blobs dataset')
        X, y = make_blobs(n_samples=500000, n_features=1000,
                          random_state=0)
    elif name == 'digits':
        X, _ = datasets.load_digits(return_X_y=True)
    else:
        X, Y = make_multilabel_classification(n_samples=500,
                                              n_classes=2,
                                              n_labels=1,
                                              allow_unlabeled=False,
                                              random_state=1)

    skpca = skPCA(n_components=2)
    skpca.fit(X)

    handle, stream = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)
    cupca.fit(X)
    cupca.handle.sync()

    for attr in ['singular_values_', 'components_', 'explained_variance_',
                 'explained_variance_ratio_']:
        with_sign = False if attr in ['components_'] else True
        print(attr)
        print(getattr(cupca, attr))
        print(getattr(skpca, attr))
        cuml_res = getattr(cupca, attr)
        skl_res = getattr(skpca, attr)
        assert array_equal(cuml_res, skl_res, 1e-3, with_sign=with_sign)
def test_pca_inverse_transform(datatype, input_type, name, use_handle):
    if name == 'blobs':
        pytest.skip('fails when using blobs dataset')
        X, y = make_blobs(n_samples=500000, n_features=1000,
                          random_state=0)
    elif name == 'iris':
        iris = datasets.load_iris()
        X = iris.data
    else:
        X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                     dtype=datatype)
        X_pd = pd.DataFrame({'fea%d' % i: X[0:, i]
                             for i in range(X.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X_pd)

    handle, stream = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)

    if input_type == 'dataframe':
        X_cupca = cupca.fit_transform(X_cudf)
    else:
        X_cupca = cupca.fit_transform(X)

    input_gdf = cupca.inverse_transform(X_cupca)
    cupca.handle.sync()
    assert array_equal(input_gdf, X, 1e-0, with_sign=True)
def test_pca_fit_transform(datatype, input_type, name, use_handle):
    if name == 'blobs':
        X, y = make_blobs(n_samples=500000, n_features=1000,
                          random_state=0)
    elif name == 'iris':
        iris = datasets.load_iris()
        X = iris.data
    else:
        X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                     dtype=datatype)

    if name != 'blobs':
        skpca = skPCA(n_components=2)
        Xskpca = skpca.fit_transform(X)

    handle, stream = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)

    if input_type == 'dataframe':
        X = pd.DataFrame({'fea%d' % i: X[0:, i]
                          for i in range(X.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X)
        X_cupca = cupca.fit_transform(X_cudf)
    else:
        X_cupca = cupca.fit_transform(X)

    cupca.handle.sync()

    if name != 'blobs':
        assert array_equal(X_cupca, Xskpca, 1e-3, with_sign=True)
def test_rf_regression_sparse(special_reg, datatype, fil_sparse_format, algo):
    use_handle = True
    num_trees = 50

    X, y = special_reg
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        train_size=0.8,
                                                        random_state=0)

    # Create a handle for the cuml model
    handle, stream = get_handle(use_handle, n_streams=1)

    # Initialize and fit using cuML's random forest regression model
    cuml_model = curfr(n_bins=16, split_criterion=2,
                       min_rows_per_node=2, random_state=123, n_streams=1,
                       n_estimators=num_trees, handle=handle, max_leaves=-1,
                       max_depth=40, accuracy_metric='mse')
    cuml_model.fit(X_train, y_train)

    # predict using FIL
    if ((not fil_sparse_format or algo == 'tree_reorg' or
            algo == 'batch_tree_reorg') or
            fil_sparse_format == 'not_supported'):
        with pytest.raises(ValueError):
            fil_preds = cuml_model.predict(X_test,
                                           predict_model="GPU",
                                           fil_sparse_format=fil_sparse_format,
                                           algo=algo)
    else:
        fil_preds = cuml_model.predict(X_test,
                                       predict_model="GPU",
                                       fil_sparse_format=fil_sparse_format,
                                       algo=algo)
        fil_preds = np.reshape(fil_preds, np.shape(y_test))
        fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype)

        fil_model = cuml_model.convert_to_fil_model()

        input_type = 'numpy'
        fil_model_preds = fil_model.predict(X_test, output_type=input_type)
        fil_model_preds = np.reshape(fil_model_preds, np.shape(y_test))
        fil_model_r2 = r2_score(y_test, fil_model_preds,
                                convert_dtype=datatype)
        assert fil_r2 == fil_model_r2

        tl_model = cuml_model.convert_to_treelite_model()
        assert num_trees == tl_model.num_trees
        assert X.shape[1] == tl_model.num_features

        # Initialize, fit and predict using
        # sklearn's random forest regression model
        if X.shape[0] < 1000:  # mode != "stress":
            sk_model = skrfr(n_estimators=50, max_depth=40,
                             min_samples_split=2,
                             random_state=10)
            sk_model.fit(X_train, y_train)
            sk_preds = sk_model.predict(X_test)
            sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype)
            assert fil_r2 >= (sk_r2 - 0.07)
def test_rf_classification(small_clf, datatype, split_algo, max_samples,
                           max_features, use_experimental_backend):
    use_handle = True

    X, y = small_clf
    X = X.astype(datatype)
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        train_size=0.8,
                                                        random_state=0)
    # Create a handle for the cuml model
    handle, stream = get_handle(use_handle, n_streams=1)

    # Initialize, fit and predict using cuML's
    # random forest classification model
    cuml_model = curfc(max_features=max_features, max_samples=max_samples,
                       n_bins=16, split_algo=split_algo, split_criterion=0,
                       min_samples_leaf=2, random_state=123, n_streams=1,
                       n_estimators=40, handle=handle, max_leaves=-1,
                       max_depth=16,
                       use_experimental_backend=use_experimental_backend)
    f = io.StringIO()
    with redirect_stdout(f):
        cuml_model.fit(X_train, y_train)
    captured_stdout = f.getvalue()
    if use_experimental_backend:
        is_fallback_used = False
        if split_algo != 1:
            assert ('Experimental backend does not yet support histogram ' +
                    'split algorithm' in captured_stdout)
            is_fallback_used = True
        if is_fallback_used:
            assert ('Not using the experimental backend due to above ' +
                    'mentioned reason(s)' in captured_stdout)
        else:
            assert ('Using experimental backend for growing trees'
                    in captured_stdout)
    else:
        assert captured_stdout == ''

    fil_preds = cuml_model.predict(X_test,
                                   predict_model="GPU",
                                   output_class=True,
                                   threshold=0.5,
                                   algo='auto')
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
    cuml_acc = accuracy_score(y_test, cu_preds)
    fil_acc = accuracy_score(y_test, fil_preds)
    if X.shape[0] < 500000:
        sk_model = skrfc(n_estimators=40,
                         max_depth=16,
                         min_samples_split=2, max_features=max_features,
                         random_state=10)
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_acc = accuracy_score(y_test, sk_preds)
        assert fil_acc >= (sk_acc - 0.07)
    assert fil_acc >= (cuml_acc - 0.02)
def rf_classification(datatype, array_type, max_features, max_samples,
                      fixture):
    X, y = fixture
    X = X.astype(datatype[0])
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        train_size=0.8,
                                                        random_state=0)
    X_test = X_test.astype(datatype[1])

    handle, stream = get_handle(True, n_streams=1)
    # Initialize, fit and predict using cuML's
    # random forest classification model
    cuml_model = curfc(max_features=max_features, max_samples=max_samples,
                       n_bins=16, split_criterion=0,
                       min_samples_leaf=2, random_state=123,
                       n_estimators=40, handle=handle, max_leaves=-1,
                       max_depth=16)
    if array_type == 'dataframe':
        X_train_df = cudf.DataFrame(X_train)
        y_train_df = cudf.Series(y_train)
        X_test_df = cudf.DataFrame(X_test)
        cuml_model.fit(X_train_df, y_train_df)
        cu_proba_gpu = np.array(cuml_model.predict_proba(X_test_df)
                                .as_gpu_matrix())
        cu_preds_cpu = cuml_model.predict(X_test_df,
                                          predict_model="CPU").to_array()
        cu_preds_gpu = cuml_model.predict(X_test_df,
                                          predict_model="GPU").to_array()
    else:
        cuml_model.fit(X_train, y_train)
        cu_proba_gpu = cuml_model.predict_proba(X_test)
        cu_preds_cpu = cuml_model.predict(X_test, predict_model="CPU")
        cu_preds_gpu = cuml_model.predict(X_test, predict_model="GPU")
    np.testing.assert_array_equal(cu_preds_gpu,
                                  np.argmax(cu_proba_gpu, axis=1))

    cu_acc_cpu = accuracy_score(y_test, cu_preds_cpu)
    cu_acc_gpu = accuracy_score(y_test, cu_preds_gpu)
    assert cu_acc_cpu == pytest.approx(cu_acc_gpu, abs=0.01, rel=0.1)

    # sklearn random forest classification model
    # initialization, fit and predict
    if y.size < 500000:
        sk_model = skrfc(n_estimators=40, max_depth=16,
                         min_samples_split=2, max_features=max_features,
                         random_state=10)
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_acc = accuracy_score(y_test, sk_preds)
        sk_proba = sk_model.predict_proba(X_test)

        assert cu_acc_cpu >= sk_acc - 0.07
        assert cu_acc_gpu >= sk_acc - 0.07

        # 0.06 is the highest relative error observed on CI, within
        # 0.0061 absolute error boundaries seen previously
        check_predict_proba(cu_proba_gpu, sk_proba, y_test, 0.1)
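# check_predict_proba is another suite helper. A sketch of the check
# implied by the comment above, assuming both probability matrices are
# scored against the one-hot labels and cuML is allowed a relative
# margin over sklearn; the real helper's exact metric may differ.
def check_predict_proba(cu_proba, sk_proba, y_test, rel_err):
    y_onehot = np.zeros_like(sk_proba)
    y_onehot[np.arange(y_test.shape[0]), y_test] = 1.0
    cu_mse = np.mean((cu_proba - y_onehot) ** 2)
    sk_mse = np.mean((sk_proba - y_onehot) ** 2)
    # cuML's probability error must stay within a relative margin of
    # sklearn's.
    assert cu_mse <= sk_mse * (1.0 + rel_err)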
def test_rf_classification_proba(datatype, split_algo, rows_sample, nrows,
                                 column_info, max_features):
    use_handle = True
    ncols, n_info = column_info
    X, y = make_classification(n_samples=nrows, n_features=ncols,
                               n_clusters_per_class=1, n_informative=n_info,
                               random_state=123, n_classes=2)
    X = X.astype(datatype)
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        train_size=0.8,
                                                        random_state=0)
    # Create a handle for the cuml model
    handle, stream = get_handle(use_handle, n_streams=1)

    # Initialize, fit and predict using cuML's
    # random forest classification model
    cuml_model = curfc(max_features=max_features, rows_sample=rows_sample,
                       n_bins=16, split_algo=split_algo, split_criterion=0,
                       min_rows_per_node=2, seed=123, n_streams=1,
                       n_estimators=40, handle=handle, max_leaves=-1,
                       max_depth=16)
    cuml_model.fit(X_train, y_train)
    fil_preds_proba = cuml_model.predict_proba(X_test,
                                               output_class=True,
                                               threshold=0.5,
                                               algo='auto')
    y_proba = np.zeros(np.shape(fil_preds_proba))
    y_proba[:, 1] = y_test
    y_proba[:, 0] = 1.0 - y_test
    fil_mse = mean_squared_error(y_proba, fil_preds_proba)
    if nrows < 500000:
        sk_model = skrfc(n_estimators=40, max_depth=16,
                         min_samples_split=2, max_features=max_features,
                         random_state=10)
        sk_model.fit(X_train, y_train)
        sk_preds_proba = sk_model.predict_proba(X_test)
        sk_mse = mean_squared_error(y_proba, sk_preds_proba)
        # Max difference of 0.0061 is seen between the mse values of
        # predict proba function of fil and sklearn
        assert fil_mse <= (sk_mse + 0.0061)
def test_rf_classification(datatype, split_algo, rows_sample, nrows,
                           column_info, max_features):
    use_handle = True
    ncols, n_info = column_info
    X, y = make_classification(n_samples=nrows, n_features=ncols,
                               n_clusters_per_class=1, n_informative=n_info,
                               random_state=123, n_classes=2)
    X = X.astype(datatype)
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        train_size=0.8,
                                                        random_state=0)
    # Create a handle for the cuml model
    handle, stream = get_handle(use_handle, n_streams=1)

    # Initialize, fit and predict using cuML's
    # random forest classification model
    cuml_model = curfc(max_features=max_features, rows_sample=rows_sample,
                       n_bins=16, split_algo=split_algo, split_criterion=0,
                       min_rows_per_node=2, seed=123, n_streams=1,
                       n_estimators=40, handle=handle, max_leaves=-1,
                       max_depth=16)
    cuml_model.fit(X_train, y_train)
    fil_preds = cuml_model.predict(X_test,
                                   predict_model="GPU",
                                   output_class=True,
                                   threshold=0.5,
                                   algo='BATCH_TREE_REORG')
    cu_predict = cuml_model.predict(X_test, predict_model="CPU")
    cuml_acc = accuracy_score(y_test, cu_predict)
    fil_acc = accuracy_score(y_test, fil_preds)
    if nrows < 500000:
        sk_model = skrfc(n_estimators=40,
                         max_depth=16,
                         min_samples_split=2, max_features=max_features,
                         random_state=10)
        sk_model.fit(X_train, y_train)
        sk_predict = sk_model.predict(X_test)
        sk_acc = accuracy_score(y_test, sk_predict)
        assert fil_acc >= (sk_acc - 0.07)
    assert fil_acc >= (cuml_acc - 0.02)
def test_rf_regression(
    special_reg, datatype, max_features, max_samples, n_bins
):
    use_handle = True

    X, y = special_reg
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )

    # Create a handle for the cuml model
    handle, stream = get_handle(use_handle, n_streams=1)

    # Initialize and fit using cuML's random forest regression model
    cuml_model = curfr(
        max_features=max_features,
        max_samples=max_samples,
        n_bins=n_bins,
        split_criterion=2,
        min_samples_leaf=2,
        random_state=123,
        n_streams=1,
        n_estimators=50,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
        accuracy_metric="mse",
    )
    cuml_model.fit(X_train, y_train)
    # predict using FIL
    fil_preds = cuml_model.predict(X_test, predict_model="GPU")
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    fil_preds = np.reshape(fil_preds, np.shape(cu_preds))

    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype)
    fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype)
    # Initialize, fit and predict using
    # sklearn's random forest regression model
    if X.shape[0] < 1000:  # mode != "stress"
        sk_model = skrfr(
            n_estimators=50,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype)
        assert fil_r2 >= (sk_r2 - 0.07)
    assert fil_r2 >= (cu_r2 - 0.02)
def test_r2_score(datatype, use_handle):
    a = np.array([0.1, 0.2, 0.3, 0.4, 0.5], dtype=datatype)
    b = np.array([0.12, 0.22, 0.32, 0.42, 0.52], dtype=datatype)

    a_dev = cuda.to_device(a)
    b_dev = cuda.to_device(b)

    handle, stream = get_handle(use_handle)

    score = cuml.metrics.r2_score(a_dev, b_dev, handle=handle)

    np.testing.assert_almost_equal(score, 0.98, decimal=7)
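# Sanity check for the 0.98 expectation above: with b = a + 0.02,
# SS_res = 5 * 0.02**2 = 0.002 and SS_tot = 0.1 around mean(a) = 0.3,
# so R^2 = 1 - 0.002 / 0.1 = 0.98 exactly. A standalone numpy check
# (helper name is illustrative only):
def _r2_reference():
    a = np.array([0.1, 0.2, 0.3, 0.4, 0.5])
    b = a + 0.02
    ss_res = np.sum((a - b) ** 2)          # 0.002
    ss_tot = np.sum((a - a.mean()) ** 2)   # 0.1
    return 1.0 - ss_res / ss_tot           # 0.98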
def test_entropy(use_handle):
    handle, stream = get_handle(use_handle)

    # The outcome of a fair coin is the most uncertain:
    # in base 2 the result is 1 (one bit of entropy).
    cluster = np.array([0, 1], dtype=np.int32)
    assert_almost_equal(entropy(cluster, base=2., handle=handle), 1.)

    # The outcome of a biased coin is less uncertain:
    cluster = np.array(([0] * 9) + [1], dtype=np.int32)
    assert_almost_equal(entropy(cluster, base=2., handle=handle),
                        0.468995593)
    # base e
    assert_almost_equal(entropy(cluster, handle=handle),
                        0.32508297339144826)
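# The biased-coin constants above can be reproduced directly:
# H = -(0.9*log2(0.9) + 0.1*log2(0.1)) ≈ 0.468995593, and the same
# expression with the natural log gives ≈ 0.325082973 for base e.
def _biased_coin_entropy(base=2.0):
    p = np.array([0.9, 0.1])
    return float(-np.sum(p * np.log(p)) / np.log(base))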
def test_rf_printing(capfd, n_estimators, detailed_printing):
    X, y = make_classification(n_samples=500, n_features=10,
                               n_clusters_per_class=1, n_informative=5,
                               random_state=94929, n_classes=2)
    X = X.astype(np.float32)
    y = y.astype(np.int32)

    # Create a handle for the cuml model
    handle, stream = get_handle(True, n_streams=1)

    # Initialize cuML Random Forest classification model
    cuml_model = curfc(handle=handle, max_features=1.0, rows_sample=1.0,
                       n_bins=16, split_algo=0, split_criterion=0,
                       min_rows_per_node=2, random_state=23707, n_streams=1,
                       n_estimators=n_estimators, max_leaves=-1,
                       max_depth=16)

    # Train model on the data
    cuml_model.fit(X, y)

    if detailed_printing:
        cuml_model.print_detailed()
    else:
        cuml_model.print_summary()

    # Read the captured output
    printed_output = capfd.readouterr().out

    # Test 1: Output is non-zero
    assert '' != printed_output

    # Count the number of trees printed
    tree_count = 0
    for line in printed_output.split('\n'):
        if line.strip().startswith('Tree #'):
            tree_count += 1

    # Test 2: Correct number of trees are printed
    assert n_estimators == tree_count
def test_rf_get_text(n_estimators, detailed_text):
    X, y = make_classification(
        n_samples=500,
        n_features=10,
        n_clusters_per_class=1,
        n_informative=5,
        random_state=94929,
        n_classes=2,
    )
    X = X.astype(np.float32)
    y = y.astype(np.int32)

    # Create a handle for the cuml model
    handle, stream = get_handle(True, n_streams=1)

    # Initialize cuML Random Forest classification model
    cuml_model = curfc(
        handle=handle,
        max_features=1.0,
        max_samples=1.0,
        n_bins=16,
        split_criterion=0,
        min_samples_leaf=2,
        random_state=23707,
        n_streams=1,
        n_estimators=n_estimators,
        max_leaves=-1,
        max_depth=16,
    )

    # Train model on the data
    cuml_model.fit(X, y)

    if detailed_text:
        text_output = cuml_model.get_detailed_text()
    else:
        text_output = cuml_model.get_summary_text()

    # Test 1: Output is non-zero
    assert "" != text_output

    # Count the number of trees printed
    tree_count = 0
    for line in text_output.split("\n"):
        if line.strip().startswith("Tree #"):
            tree_count += 1

    # Test 2: Correct number of trees are printed
    assert n_estimators == tree_count
def test_dbscan_predict_multiple_streams():
    datatype = np.float32
    gdf = cudf.DataFrame()
    gdf['0'] = np.asarray([1, 2, 2, 8, 8, 25], dtype=datatype)
    gdf['1'] = np.asarray([2, 2, 3, 7, 8, 80], dtype=datatype)

    X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]],
                 dtype=datatype)

    skdbscan = skDBSCAN(eps=3, min_samples=2)
    sk_labels = skdbscan.fit_predict(X)

    handle1, stream1 = get_handle(True)
    handle2, stream2 = get_handle(True)
    cudbscan1 = cuDBSCAN(handle=handle1, eps=3, min_samples=2)
    cudbscan2 = cuDBSCAN(handle=handle2, eps=3, min_samples=2)
    cu_labels1 = cudbscan1.fit_predict(gdf)
    cu_labels2 = cudbscan2.fit_predict(gdf)
    cudbscan1.handle.sync()
    cudbscan2.handle.sync()

    for i in range(X.shape[0]):
        assert cu_labels1[i] == sk_labels[i]
        assert cu_labels2[i] == sk_labels[i]
def test_rf_classification(small_clf, datatype, max_samples, max_features):
    use_handle = True

    X, y = small_clf
    X = X.astype(datatype)
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )

    # Create a handle for the cuml model
    handle, stream = get_handle(use_handle, n_streams=1)

    # Initialize, fit and predict using cuML's
    # random forest classification model
    cuml_model = curfc(
        max_features=max_features,
        max_samples=max_samples,
        n_bins=16,
        split_criterion=0,
        min_samples_leaf=2,
        random_state=123,
        n_streams=1,
        n_estimators=40,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
    )
    cuml_model.fit(X_train, y_train)

    fil_preds = cuml_model.predict(
        X_test, predict_model="GPU", threshold=0.5, algo="auto"
    )
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
    cuml_acc = accuracy_score(y_test, cu_preds)
    fil_acc = accuracy_score(y_test, fil_preds)
    if X.shape[0] < 500000:
        sk_model = skrfc(
            n_estimators=40,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_acc = accuracy_score(y_test, sk_preds)
        assert fil_acc >= (sk_acc - 0.07)
    assert fil_acc >= (cuml_acc - 0.07)  # to be changed to 0.02. see issue #3910: https://github.com/rapidsai/cuml/issues/3910  # noqa
def test_rf_classification(small_clf, datatype, split_algo, rows_sample,
                           max_features):
    use_handle = True

    X, y = small_clf
    X = X.astype(datatype)
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        train_size=0.8,
                                                        random_state=0)
    # Create a handle for the cuml model
    handle, stream = get_handle(use_handle, n_streams=1)

    # Initialize, fit and predict using cuML's
    # random forest classification model
    cuml_model = curfc(max_features=max_features, rows_sample=rows_sample,
                       n_bins=16, split_algo=split_algo, split_criterion=0,
                       min_rows_per_node=2, random_state=123, n_streams=1,
                       n_estimators=40, handle=handle, max_leaves=-1,
                       max_depth=16)
    cuml_model.fit(X_train, y_train)

    fil_preds = cuml_model.predict(X_test,
                                   predict_model="GPU",
                                   output_class=True,
                                   threshold=0.5,
                                   algo='auto')
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
    cuml_acc = accuracy_score(y_test, cu_preds)
    fil_acc = accuracy_score(y_test, fil_preds)
    if X.shape[0] < 500000:
        sk_model = skrfc(n_estimators=40,
                         max_depth=16,
                         min_samples_split=2, max_features=max_features,
                         random_state=10)
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_acc = accuracy_score(y_test, sk_preds)
        assert fil_acc >= (sk_acc - 0.07)
    assert fil_acc >= (cuml_acc - 0.02)
def test_rf_regression(datatype, split_algo, rows_sample, n_info,
                       mode, ncols, max_features):
    use_handle = True

    if mode == 'unit':
        X, y = make_regression(n_samples=100, n_features=ncols,
                               n_informative=n_info,
                               random_state=123)
    elif mode == 'quality':
        X, y = fetch_california_housing(return_X_y=True)
    else:
        X, y = make_regression(n_samples=100000, n_features=ncols,
                               n_informative=n_info,
                               random_state=123)

    train_rows = np.int32(X.shape[0]*0.8)
    X_test = np.asarray(X[train_rows:, :]).astype(datatype)
    y_test = np.asarray(y[train_rows:, ]).astype(datatype)
    X_train = np.asarray(X[0:train_rows, :]).astype(datatype)
    y_train = np.asarray(y[0:train_rows, ]).astype(datatype)

    # Create a handle for the cuml model
    handle, stream = get_handle(use_handle, n_streams=8)

    # Initialize and fit using cuML's random forest regression model
    cuml_model = curfr(max_features=max_features, rows_sample=rows_sample,
                       n_bins=16, split_algo=split_algo, split_criterion=2,
                       min_rows_per_node=2,
                       n_estimators=50, handle=handle, max_leaves=-1,
                       max_depth=16, accuracy_metric='mse')
    cuml_model.fit(X_train, y_train)
    # predict using FIL
    fil_preds = cuml_model.predict(X_test, predict_model="GPU")
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cu_preds)
    fil_r2 = r2_score(y_test, fil_preds)

    # Initialize, fit and predict using
    # sklearn's random forest regression model
    sk_model = skrfr(n_estimators=50, max_depth=16,
                     min_samples_split=2, max_features=max_features,
                     random_state=10)
    sk_model.fit(X_train, y_train)
    sk_predict = sk_model.predict(X_test)
    sk_r2 = r2_score(y_test, sk_predict)
    print(fil_r2, cu_r2, sk_r2)
    assert fil_r2 >= (cu_r2 - 0.02)
    assert fil_r2 >= (sk_r2 - 0.07)
def test_entropy_random(n_samples, base, use_handle):
    handle, stream = get_handle(use_handle)

    clustering, _ = \
        generate_random_labels(lambda rng: rng.randint(0, 1000, n_samples))

    # generate unnormalized probabilities from the clustering
    pk = np.bincount(clustering)

    # scipy's entropy uses probabilities
    sp_S = sp_entropy(pk, base=base)
    # we use a clustering
    S = entropy(np.array(clustering, dtype=np.int32), base, handle=handle)

    assert_almost_equal(S, sp_S, decimal=2)
def test_rf_regression(datatype, use_handle, split_algo,
                       n_info, mode, ncols, rows_sample):
    if mode == 'unit':
        X, y = make_regression(n_samples=30, n_features=ncols,
                               n_informative=n_info,
                               random_state=123)
    elif mode == 'quality':
        X, y = fetch_california_housing(return_X_y=True)
    else:
        X, y = make_regression(n_samples=100000, n_features=ncols,
                               n_informative=n_info,
                               random_state=123)

    train_rows = np.int32(X.shape[0]*0.8)
    X_test = np.asarray(X[train_rows:, :]).astype(datatype)
    y_test = np.asarray(y[train_rows:, ]).astype(datatype)
    X_train = np.asarray(X[0:train_rows, :]).astype(datatype)
    y_train = np.asarray(y[0:train_rows, ]).astype(datatype)

    # Create a handle for the cuml model
    handle, stream = get_handle(use_handle)

    # Initialize, fit and predict using cuML's
    # random forest regression model
    cuml_model = curfr(max_features=1.0, rows_sample=rows_sample,
                       n_bins=8, split_algo=split_algo, split_criterion=2,
                       min_rows_per_node=2,
                       n_estimators=50, handle=handle, max_leaves=-1,
                       max_depth=25, accuracy_metric='mse')
    cuml_model.fit(X_train, y_train)
    cu_mse = cuml_model.score(X_test, y_test)
    if mode != 'stress':
        # sklearn random forest regression model
        # initialization, fit and predict
        sk_model = skrfr(n_estimators=50, max_depth=50,
                         min_samples_split=2, max_features=1.0,
                         random_state=10)
        sk_model.fit(X_train, y_train)
        sk_predict = sk_model.predict(X_test)
        sk_mse = mean_squared_error(y_test, sk_predict)

        # compare the accuracy of the two models
        assert cu_mse <= (sk_mse + 0.07)
def test_pca_fit(datatype, input_type, name, use_handle):
    if name == 'blobs':
        pytest.skip('fails when using blobs dataset')
        X, y = make_blobs(n_samples=500000, n_features=1000,
                          random_state=0)
    elif name == 'iris':
        iris = datasets.load_iris()
        X = iris.data
    else:
        X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                     dtype=datatype)

    skpca = skPCA(n_components=2)
    skpca.fit(X)

    handle, stream = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)

    if input_type == 'dataframe':
        X = pd.DataFrame({'fea%d' % i: X[0:, i]
                          for i in range(X.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X)
        cupca.fit(X_cudf)
    else:
        cupca.fit(X)

    cupca.handle.sync()

    for attr in ['singular_values_', 'components_', 'explained_variance_',
                 'explained_variance_ratio_', 'noise_variance_']:
        with_sign = False if attr in ['components_'] else True
        print(attr)
        print(getattr(cupca, attr))
        print(getattr(skpca, attr))
        cuml_res = getattr(cupca, attr)
        if isinstance(cuml_res, cudf.Series):
            cuml_res = cuml_res.to_array()
        else:
            cuml_res = cuml_res.as_matrix()
        skl_res = getattr(skpca, attr)
        assert array_equal(cuml_res, skl_res, 1e-1, with_sign=with_sign)
def test_dbscan_predict_numpy(datatype, use_handle):
    gdf = cudf.DataFrame()
    gdf['0'] = np.asarray([1, 2, 2, 8, 8, 25], dtype=datatype)
    gdf['1'] = np.asarray([2, 2, 3, 7, 8, 80], dtype=datatype)

    X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]],
                 dtype=datatype)

    print("Calling fit_predict")
    handle, stream = get_handle(use_handle)
    cudbscan = cuDBSCAN(handle=handle, eps=3, min_samples=2)
    cu_labels = cudbscan.fit_predict(gdf)
    skdbscan = skDBSCAN(eps=3, min_samples=2)
    sk_labels = skdbscan.fit_predict(X)
    print(X.shape[0])
    cudbscan.handle.sync()

    for i in range(X.shape[0]):
        assert cu_labels[i] == sk_labels[i]