def test_single_input(client, model_type, ignore_empty_partitions):
    """Fit a multi-GPU random forest on a dataset holding a single sample.

    The fit is expected to succeed when ``ignore_empty_partitions`` is
    truthy or when the scheduler reports exactly one worker; in that case
    the prediction for the lone sample must match its label. In every
    other configuration ``fit`` is expected to raise ``ValueError``.
    """
    is_classifier = model_type == 'classification'

    X, y = make_classification(n_samples=1)
    X = X.astype(np.float32)
    y = y.astype(np.int32 if is_classifier else np.float32)

    X, y = _prep_training_data(client, X, y, partitions_per_worker=2)

    forest_cls = cuRFC_mg if is_classifier else cuRFR_mg
    cu_rf_mg = forest_cls(n_bins=1,
                          ignore_empty_partitions=ignore_empty_partitions)

    # The scheduler is only queried when empty partitions are not ignored.
    fit_must_fail = not (
        ignore_empty_partitions
        or len(client.scheduler_info()['workers'].keys()) == 1)

    if fit_must_fail:
        with pytest.raises(ValueError):
            cu_rf_mg.fit(X, y)
        return

    cu_rf_mg.fit(X, y)
    predicted = cu_rf_mg.predict(X).compute()
    cuml_mod_predict = cp.asnumpy(cp.array(predicted))
    y = cp.asnumpy(cp.array(y.compute()))

    assert accuracy_score(cuml_mod_predict, y) == 1.0
def test_rf_broadcast(model_type, fit_broadcast, transform_broadcast, client):
    """Train and predict with a multi-GPU random forest using data broadcast.

    A synthetic classification or regression problem (sized by the number
    of dask workers) is split into train/test sets. The model is fitted
    with ``broadcast_data=fit_broadcast`` and queried with
    ``broadcast_data=transform_broadcast``. The resulting accuracy
    (classification) or R^2 (regression) must reach 0.72.

    When ``transform_broadcast`` is set, the estimator's ``internal_model``
    attribute is additionally required to be ``None`` after prediction.
    """
    # Use CUDA_VISIBLE_DEVICES to control the number of workers
    n_workers = len(client.scheduler_info()['workers'])
    is_classifier = model_type == 'classification'

    if is_classifier:
        X, y = make_classification(n_samples=n_workers * 1000,
                                   n_features=20,
                                   n_informative=15,
                                   n_classes=4,
                                   n_clusters_per_class=1,
                                   random_state=123)
        y = y.astype(np.int32)
    else:
        X, y = make_regression(n_samples=n_workers * 1000,
                               n_features=20,
                               n_informative=5,
                               random_state=123)
        y = y.astype(np.float32)
    X = X.astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=n_workers * 100, random_state=123)

    X_train_df, y_train_df = _prep_training_data(client, X_train, y_train, 1)
    X_test_dask_array = from_array(X_test)

    # Both estimator flavours share the same hyper-parameters and the same
    # fit/predict sequence; only the class and the metric differ.
    forest_cls = cuRFC_mg if is_classifier else cuRFR_mg
    cuml_mod = forest_cls(n_estimators=10, max_depth=8, n_bins=16,
                          ignore_empty_partitions=True)
    cuml_mod.fit(X_train_df, y_train_df, broadcast_data=fit_broadcast)

    cuml_mod_predict = cuml_mod.predict(X_test_dask_array,
                                        broadcast_data=transform_broadcast)
    cuml_mod_predict = cp.asnumpy(cuml_mod_predict.compute())

    if is_classifier:
        acc_score = accuracy_score(y_test, cuml_mod_predict, normalize=True)
    else:
        # R^2 is not symmetric: ground truth must come first, predictions
        # second, otherwise the score is normalised by the wrong variance.
        acc_score = r2_score(y_test, cuml_mod_predict)
    assert acc_score >= 0.72

    if transform_broadcast:
        assert cuml_mod.internal_model is None
def test_rf_concatenation_dask(client, model_type):
    """Check the tree count of the model held after a distributed predict.

    A 40-estimator forest is fitted on dask-partitioned data, a prediction
    is computed, and the treelite model obtained from
    ``internal_model`` must then report exactly 40 trees.
    """
    from cuml.fil.fil import TreeliteModel

    n_estimators = 40
    is_classifier = model_type == 'classification'

    X, y = make_classification(n_samples=1000, n_features=30,
                               random_state=123, n_classes=2)
    X = X.astype(np.float32)
    y = y.astype(np.int32 if is_classifier else np.float32)

    X_df, y_df = _prep_training_data(client, X, y, partitions_per_worker=2)

    forest_cls = cuRFC_mg if is_classifier else cuRFR_mg
    cu_rf_mg = forest_cls(n_estimators=n_estimators)
    cu_rf_mg.fit(X_df, y_df)

    # The prediction values themselves are not inspected.
    # NOTE(review): presumably predicting is what populates
    # ``internal_model`` - confirm against the estimator implementation.
    cu_rf_mg.predict(X_df).compute()

    treelite_handle = cu_rf_mg.internal_model._obtain_treelite_handle()
    local_tl = TreeliteModel.from_treelite_model_handle(
        treelite_handle, take_handle_ownership=False)

    assert local_tl.num_trees == n_estimators
def test_rf_get_combined_model_right_aftter_fit(client, estimator_type):
    """Verify ``get_combined_model`` works immediately after ``fit``.

    The combined model returned by the multi-GPU estimator must be an
    instance of the matching single-GPU class (classifier or regressor).
    Any ``estimator_type`` other than ``'classification'`` or
    ``'regression'`` fails the test.
    """
    max_depth = 3
    n_estimators = 5

    X, y = make_classification()
    X = X.astype(np.float32)

    # Hyper-parameters common to both estimator flavours.
    forest_kwargs = dict(max_features=1.0, max_samples=1.0,
                         n_bins=16, n_streams=1,
                         n_estimators=n_estimators,
                         max_leaves=-1, max_depth=max_depth)

    if estimator_type == 'classification':
        cu_rf_mg = cuRFC_mg(**forest_kwargs)
        y = y.astype(np.int32)
        expected_sg_cls = cuRFC_sg
    elif estimator_type == 'regression':
        cu_rf_mg = cuRFR_mg(**forest_kwargs)
        y = y.astype(np.float32)
        expected_sg_cls = cuRFR_sg
    else:
        assert False

    X_dask, y_dask = _prep_training_data(client, X, y,
                                         partitions_per_worker=2)
    cu_rf_mg.fit(X_dask, y_dask)

    single_gpu_model = cu_rf_mg.get_combined_model()
    assert isinstance(single_gpu_model, expected_sg_cls)
def test_rf_regression_dask_fil(partitions_per_worker, cluster):
    """Train a multi-GPU RF regressor and score distributed predictions.

    A dask ``Client`` is opened on ``cluster`` and always closed again,
    even if the test fails. Training and test data are distributed as
    dask_cudf collections with ``partitions_per_worker`` partitions per
    worker; the R^2 of the predictions on the held-out set must reach 0.67.
    """
    # Use CUDA_VISIBLE_DEVICES to control the number of workers
    c = Client(cluster)

    try:
        X, y = make_regression(n_samples=10000, n_features=20,
                               n_informative=10, random_state=123)
        X = X.astype(np.float32)
        y = y.astype(np.float32)

        # Seed the split so the thresholded score is reproducible.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=1000, random_state=123)

        cu_rf_params = {
            'n_estimators': 50,
            'max_depth': 16,
            'n_bins': 16,
        }

        workers = c.has_what().keys()
        n_partitions = partitions_per_worker * len(workers)

        X_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))
        X_train_df = dask_cudf.from_cudf(X_cudf, npartitions=n_partitions)

        # y_train is already a 1-D array; no pandas round trip is needed.
        y_cudf = cudf.Series(y_train)
        y_train_df = dask_cudf.from_cudf(y_cudf, npartitions=n_partitions)

        X_cudf_test = cudf.DataFrame.from_pandas(pd.DataFrame(X_test))
        X_test_df = dask_cudf.from_cudf(X_cudf_test,
                                        npartitions=n_partitions)

        X_train_df, y_train_df = dask_utils.persist_across_workers(
            c, [X_train_df, y_train_df], workers=workers)

        cu_rf_mg = cuRFR_mg(**cu_rf_params)
        cu_rf_mg.fit(X_train_df, y_train_df)

        cu_rf_mg_predict = cu_rf_mg.predict(X_test_df).compute()
        cu_rf_mg_predict = cp.asnumpy(cp.array(cu_rf_mg_predict))

        # R^2 is not symmetric: ground truth first, predictions second.
        acc_score = r2_score(y_test, cu_rf_mg_predict)

        assert acc_score >= 0.67

    finally:
        c.close()
def test_rf_regression_dask_fil(partitions_per_worker, dtype, client):
    """Train a multi-GPU RF regressor on dask_cudf data and check its R^2.

    The dataset size scales with the number of dask workers. ``float64``
    input is reported as an expected failure. For other dtypes the R^2 of
    the distributed predictions on the held-out set must reach 0.67.
    """
    # Bail out before doing any work for the unsupported dtype.
    if dtype == np.float64:
        pytest.xfail(reason=" Dask RF does not support np.float64 data")

    n_workers = len(client.scheduler_info()['workers'])

    # Use CUDA_VISIBLE_DEVICES to control the number of workers
    X, y = make_regression(n_samples=n_workers * 4000, n_features=20,
                           n_informative=10, random_state=123)
    X = X.astype(dtype)
    y = y.astype(dtype)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=n_workers * 100, random_state=123)

    cu_rf_params = {
        'n_estimators': 50,
        'max_depth': 16,
        'n_bins': 16,
    }

    workers = client.has_what().keys()
    n_partitions = partitions_per_worker * len(workers)

    X_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))
    X_train_df = dask_cudf.from_cudf(X_cudf, npartitions=n_partitions)

    y_cudf = cudf.Series(y_train)
    y_train_df = dask_cudf.from_cudf(y_cudf, npartitions=n_partitions)

    X_cudf_test = cudf.DataFrame.from_pandas(pd.DataFrame(X_test))
    X_test_df = dask_cudf.from_cudf(X_cudf_test, npartitions=n_partitions)

    cuml_mod = cuRFR_mg(**cu_rf_params, ignore_empty_partitions=True)
    cuml_mod.fit(X_train_df, y_train_df)

    cuml_mod_predict = cuml_mod.predict(X_test_df)
    cuml_mod_predict = cp.asnumpy(cp.array(cuml_mod_predict.compute()))

    # R^2 is not symmetric: ground truth first, predictions second.
    acc_score = r2_score(y_test, cuml_mod_predict)

    assert acc_score >= 0.67
def test_rf_throws_exceptions(cluster):
    """Check that fitting the same estimator a second time raises.

    The first ``fit`` must succeed; repeating ``fit`` on the same
    estimator with the same data is expected to raise ``RuntimeError``.
    The dask client is closed regardless of the outcome.
    """
    c = Client(cluster)

    try:
        cu_rf_mg = cuRFR_mg(n_estimators=10, max_depth=8)

        features, targets = make_regression(n_samples=100,
                                            n_features=20,
                                            n_informative=10,
                                            random_state=123)
        features = features.astype(np.float32)
        X_train_df, y_train_df = _prep_training_data(
            c, features, targets, 1)

        # First fit is the baseline and must not raise.
        cu_rf_mg.fit(X_train_df, y_train_df)

        with pytest.raises(RuntimeError):
            cu_rf_mg.fit(X_train_df, y_train_df)

    finally:
        c.close()
def test_rf_regression(n_workers, partitions_per_worker):
    """Train a multi-GPU RF regressor on a freshly created local cluster.

    The test is skipped when fewer GPUs than ``n_workers`` are available.
    Otherwise a ``LocalCUDACluster`` and a ``Client`` are created for the
    duration of the test and are closed even when the test fails. The R^2
    of the predictions on the held-out set must reach 0.70.
    """
    if dask_cuda.utils.get_n_gpus() < n_workers:
        pytest.skip("too few GPUs")

    cluster = LocalCUDACluster(threads_per_worker=1, n_workers=n_workers)
    try:
        c = Client(cluster)
        try:
            X, y = make_regression(n_samples=40000, n_features=20,
                                   n_informative=10, random_state=123)

            # Seed the split so the thresholded score is reproducible.
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=1000, random_state=123)

            cu_rf_params = {
                'n_estimators': 25,
                'max_depth': 13,
            }

            workers = c.has_what().keys()
            n_partitions = partitions_per_worker * len(workers)

            X_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))
            X_train_df = dask_cudf.from_cudf(X_cudf,
                                             npartitions=n_partitions)

            # y_train is already a 1-D array; no pandas round trip needed.
            y_cudf = cudf.Series(y_train)
            y_train_df = dask_cudf.from_cudf(y_cudf,
                                             npartitions=n_partitions)

            X_train_df, y_train_df = dask_utils.persist_across_workers(
                c, [X_train_df, y_train_df], workers=workers)

            cu_rf_mg = cuRFR_mg(**cu_rf_params)
            cu_rf_mg.fit(X_train_df, y_train_df)
            cu_rf_mg_predict = cu_rf_mg.predict(X_test)

            # R^2 is not symmetric: ground truth first, predictions second.
            acc_score = r2_score(y_test, cu_rf_mg_predict)

            print(str(acc_score))

            assert acc_score >= 0.70
        finally:
            # Release the client even if fitting or the assertion failed.
            c.close()
    finally:
        # Shut the worker processes down so GPUs are not left occupied.
        cluster.close()
def test_rf_regression_dask_cpu(partitions_per_worker, client):
    """Train a multi-GPU RF regressor and predict with ``predict_model='CPU'``.

    The dataset size scales with the number of dask workers. Training data
    is distributed as dask_cudf collections persisted across the workers,
    while the held-out features are passed to ``predict`` as the array
    returned by ``train_test_split``. The resulting R^2 must reach 0.67.
    """
    n_workers = len(client.scheduler_info()['workers'])

    X, y = make_regression(n_samples=n_workers * 2000, n_features=20,
                           n_informative=10, random_state=123)
    X = X.astype(np.float32)
    y = y.astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=n_workers * 400, random_state=123)

    cu_rf_params = {
        'n_estimators': 50,
        'max_depth': 16,
        'n_bins': 16,
    }

    workers = client.has_what().keys()
    n_partitions = partitions_per_worker * len(workers)

    X_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))
    X_train_df = dask_cudf.from_cudf(X_cudf, npartitions=n_partitions)

    y_cudf = cudf.Series(y_train)
    y_train_df = dask_cudf.from_cudf(y_cudf, npartitions=n_partitions)

    X_train_df, y_train_df = dask_utils.persist_across_workers(
        client, [X_train_df, y_train_df], workers=workers)

    cuml_mod = cuRFR_mg(**cu_rf_params)
    cuml_mod.fit(X_train_df, y_train_df)

    cuml_mod_predict = cuml_mod.predict(X_test, predict_model='CPU')

    # R^2 is not symmetric: ground truth first, predictions second.
    acc_score = r2_score(y_test, cuml_mod_predict)

    assert acc_score >= 0.67
def test_single_input_regression(client, ignore_empty_partitions):
    """Fit a multi-GPU RF regressor on a dataset holding a single sample.

    The fit is expected to succeed when ``ignore_empty_partitions`` is
    truthy or when the scheduler reports exactly one worker; the
    prediction for the lone sample must then equal its target. In every
    other configuration ``fit`` is expected to raise ``ValueError``.
    """
    X, y = make_classification(n_samples=1, n_classes=1)
    X, y = X.astype(np.float32), y.astype(np.float32)

    X, y = _prep_training_data(client, X, y, partitions_per_worker=2)

    cu_rf_mg = cuRFR_mg(n_bins=1,
                        ignore_empty_partitions=ignore_empty_partitions)

    # The scheduler is only queried when empty partitions are not ignored.
    fit_must_fail = not (
        ignore_empty_partitions
        or len(client.scheduler_info()['workers'].keys()) == 1)

    if fit_must_fail:
        with pytest.raises(ValueError):
            cu_rf_mg.fit(X, y)
        return

    cu_rf_mg.fit(X, y)
    predicted = cu_rf_mg.predict(X).compute()
    cuml_mod_predict = cp.asnumpy(cp.array(predicted))
    y = cp.asnumpy(cp.array(y.compute()))

    assert y[0] == cuml_mod_predict[0]
def test_rf_get_combined_model_right_aftter_fit(client, estimator_type):
    """Verify ``get_combined_model`` works immediately after ``fit``.

    The test is reported as an expected failure when there are more dask
    workers than estimators. Otherwise the combined model returned by the
    multi-GPU estimator must be an instance of the matching single-GPU
    class. Any ``estimator_type`` other than ``'classification'`` or
    ``'regression'`` fails the test.
    """
    max_depth = 3
    n_estimators = 5

    worker_count = len(client.scheduler_info()['workers'])
    if n_estimators < worker_count:
        pytest.xfail(
            "n_estimators cannot be lower than number of dask workers")

    X, y = make_classification()
    X = X.astype(np.float32)

    # Hyper-parameters common to both estimator flavours.
    forest_kwargs = dict(max_features=1.0, max_samples=1.0,
                         n_bins=16, n_streams=1,
                         n_estimators=n_estimators,
                         max_leaves=-1, max_depth=max_depth)

    if estimator_type == 'classification':
        cu_rf_mg = cuRFC_mg(**forest_kwargs)
        y = y.astype(np.int32)
        expected_sg_cls = cuRFC_sg
    elif estimator_type == 'regression':
        cu_rf_mg = cuRFR_mg(**forest_kwargs)
        y = y.astype(np.float32)
        expected_sg_cls = cuRFR_sg
    else:
        assert False

    X_dask, y_dask = _prep_training_data(client, X, y,
                                         partitions_per_worker=2)
    cu_rf_mg.fit(X_dask, y_dask)

    single_gpu_model = cu_rf_mg.get_combined_model()
    assert isinstance(single_gpu_model, expected_sg_cls)
def test_rf_get_json(client, estimator_type, max_depth, n_estimators):
    """Validate the JSON dump of a fitted multi-GPU random forest.

    Three properties are checked: the dump is a non-empty string, it
    decodes to a list with one entry per estimator, and walking the
    decoded trees by hand reproduces the estimator's own predictions on
    the training data. The test is reported as an expected failure when
    there are more dask workers than estimators.
    """
    worker_count = len(client.scheduler_info()['workers'])
    if n_estimators < worker_count:
        pytest.xfail(
            "n_estimators cannot be lower than number of dask workers")

    X, y = make_classification(n_samples=350, n_features=20,
                               n_clusters_per_class=1, n_informative=10,
                               random_state=123, n_classes=2)
    X = X.astype(np.float32)

    if estimator_type == 'classification':
        cu_rf_mg = cuRFC_mg(max_features=1.0, max_samples=1.0,
                            n_bins=16, split_algo=0, split_criterion=0,
                            min_samples_leaf=2, seed=23707, n_streams=1,
                            n_estimators=n_estimators, max_leaves=-1,
                            max_depth=max_depth)
        y = y.astype(np.int32)
    elif estimator_type == 'regression':
        cu_rf_mg = cuRFR_mg(max_features=1.0, max_samples=1.0,
                            n_bins=16, split_algo=0,
                            min_samples_leaf=2, seed=23707, n_streams=1,
                            n_estimators=n_estimators, max_leaves=-1,
                            max_depth=max_depth)
        y = y.astype(np.float32)
    else:
        assert False

    X_dask, y_dask = _prep_training_data(client, X, y,
                                         partitions_per_worker=2)
    cu_rf_mg.fit(X_dask, y_dask)
    json_out = cu_rf_mg.get_json()
    json_obj = json.loads(json_out)

    # Test 1: Output is non-zero
    assert '' != json_out

    # Test 2: JSON object contains correct number of trees
    assert isinstance(json_obj, list)
    assert len(json_obj) == n_estimators

    # Test 3: Traverse JSON trees and get the same predictions as cuML RF
    def predict_with_json_tree(tree, x):
        # Descend from the root until a node without children is reached.
        node = tree
        while 'children' in node:
            assert 'split_feature' in node
            assert 'split_threshold' in node
            assert 'yes' in node
            assert 'no' in node
            # First child when the feature is <= threshold, else second.
            if x[node['split_feature']] <= node['split_threshold']:
                node = node['children'][0]
            else:
                node = node['children'][1]
        assert 'leaf_value' in node
        return node['leaf_value']

    def predict_with_json_rf_classifier(rf, x):
        # Returns the class with the highest vote. If there is a tie, return
        # the list of all classes with the highest vote.
        per_tree_votes = [predict_with_json_tree(tree, x) for tree in rf]
        tally = np.bincount(per_tree_votes)
        return np.nonzero(np.equal(tally, np.max(tally)))[0]

    def predict_with_json_rf_regressor(rf, x):
        # Mean of the per-tree outputs, accumulated from a float zero.
        total = sum((predict_with_json_tree(tree, x) for tree in rf), 0.)
        return total / len(rf)

    if estimator_type == 'classification':
        expected_pred = cu_rf_mg.predict(X_dask).astype(np.int32)
        expected_pred = expected_pred.compute().to_array()
        for idx, row in enumerate(X):
            winners = predict_with_json_rf_classifier(json_obj, row)
            assert expected_pred[idx] in winners
    elif estimator_type == 'regression':
        expected_pred = cu_rf_mg.predict(X_dask).astype(np.float32)
        expected_pred = expected_pred.compute().to_array()
        pred = np.array(
            [predict_with_json_rf_regressor(json_obj, row) for row in X],
            dtype=np.float32)
        np.testing.assert_almost_equal(pred, expected_pred, decimal=6)