Пример #1
0
def test_single_input(client, model_type, ignore_empty_partitions):
    """Fit a dask random forest on a dataset that contains one sample.

    When ``ignore_empty_partitions`` is set, or the cluster has exactly one
    worker, fitting must succeed and predicting on the training data must
    match the training label. In every other case ``fit`` must raise
    ``ValueError``.
    """
    is_classifier = model_type == 'classification'

    features, labels = make_classification(n_samples=1)
    features = features.astype(np.float32)
    labels = labels.astype(np.int32 if is_classifier else np.float32)

    X, y = _prep_training_data(client, features, labels,
                               partitions_per_worker=2)

    forest_cls = cuRFC_mg if is_classifier else cuRFR_mg
    cu_rf_mg = forest_cls(n_bins=1,
                          ignore_empty_partitions=ignore_empty_partitions)

    # Failure path first: empty partitions are not ignored and more than
    # one worker is present.
    if not ignore_empty_partitions and \
       len(client.scheduler_info()['workers'].keys()) != 1:
        with pytest.raises(ValueError):
            cu_rf_mg.fit(X, y)
        return

    cu_rf_mg.fit(X, y)
    predicted = cp.asnumpy(cp.array(cu_rf_mg.predict(X).compute()))
    expected = cp.asnumpy(cp.array(y.compute()))

    assert accuracy_score(predicted, expected) == 1.0
Пример #2
0
def test_rf_broadcast(model_type, fit_broadcast, transform_broadcast, client):
    """Train and predict with the ``broadcast_data`` option toggled.

    A classifier or regressor (selected by ``model_type``) is fitted with
    ``broadcast_data=fit_broadcast`` and used for prediction with
    ``broadcast_data=transform_broadcast``. The resulting score (accuracy
    for classification, R^2 for regression) must reach 0.72. When
    prediction was broadcast, ``internal_model`` must still be ``None``.
    """
    # Use CUDA_VISIBLE_DEVICES to control the number of workers
    n_workers = len(client.scheduler_info()['workers'])
    is_classifier = model_type == 'classification'

    if is_classifier:
        X, y = make_classification(n_samples=n_workers * 1000,
                                   n_features=20,
                                   n_informative=15,
                                   n_classes=4,
                                   n_clusters_per_class=1,
                                   random_state=123)
        y = y.astype(np.int32)
    else:
        X, y = make_regression(n_samples=n_workers * 1000,
                               n_features=20,
                               n_informative=5,
                               random_state=123)
        y = y.astype(np.float32)
    X = X.astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=n_workers * 100, random_state=123)

    X_train_df, y_train_df = _prep_training_data(client, X_train, y_train, 1)
    X_test_dask_array = from_array(X_test)

    # Both estimators take the same hyper-parameters, so only the class
    # differs between the two model types.
    forest_cls = cuRFC_mg if is_classifier else cuRFR_mg
    cuml_mod = forest_cls(n_estimators=10,
                          max_depth=8,
                          n_bins=16,
                          ignore_empty_partitions=True)
    cuml_mod.fit(X_train_df, y_train_df, broadcast_data=fit_broadcast)
    cuml_mod_predict = cuml_mod.predict(X_test_dask_array,
                                        broadcast_data=transform_broadcast)
    cuml_mod_predict = cp.asnumpy(cuml_mod_predict.compute())

    if is_classifier:
        # Accuracy is symmetric in its two arguments.
        acc_score = accuracy_score(cuml_mod_predict, y_test, normalize=True)
    else:
        # R^2 is NOT symmetric: the ground truth must be the first argument.
        acc_score = r2_score(y_test, cuml_mod_predict)
    assert acc_score >= 0.72

    if transform_broadcast:
        assert cuml_mod.internal_model is None
0
def test_rf_concatenation_dask(client, model_type):
    """Check that the model held after a distributed fit contains all trees.

    After fitting and running one prediction, the treelite handle of
    ``internal_model`` is wrapped in a ``TreeliteModel`` and its tree count
    is compared with the requested ``n_estimators``.
    """
    from cuml.fil.fil import TreeliteModel

    n_estimators = 40
    is_classifier = model_type == 'classification'

    features, labels = make_classification(n_samples=1000,
                                           n_features=30,
                                           random_state=123,
                                           n_classes=2)
    features = features.astype(np.float32)
    labels = labels.astype(np.int32 if is_classifier else np.float32)

    X_df, y_df = _prep_training_data(client, features, labels,
                                     partitions_per_worker=2)

    forest_cls = cuRFC_mg if is_classifier else cuRFR_mg
    cu_rf_mg = forest_cls(n_estimators=n_estimators)
    cu_rf_mg.fit(X_df, y_df)

    # The predictions themselves are not inspected.
    # NOTE(review): presumably this call is what populates internal_model
    # -- confirm against the estimator implementation.
    cu_rf_mg.predict(X_df).compute()

    handle = cu_rf_mg.internal_model._obtain_treelite_handle()
    local_tl = TreeliteModel.from_treelite_model_handle(
        handle, take_handle_ownership=False)

    assert local_tl.num_trees == n_estimators
Пример #4
0
def test_rf_get_combined_model_right_aftter_fit(client, estimator_type):
    """Call ``get_combined_model`` directly after ``fit``.

    The object returned by the multi-GPU estimator must be an instance of
    the matching single-GPU class (``cuRFC_sg`` for classification,
    ``cuRFR_sg`` for regression). Any other ``estimator_type`` fails the
    test.
    """
    rf_kwargs = dict(max_features=1.0,
                     max_samples=1.0,
                     n_bins=16,
                     n_streams=1,
                     n_estimators=5,
                     max_leaves=-1,
                     max_depth=3)

    X, y = make_classification()
    X = X.astype(np.float32)

    if estimator_type == 'classification':
        cu_rf_mg = cuRFC_mg(**rf_kwargs)
        y = y.astype(np.int32)
        expected_sg_cls = cuRFC_sg
    elif estimator_type == 'regression':
        cu_rf_mg = cuRFR_mg(**rf_kwargs)
        y = y.astype(np.float32)
        expected_sg_cls = cuRFR_sg
    else:
        assert False

    X_dask, y_dask = _prep_training_data(client, X, y, partitions_per_worker=2)
    cu_rf_mg.fit(X_dask, y_dask)

    single_gpu_model = cu_rf_mg.get_combined_model()
    assert isinstance(single_gpu_model, expected_sg_cls)
Пример #5
0
def test_rf_regression_dask_fil(partitions_per_worker, cluster):
    """Train a distributed RF regressor on dask_cudf data and check R^2.

    A client is opened on ``cluster`` and always closed again. Training
    data is split into ``partitions_per_worker`` partitions per worker;
    predictions on the held-out 1000 samples must reach an R^2 of 0.67.
    """
    # Use CUDA_VISIBLE_DEVICES to control the number of workers
    c = Client(cluster)

    try:
        X, y = make_regression(n_samples=10000,
                               n_features=20,
                               n_informative=10,
                               random_state=123)

        X = X.astype(np.float32)
        y = y.astype(np.float32)

        # Seed the split so the score asserted below is reproducible.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=1000, random_state=123)

        cu_rf_params = {
            'n_estimators': 50,
            'max_depth': 16,
            'n_bins': 16,
        }

        workers = c.has_what().keys()
        n_partitions = partitions_per_worker * len(workers)

        X_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))
        X_train_df = dask_cudf.from_cudf(X_cudf, npartitions=n_partitions)

        # y_train is already one-dimensional (single-target regression),
        # so it can be wrapped in a Series without a pandas round-trip.
        y_cudf = cudf.Series(y_train)
        y_train_df = dask_cudf.from_cudf(y_cudf, npartitions=n_partitions)

        X_cudf_test = cudf.DataFrame.from_pandas(pd.DataFrame(X_test))
        X_test_df = dask_cudf.from_cudf(X_cudf_test,
                                        npartitions=n_partitions)

        X_train_df, y_train_df = dask_utils.persist_across_workers(
            c, [X_train_df, y_train_df], workers=workers)

        cu_rf_mg = cuRFR_mg(**cu_rf_params)
        cu_rf_mg.fit(X_train_df, y_train_df)

        cu_rf_mg_predict = cu_rf_mg.predict(X_test_df).compute()
        cu_rf_mg_predict = cp.asnumpy(cp.array(cu_rf_mg_predict))

        # R^2 is not symmetric: ground truth first, predictions second.
        acc_score = r2_score(y_test, cu_rf_mg_predict)

        assert acc_score >= 0.67

    finally:
        c.close()
Пример #6
0
def test_rf_regression_dask_fil(partitions_per_worker, dtype, client):
    """Train a distributed RF regressor on dask_cudf data and check R^2.

    The dataset size scales with the number of workers. ``np.float64``
    input is reported as an expected failure. Predictions on the held-out
    samples must reach an R^2 of 0.67.
    """
    # Bail out before any data is generated for the unsupported dtype.
    if dtype == np.float64:
        pytest.xfail(reason=" Dask RF does not support np.float64 data")

    n_workers = len(client.scheduler_info()['workers'])

    # Use CUDA_VISIBLE_DEVICES to control the number of workers
    X, y = make_regression(n_samples=n_workers * 4000,
                           n_features=20,
                           n_informative=10,
                           random_state=123)

    X = X.astype(dtype)
    y = y.astype(dtype)

    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=n_workers * 100,
                         random_state=123)

    cu_rf_params = {
        'n_estimators': 50,
        'max_depth': 16,
        'n_bins': 16,
    }

    workers = client.has_what().keys()
    n_partitions = partitions_per_worker * len(workers)

    X_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))
    X_train_df = dask_cudf.from_cudf(X_cudf, npartitions=n_partitions)

    y_cudf = cudf.Series(y_train)
    y_train_df = dask_cudf.from_cudf(y_cudf, npartitions=n_partitions)

    X_cudf_test = cudf.DataFrame.from_pandas(pd.DataFrame(X_test))
    X_test_df = dask_cudf.from_cudf(X_cudf_test, npartitions=n_partitions)

    cuml_mod = cuRFR_mg(**cu_rf_params, ignore_empty_partitions=True)
    cuml_mod.fit(X_train_df, y_train_df)

    cuml_mod_predict = cuml_mod.predict(X_test_df)
    cuml_mod_predict = cp.asnumpy(cp.array(cuml_mod_predict.compute()))

    # R^2 is not symmetric: ground truth first, predictions second.
    acc_score = r2_score(y_test, cuml_mod_predict)

    assert acc_score >= 0.67
Пример #7
0
def test_rf_throws_exceptions(cluster):
    """Fitting an already fitted dask RF regressor must raise RuntimeError.

    A client is opened on ``cluster`` for the duration of the test and is
    closed afterwards regardless of the outcome.
    """
    dask_client = Client(cluster)
    try:
        regressor = cuRFR_mg(n_estimators=10, max_depth=8)

        features, targets = make_regression(n_samples=100,
                                            n_features=20,
                                            n_informative=10,
                                            random_state=123)
        features = features.astype(np.float32)
        features_df, targets_df = _prep_training_data(
            dask_client, features, targets, 1)

        # The first fit is expected to succeed; only the repeated call is
        # required to fail.
        regressor.fit(features_df, targets_df)
        with pytest.raises(RuntimeError):
            regressor.fit(features_df, targets_df)
    finally:
        dask_client.close()
Пример #8
0
def test_rf_regression(n_workers, partitions_per_worker):
    """Train a distributed RF regressor on a freshly started CUDA cluster.

    The test is skipped when fewer than ``n_workers`` GPUs are available.
    The cluster and its client are shut down even when the body raises or
    the final assertion fails. Predictions on the held-out 1000 samples
    must reach an R^2 of 0.70.
    """
    if dask_cuda.utils.get_n_gpus() < n_workers:
        pytest.skip("too few GPUs")

    cluster = LocalCUDACluster(threads_per_worker=1, n_workers=n_workers)
    try:
        c = Client(cluster)
        try:
            X, y = make_regression(n_samples=40000,
                                   n_features=20,
                                   n_informative=10,
                                   random_state=123)

            # Seed the split so the score asserted below is reproducible.
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=1000, random_state=123)

            cu_rf_params = {
                'n_estimators': 25,
                'max_depth': 13,
            }

            workers = c.has_what().keys()
            n_partitions = partitions_per_worker * len(workers)

            X_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))
            X_train_df = dask_cudf.from_cudf(X_cudf,
                                             npartitions=n_partitions)

            # y_train is already one-dimensional (single-target
            # regression), so no pandas round-trip is needed.
            y_cudf = cudf.Series(y_train)
            y_train_df = dask_cudf.from_cudf(y_cudf,
                                             npartitions=n_partitions)

            X_train_df, y_train_df = dask_utils.persist_across_workers(
                c, [X_train_df, y_train_df], workers=workers)

            cu_rf_mg = cuRFR_mg(**cu_rf_params)
            cu_rf_mg.fit(X_train_df, y_train_df)
            cu_rf_mg_predict = cu_rf_mg.predict(X_test)

            # R^2 is not symmetric: ground truth first, predictions second.
            acc_score = r2_score(y_test, cu_rf_mg_predict)

            print(str(acc_score))

            assert acc_score >= 0.70
        finally:
            c.close()
    finally:
        cluster.close()
Пример #9
0
def test_rf_regression_dask_cpu(partitions_per_worker, client):
    """Train a distributed RF regressor and predict with ``predict_model='CPU'``.

    The dataset size scales with the number of workers. Predictions on the
    held-out samples must reach an R^2 of 0.67.
    """
    n_workers = len(client.scheduler_info()['workers'])

    X, y = make_regression(n_samples=n_workers * 2000,
                           n_features=20,
                           n_informative=10,
                           random_state=123)

    X = X.astype(np.float32)
    y = y.astype(np.float32)

    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=n_workers * 400,
                         random_state=123)

    cu_rf_params = {
        'n_estimators': 50,
        'max_depth': 16,
        'n_bins': 16,
    }

    workers = client.has_what().keys()
    n_partitions = partitions_per_worker * len(workers)

    X_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))
    X_train_df = dask_cudf.from_cudf(X_cudf, npartitions=n_partitions)

    y_cudf = cudf.Series(y_train)
    y_train_df = dask_cudf.from_cudf(y_cudf, npartitions=n_partitions)

    X_train_df, y_train_df = dask_utils.persist_across_workers(
        client, [X_train_df, y_train_df], workers=workers)

    cuml_mod = cuRFR_mg(**cu_rf_params)
    cuml_mod.fit(X_train_df, y_train_df)

    # The test features are passed as the host array produced by the split.
    cuml_mod_predict = cuml_mod.predict(X_test, predict_model='CPU')

    # R^2 is not symmetric: ground truth first, predictions second.
    acc_score = r2_score(y_test, cuml_mod_predict)

    assert acc_score >= 0.67
Пример #10
0
def test_single_input_regression(client, ignore_empty_partitions):
    """Fit a dask RF regressor on a dataset that contains one sample.

    When ``ignore_empty_partitions`` is set, or the cluster has exactly one
    worker, fitting must succeed and the prediction for the sample must
    equal its target. In every other case ``fit`` must raise
    ``ValueError``.
    """
    features, targets = make_classification(n_samples=1, n_classes=1)
    features = features.astype(np.float32)
    targets = targets.astype(np.float32)

    X, y = _prep_training_data(client, features, targets,
                               partitions_per_worker=2)
    cu_rf_mg = cuRFR_mg(n_bins=1,
                        ignore_empty_partitions=ignore_empty_partitions)

    # Failure path first: empty partitions are not ignored and more than
    # one worker is present.
    if not ignore_empty_partitions and \
       len(client.scheduler_info()['workers'].keys()) != 1:
        with pytest.raises(ValueError):
            cu_rf_mg.fit(X, y)
        return

    cu_rf_mg.fit(X, y)
    predicted = cp.asnumpy(cp.array(cu_rf_mg.predict(X).compute()))
    expected = cp.asnumpy(cp.array(y.compute()))
    assert expected[0] == predicted[0]
Пример #11
0
def test_rf_get_combined_model_right_aftter_fit(client, estimator_type):
    """Call ``get_combined_model`` directly after ``fit``.

    The test is marked as an expected failure when there are more dask
    workers than estimators. Otherwise the object returned by the
    multi-GPU estimator must be an instance of the matching single-GPU
    class (``cuRFC_sg`` for classification, ``cuRFR_sg`` for regression).
    Any other ``estimator_type`` fails the test.
    """
    n_estimators = 5

    worker_count = len(client.scheduler_info()['workers'])
    if n_estimators < worker_count:
        pytest.xfail(
            "n_estimators cannot be lower than number of dask workers")

    rf_kwargs = dict(max_features=1.0,
                     max_samples=1.0,
                     n_bins=16,
                     n_streams=1,
                     n_estimators=n_estimators,
                     max_leaves=-1,
                     max_depth=3)

    X, y = make_classification()
    X = X.astype(np.float32)

    if estimator_type == 'classification':
        cu_rf_mg = cuRFC_mg(**rf_kwargs)
        y = y.astype(np.int32)
        expected_sg_cls = cuRFC_sg
    elif estimator_type == 'regression':
        cu_rf_mg = cuRFR_mg(**rf_kwargs)
        y = y.astype(np.float32)
        expected_sg_cls = cuRFR_sg
    else:
        assert False

    X_dask, y_dask = _prep_training_data(client, X, y, partitions_per_worker=2)
    cu_rf_mg.fit(X_dask, y_dask)

    single_gpu_model = cu_rf_mg.get_combined_model()
    assert isinstance(single_gpu_model, expected_sg_cls)
Пример #12
0
def test_rf_get_json(client, estimator_type, max_depth, n_estimators):
    """Validate the JSON dump of a fitted dask random forest.

    The test is marked as an expected failure when there are more dask
    workers than estimators. Otherwise it checks that the dump is
    non-empty, that it decodes to a list with one entry per estimator, and
    that walking the decoded trees reproduces the predictions of the
    fitted estimator on the training data.
    """
    worker_count = len(client.scheduler_info()['workers'])
    if n_estimators < worker_count:
        pytest.xfail(
            "n_estimators cannot be lower than number of dask workers")

    X, y = make_classification(n_samples=350,
                               n_features=20,
                               n_clusters_per_class=1,
                               n_informative=10,
                               random_state=123,
                               n_classes=2)
    X = X.astype(np.float32)

    # Hyper-parameters common to both estimator types.
    shared_params = dict(max_features=1.0,
                         max_samples=1.0,
                         n_bins=16,
                         split_algo=0,
                         min_samples_leaf=2,
                         seed=23707,
                         n_streams=1,
                         n_estimators=n_estimators,
                         max_leaves=-1,
                         max_depth=max_depth)
    if estimator_type == 'classification':
        cu_rf_mg = cuRFC_mg(split_criterion=0, **shared_params)
        y = y.astype(np.int32)
    elif estimator_type == 'regression':
        cu_rf_mg = cuRFR_mg(**shared_params)
        y = y.astype(np.float32)
    else:
        assert False

    X_dask, y_dask = _prep_training_data(client, X, y, partitions_per_worker=2)
    cu_rf_mg.fit(X_dask, y_dask)
    json_out = cu_rf_mg.get_json()
    json_obj = json.loads(json_out)

    # Test 1: Output is non-zero
    assert json_out != ''

    # Test 2: JSON object contains correct number of trees
    assert isinstance(json_obj, list)
    assert len(json_obj) == n_estimators

    # Test 3: Traverse JSON trees and get the same predictions as cuML RF
    def predict_with_json_tree(tree, x):
        # Descend until a node without children (a leaf) is reached,
        # validating the expected keys of every split node on the way.
        node = tree
        while 'children' in node:
            assert 'split_feature' in node
            assert 'split_threshold' in node
            assert 'yes' in node
            assert 'no' in node
            goes_first = x[node['split_feature']] <= node['split_threshold']
            node = node['children'][0 if goes_first else 1]
        assert 'leaf_value' in node
        return node['leaf_value']

    def predict_with_json_rf_classifier(rf, x):
        # Returns the class with the highest vote. If there is a tie, return
        # the list of all classes with the highest vote.
        votes = np.bincount([predict_with_json_tree(tree, x) for tree in rf])
        return np.nonzero(np.equal(votes, np.max(votes)))[0]

    def predict_with_json_rf_regressor(rf, x):
        # Plain left-to-right accumulation, then the mean over all trees.
        total = 0.
        for tree in rf:
            total += predict_with_json_tree(tree, x)
        return total / len(rf)

    if estimator_type == 'classification':
        expected_pred = cu_rf_mg.predict(X_dask).astype(np.int32)
        expected_pred = expected_pred.compute().to_array()
        for idx, row in enumerate(X):
            winners = predict_with_json_rf_classifier(json_obj, row)
            assert expected_pred[idx] in winners
    elif estimator_type == 'regression':
        expected_pred = cu_rf_mg.predict(X_dask).astype(np.float32)
        expected_pred = expected_pred.compute().to_array()
        pred = np.array(
            [predict_with_json_rf_regressor(json_obj, row) for row in X],
            dtype=np.float32)
        np.testing.assert_almost_equal(pred, expected_pred, decimal=6)