Exemplo n.º 1
0
def test_rf_classification_default(datatype, column_info, nrows):
    """Check accuracy of a default-constructed cuML random forest classifier.

    A synthetic two-class problem is generated and split 80/20, then
    ``curfc()`` is fitted with its default arguments. The accuracy of the
    ``predict_model="GPU"`` predictions must be within 0.02 of the
    ``predict_model="CPU"`` accuracy and, when ``nrows`` is below 500000,
    within 0.07 of a scikit-learn random forest.

    Parameters
    ----------
    datatype
        dtype the generated feature matrix is cast to.
    column_info
        Pair ``(n_features, n_informative)`` for the generated data.
    nrows
        Number of samples to generate.
    """
    n_features, n_informative = column_info
    dataset_params = dict(n_samples=nrows,
                          n_features=n_features,
                          n_clusters_per_class=1,
                          n_informative=n_informative,
                          random_state=0,
                          n_classes=2)
    features, labels = make_classification(**dataset_params)
    features = features.astype(datatype)
    labels = labels.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, train_size=0.8, random_state=0)

    # cuML model: default construction, fit, then predict on both paths
    # (GPU first, then CPU).
    forest = curfc()
    forest.fit(X_train, y_train)
    gpu_predictions = forest.predict(X_test, predict_model="GPU")
    cpu_predictions = forest.predict(X_test, predict_model="CPU")
    gpu_accuracy = accuracy_score(y_test, gpu_predictions)
    cpu_accuracy = accuracy_score(y_test, cpu_predictions)

    # The scikit-learn reference is only trained for datasets below
    # 500000 rows.
    if nrows < 500000:
        reference = skrfc(max_depth=16, random_state=10)
        reference.fit(X_train, y_train)
        reference_accuracy = accuracy_score(y_test,
                                            reference.predict(X_test))
        assert gpu_accuracy >= (reference_accuracy - 0.07)
    assert gpu_accuracy >= (cpu_accuracy - 0.02)
Exemplo n.º 2
0
def test_rf_classification(small_clf, datatype, split_algo,
                           max_samples, max_features,
                           use_experimental_backend):
    """Check cuML RF classification accuracy and backend log messages.

    The model is fitted while stdout is captured. With
    ``use_experimental_backend`` set, the captured text must report either
    that the experimental backend is in use (``split_algo == 1``) or that
    it was rejected and the fallback taken (any other ``split_algo``).
    Without it, nothing may be printed. Afterwards GPU-path accuracy is
    compared with CPU-path accuracy (tolerance 0.02) and, for datasets
    below 500000 rows, with scikit-learn (tolerance 0.07).

    Parameters
    ----------
    small_clf
        Pair ``(X, y)`` holding the classification dataset.
    datatype
        dtype the feature matrix is cast to.
    split_algo, max_samples, max_features, use_experimental_backend
        Forwarded to the cuML classifier; ``max_features`` is also passed
        to the scikit-learn reference model.
    """
    features, labels = small_clf
    features = features.astype(datatype)
    labels = labels.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, train_size=0.8, random_state=0)

    # Handle for the cuML model; the returned stream is not needed here.
    handle, _ = get_handle(True, n_streams=1)

    forest = curfc(max_features=max_features, max_samples=max_samples,
                   n_bins=16, split_algo=split_algo, split_criterion=0,
                   min_samples_leaf=2, random_state=123, n_streams=1,
                   n_estimators=40, handle=handle, max_leaves=-1,
                   max_depth=16,
                   use_experimental_backend=use_experimental_backend)

    # Capture whatever the fit prints so the backend messages can be
    # inspected.
    sink = io.StringIO()
    with redirect_stdout(sink):
        forest.fit(X_train, y_train)
    fit_output = sink.getvalue()

    if not use_experimental_backend:
        assert fit_output == ''
    elif split_algo != 1:
        # Fallback path: both the reason and the fallback notice must be
        # printed.
        assert ('Experimental backend does not yet support histogram '
                'split algorithm' in fit_output)
        assert ('Not using the experimental backend due to above '
                'mentioned reason(s)' in fit_output)
    else:
        assert ('Using experimental backend for growing trees'
                in fit_output)

    gpu_predictions = forest.predict(X_test,
                                     predict_model="GPU",
                                     output_class=True,
                                     threshold=0.5,
                                     algo='auto')
    cpu_predictions = forest.predict(X_test, predict_model="CPU")
    # Give the GPU predictions the same shape as the CPU ones before
    # scoring.
    gpu_predictions = np.reshape(gpu_predictions, np.shape(cpu_predictions))
    cpu_accuracy = accuracy_score(y_test, cpu_predictions)
    gpu_accuracy = accuracy_score(y_test, gpu_predictions)

    if features.shape[0] < 500000:
        reference = skrfc(n_estimators=40,
                          max_depth=16,
                          min_samples_split=2, max_features=max_features,
                          random_state=10)
        reference.fit(X_train, y_train)
        reference_accuracy = accuracy_score(y_test,
                                            reference.predict(X_test))
        assert gpu_accuracy >= (reference_accuracy - 0.07)
    assert gpu_accuracy >= (cpu_accuracy - 0.02)
Exemplo n.º 3
0
def rf_classification(datatype, array_type, max_features, max_samples,
                      fixture):
    """Fit a cuML RF classifier and compare it against scikit-learn.

    The checks performed are:

    * GPU-path class predictions equal the argmax of ``predict_proba``;
    * CPU-path and GPU-path accuracies agree (``abs=0.01, rel=0.1``);
    * for datasets below 500000 labels, both accuracies are within 0.07
      of scikit-learn's and the probabilities pass
      ``check_predict_proba`` with a tolerance of 0.1.

    Parameters
    ----------
    datatype
        Indexable pair of dtypes: ``datatype[0]`` is applied to the whole
        feature matrix before splitting, ``datatype[1]`` to the test
        features afterwards.
    array_type
        ``'dataframe'`` feeds cudf objects to the cuML model; any other
        value feeds the arrays as they are.
    max_features, max_samples
        Forwarded to the cuML classifier; ``max_features`` is also passed
        to the scikit-learn reference model.
    fixture
        Pair ``(X, y)`` holding the classification dataset.
    """
    features, labels = fixture
    features = features.astype(datatype[0])
    labels = labels.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, train_size=0.8, random_state=0)
    X_test = X_test.astype(datatype[1])

    # Only the handle is used; the stream is discarded.
    handle, _ = get_handle(True, n_streams=1)
    forest = curfc(max_features=max_features, max_samples=max_samples,
                   n_bins=16, split_criterion=0,
                   min_samples_leaf=2, random_state=123,
                   n_estimators=40, handle=handle, max_leaves=-1,
                   max_depth=16)

    if array_type == 'dataframe':
        # cudf inputs: outputs are converted back to host arrays for the
        # comparisons below.
        train_frame = cudf.DataFrame(X_train)
        train_labels = cudf.Series(y_train)
        test_frame = cudf.DataFrame(X_test)
        forest.fit(train_frame, train_labels)
        gpu_proba_frame = forest.predict_proba(test_frame)
        cu_proba_gpu = np.array(gpu_proba_frame.as_gpu_matrix())
        cpu_series = forest.predict(test_frame, predict_model="CPU")
        cu_preds_cpu = cpu_series.to_array()
        gpu_series = forest.predict(test_frame, predict_model="GPU")
        cu_preds_gpu = gpu_series.to_array()
    else:
        forest.fit(X_train, y_train)
        cu_proba_gpu = forest.predict_proba(X_test)
        cu_preds_cpu = forest.predict(X_test, predict_model="CPU")
        cu_preds_gpu = forest.predict(X_test, predict_model="GPU")

    # Predicted classes must match the most probable class.
    most_probable = np.argmax(cu_proba_gpu, axis=1)
    np.testing.assert_array_equal(cu_preds_gpu, most_probable)

    cpu_accuracy = accuracy_score(y_test, cu_preds_cpu)
    gpu_accuracy = accuracy_score(y_test, cu_preds_gpu)
    assert cpu_accuracy == pytest.approx(gpu_accuracy, abs=0.01, rel=0.1)

    # scikit-learn reference, only for datasets below 500000 labels.
    if labels.size < 500000:
        reference = skrfc(n_estimators=40,
                          max_depth=16,
                          min_samples_split=2, max_features=max_features,
                          random_state=10)
        reference.fit(X_train, y_train)
        reference_preds = reference.predict(X_test)
        reference_accuracy = accuracy_score(y_test, reference_preds)
        reference_proba = reference.predict_proba(X_test)
        assert cpu_accuracy >= reference_accuracy - 0.07
        assert gpu_accuracy >= reference_accuracy - 0.07
        # 0.06 is the highest relative error observed on CI, within
        # 0.0061 absolute error boundaries seen previously
        check_predict_proba(cu_proba_gpu, reference_proba, y_test, 0.1)
Exemplo n.º 4
0
def test_rf_classification(datatype, split_algo, rows_sample, nrows,
                           column_info, max_features):
    """Check cuML RF classification accuracy on generated binary data.

    A synthetic two-class dataset is generated and split 80/20. GPU-path
    accuracy must be within 0.02 of CPU-path accuracy and, when ``nrows``
    is below 500000, within 0.07 of a scikit-learn random forest.

    Parameters
    ----------
    datatype
        dtype the generated feature matrix is cast to.
    split_algo, rows_sample, max_features
        Forwarded to the cuML classifier; ``max_features`` is also passed
        to the scikit-learn reference model.
    nrows
        Number of samples to generate.
    column_info
        Pair ``(n_features, n_informative)`` for the generated data.
    """
    n_features, n_informative = column_info

    features, labels = make_classification(n_samples=nrows,
                                           n_features=n_features,
                                           n_clusters_per_class=1,
                                           n_informative=n_informative,
                                           random_state=123,
                                           n_classes=2)
    features = features.astype(datatype)
    labels = labels.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, train_size=0.8, random_state=0)

    # Handle for the cuML model; the returned stream is not needed here.
    handle, _ = get_handle(True, n_streams=1)

    forest_params = dict(max_features=max_features,
                         rows_sample=rows_sample,
                         n_bins=16,
                         split_algo=split_algo,
                         split_criterion=0,
                         min_rows_per_node=2,
                         seed=123,
                         n_streams=1,
                         n_estimators=40,
                         handle=handle,
                         max_leaves=-1,
                         max_depth=16)
    forest = curfc(**forest_params)
    forest.fit(X_train, y_train)

    gpu_predictions = forest.predict(X_test,
                                     predict_model="GPU",
                                     output_class=True,
                                     threshold=0.5,
                                     algo='BATCH_TREE_REORG')
    cpu_predictions = forest.predict(X_test, predict_model="CPU")
    cpu_accuracy = accuracy_score(y_test, cpu_predictions)
    gpu_accuracy = accuracy_score(y_test, gpu_predictions)

    # scikit-learn reference, only for datasets below 500000 rows.
    if nrows < 500000:
        reference = skrfc(n_estimators=40,
                          max_depth=16,
                          min_samples_split=2,
                          max_features=max_features,
                          random_state=10)
        reference.fit(X_train, y_train)
        reference_accuracy = accuracy_score(y_test,
                                            reference.predict(X_test))
        assert gpu_accuracy >= (reference_accuracy - 0.07)
    assert gpu_accuracy >= (cpu_accuracy - 0.02)
Exemplo n.º 5
0
def test_rf_classification_proba(datatype, split_algo, rows_sample, nrows,
                                 column_info, max_features):
    """Compare cuML RF class probabilities with scikit-learn via MSE.

    The true labels are expanded into a two-column probability target
    (column 1 holds the label, column 0 its complement). When ``nrows`` is
    below 500000, the mean squared error of cuML's ``predict_proba``
    against that target may exceed scikit-learn's by at most 0.0061.
    For larger datasets nothing is asserted.

    Parameters
    ----------
    datatype
        dtype the generated feature matrix is cast to.
    split_algo, rows_sample, max_features
        Forwarded to the cuML classifier; ``max_features`` is also passed
        to the scikit-learn reference model.
    nrows
        Number of samples to generate.
    column_info
        Pair ``(n_features, n_informative)`` for the generated data.
    """
    n_features, n_informative = column_info

    features, labels = make_classification(n_samples=nrows,
                                           n_features=n_features,
                                           n_clusters_per_class=1,
                                           n_informative=n_informative,
                                           random_state=123,
                                           n_classes=2)
    features = features.astype(datatype)
    labels = labels.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, train_size=0.8, random_state=0)

    # Handle for the cuML model; the returned stream is not needed here.
    handle, _ = get_handle(True, n_streams=1)

    forest_params = dict(max_features=max_features,
                         rows_sample=rows_sample,
                         n_bins=16,
                         split_algo=split_algo,
                         split_criterion=0,
                         min_rows_per_node=2,
                         seed=123,
                         n_streams=1,
                         n_estimators=40,
                         handle=handle,
                         max_leaves=-1,
                         max_depth=16)
    forest = curfc(**forest_params)
    forest.fit(X_train, y_train)
    cuml_proba = forest.predict_proba(X_test,
                                      output_class=True,
                                      threshold=0.5,
                                      algo='auto')

    # Probability target with the same shape as the predictions.
    target_proba = np.zeros(np.shape(cuml_proba))
    target_proba[:, 1] = y_test
    target_proba[:, 0] = 1.0 - y_test
    cuml_mse = mean_squared_error(target_proba, cuml_proba)

    if nrows < 500000:
        reference = skrfc(n_estimators=40,
                          max_depth=16,
                          min_samples_split=2,
                          max_features=max_features,
                          random_state=10)
        reference.fit(X_train, y_train)
        reference_proba = reference.predict_proba(X_test)
        reference_mse = mean_squared_error(target_proba, reference_proba)
        # A maximum difference of 0.0061 has been seen between the MSE
        # values of the cuML and scikit-learn predict_proba outputs.
        assert cuml_mse <= (reference_mse + 0.0061)
Exemplo n.º 6
0
def test_rf_classification(small_clf, datatype, max_samples, max_features):
    """Check cuML RF classification accuracy against CPU path and sklearn.

    GPU-path accuracy must be within 0.07 of CPU-path accuracy and, for
    datasets below 500000 rows, within 0.07 of a scikit-learn random
    forest.

    Parameters
    ----------
    small_clf
        Pair ``(X, y)`` holding the classification dataset.
    datatype
        dtype the feature matrix is cast to.
    max_samples, max_features
        Forwarded to the cuML classifier; ``max_features`` is also passed
        to the scikit-learn reference model.
    """
    features, labels = small_clf
    features = features.astype(datatype)
    labels = labels.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, train_size=0.8, random_state=0)

    # Handle for the cuML model; the returned stream is not needed here.
    handle, _ = get_handle(True, n_streams=1)

    forest = curfc(max_features=max_features, max_samples=max_samples,
                   n_bins=16, split_criterion=0, min_samples_leaf=2,
                   random_state=123, n_streams=1, n_estimators=40,
                   handle=handle, max_leaves=-1, max_depth=16)
    forest.fit(X_train, y_train)

    gpu_predictions = forest.predict(X_test, predict_model="GPU",
                                     threshold=0.5, algo="auto")
    cpu_predictions = forest.predict(X_test, predict_model="CPU")
    # Give the GPU predictions the same shape as the CPU ones before
    # scoring.
    gpu_predictions = np.reshape(gpu_predictions, np.shape(cpu_predictions))
    cpu_accuracy = accuracy_score(y_test, cpu_predictions)
    gpu_accuracy = accuracy_score(y_test, gpu_predictions)

    if features.shape[0] < 500000:
        reference = skrfc(n_estimators=40, max_depth=16,
                          min_samples_split=2, max_features=max_features,
                          random_state=10)
        reference.fit(X_train, y_train)
        reference_accuracy = accuracy_score(y_test,
                                            reference.predict(X_test))
        assert gpu_accuracy >= (reference_accuracy - 0.07)

    # Tolerance to be changed to 0.02; see issue #3910:
    # https://github.com/rapidsai/cuml/issues/3910
    assert gpu_accuracy >= (cpu_accuracy - 0.07)
Exemplo n.º 7
0
def test_rf_classification_dask_fil_predict_proba(partitions_per_worker,
                                                  cluster):
    """Compare multi-GPU cuML RF class probabilities with scikit-learn.

    A ``Client`` is opened on ``cluster`` and always closed again. A
    1000-sample binary problem is generated with 100 samples held out for
    testing. The mean squared error of the distributed model's
    ``predict_proba`` against the label-derived probability target may
    exceed scikit-learn's by at most 0.022.

    Parameters
    ----------
    partitions_per_worker
        Forwarded to ``_prep_training_data`` when distributing the data.
    cluster
        Object the ``Client`` connects to.
    """
    client = Client(cluster)

    try:
        features, labels = make_classification(n_samples=1000,
                                               n_features=30,
                                               n_clusters_per_class=1,
                                               n_informative=20,
                                               random_state=123,
                                               n_classes=2)
        features = features.astype(np.float32)
        labels = labels.astype(np.int32)

        X_train, X_test, y_train, y_test = train_test_split(
            features, labels, test_size=100, random_state=123)

        X_train_df, y_train_df = _prep_training_data(
            client, X_train, y_train, partitions_per_worker)
        # Only the distributed test features are needed.
        X_test_df, _ = _prep_training_data(
            client, X_test, y_test, partitions_per_worker)

        distributed_forest = cuRFC_mg(n_bins=16,
                                      n_streams=1,
                                      n_estimators=40,
                                      max_depth=16)
        distributed_forest.fit(X_train_df, y_train_df)

        cuml_proba = distributed_forest.predict_proba(X_test_df).compute()
        cuml_proba = cp.asnumpy(cuml_proba.to_gpu_matrix())

        # Probability target: column 1 is the label, column 0 its
        # complement.
        target_proba = np.zeros(np.shape(cuml_proba))
        target_proba[:, 1] = y_test
        target_proba[:, 0] = 1.0 - y_test
        cuml_mse = mean_squared_error(target_proba, cuml_proba)

        reference = skrfc(n_estimators=40, max_depth=16, random_state=10)
        reference.fit(X_train, y_train)
        reference_proba = reference.predict_proba(X_test)
        reference_mse = mean_squared_error(target_proba, reference_proba)

        # The threshold is required as the test would intermittently
        # fail with a max difference of 0.022 between the two mse values
        assert cuml_mse <= reference_mse + 0.022

    finally:
        client.close()
Exemplo n.º 8
0
def test_rf_classification_float64(small_clf, datatype, convert_dtype):
    """Check CPU/GPU prediction behaviour for float32 and float64 inputs.

    A default ``curfc()`` model is fitted and its CPU-path accuracy is
    compared with scikit-learn (tolerance 0.07, datasets below 500000 rows
    only). Then:

    * if the training dtype is ``np.float32`` and ``convert_dtype`` is
      truthy, GPU-path accuracy must be within 0.07 of CPU-path accuracy;
    * otherwise, if the test dtype is ``np.float64``, requesting GPU
      prediction must emit the "Defaulting to CPU-based Prediction"
      warning;
    * in the remaining case nothing further is checked.

    Parameters
    ----------
    small_clf
        Pair ``(X, y)`` holding the classification dataset.
    datatype
        Indexable pair of dtypes: ``datatype[0]`` is applied to the whole
        feature matrix before splitting, ``datatype[1]`` to the test
        features afterwards.
    convert_dtype
        Forwarded to the GPU-path ``predict`` call.
    """
    X, y = small_clf
    X = X.astype(datatype[0])
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )
    X_test = X_test.astype(datatype[1])

    # Initialize, fit and predict using cuML's
    # random forest classification model
    cuml_model = curfc()
    cuml_model.fit(X_train, y_train)
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cu_acc = accuracy_score(y_test, cu_preds)

    # sklearn random forest classification model
    # initialization, fit and predict
    if X.shape[0] < 500000:
        sk_model = skrfc(max_depth=16, random_state=10)
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_acc = accuracy_score(y_test, sk_preds)
        assert cu_acc >= (sk_acc - 0.07)

    # predict using cuML's GPU based prediction
    if datatype[0] == np.float32 and convert_dtype:
        fil_preds = cuml_model.predict(
            X_test, predict_model="GPU", convert_dtype=convert_dtype
        )
        fil_preds = np.reshape(fil_preds, np.shape(cu_preds))

        fil_acc = accuracy_score(y_test, fil_preds)
        # Tolerance to be changed to 0.02; see issue #3910:
        # https://github.com/rapidsai/cuml/issues/3910
        assert fil_acc >= (cu_acc - 0.07)
    # if GPU predict cannot be used, display warning and use CPU predict
    elif datatype[1] == np.float64:
        expected_warning = ("GPU based predict only accepts "
                            "np.float32 data. The model was "
                            "trained on np.float64 data hence "
                            "cannot use GPU-based prediction! "
                            "\nDefaulting to CPU-based Prediction. "
                            "\nTo predict on float-64 data, set "
                            "parameter predict_model = 'CPU'")
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")
            cuml_model.predict(
                X_test, predict_model="GPU",
                convert_dtype=convert_dtype
            )
        # Search every recorded warning instead of only the last one: an
        # unrelated warning emitted afterwards must not hide the expected
        # message, and an empty record must fail the assertion rather
        # than raise IndexError.
        messages = [str(item.message) for item in caught]
        assert any(expected_warning in text for text in messages), (
            "expected CPU-fallback warning was not emitted; "
            "recorded warnings: {}".format(messages)
        )
Exemplo n.º 9
0
def test_rf_classification_dask_fil_predict_proba(partitions_per_worker,
                                                  client):
    """Compare multi-GPU cuML RF class probabilities with scikit-learn.

    The dataset size scales with the number of workers reported by
    ``client.scheduler_info()``: 1500 samples per worker, of which 150 per
    worker are held out for testing. Two properties are checked:

    * the distributed model's class predictions equal the argmax of its
      ``predict_proba`` output;
    * the mean squared error of those probabilities against the
      label-derived target exceeds scikit-learn's by at most 0.029.

    Parameters
    ----------
    partitions_per_worker
        Forwarded to ``_prep_training_data`` when distributing the data.
    client
        Client used to query the workers and distribute the data.
    """
    worker_count = len(client.scheduler_info()['workers'])

    features, labels = make_classification(n_samples=worker_count * 1500,
                                           n_features=30,
                                           n_clusters_per_class=1,
                                           n_informative=20,
                                           random_state=123,
                                           n_classes=2)
    features = features.astype(np.float32)
    labels = labels.astype(np.int32)

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=worker_count * 150, random_state=123)

    X_train_df, y_train_df = _prep_training_data(
        client, X_train, y_train, partitions_per_worker)
    # Only the distributed test features are needed.
    X_test_df, _ = _prep_training_data(
        client, X_test, y_test, partitions_per_worker)

    distributed_forest = cuRFC_mg(n_bins=16,
                                  n_streams=1,
                                  n_estimators=40,
                                  max_depth=16)
    distributed_forest.fit(X_train_df, y_train_df)

    cuml_classes = distributed_forest.predict(X_test_df).compute()
    cuml_classes = cuml_classes.to_numpy()
    cuml_proba = distributed_forest.predict_proba(X_test_df).compute()
    cuml_proba = cuml_proba.to_numpy()
    # Predicted classes must match the most probable class.
    np.testing.assert_equal(cuml_classes, np.argmax(cuml_proba, axis=1))

    # Probability target: column 1 is the label, column 0 its complement.
    target_proba = np.zeros(np.shape(cuml_proba))
    target_proba[:, 1] = y_test
    target_proba[:, 0] = 1.0 - y_test
    cuml_mse = mean_squared_error(target_proba, cuml_proba)

    reference = skrfc(n_estimators=40, max_depth=16, random_state=10)
    reference.fit(X_train, y_train)
    reference_proba = reference.predict_proba(X_test)
    reference_mse = mean_squared_error(target_proba, reference_proba)

    # The threshold is required as the test would intermittently
    # fail with a max difference of 0.029 between the two mse values
    assert cuml_mse <= reference_mse + 0.029
Exemplo n.º 10
0
def test_rf_classification(small_clf, datatype, split_algo, rows_sample,
                           max_features):
    """Check cuML RF classification accuracy against CPU path and sklearn.

    GPU-path accuracy must be within 0.02 of CPU-path accuracy and, for
    datasets below 500000 rows, within 0.07 of a scikit-learn random
    forest.

    Parameters
    ----------
    small_clf
        Pair ``(X, y)`` holding the classification dataset.
    datatype
        dtype the feature matrix is cast to.
    split_algo, rows_sample, max_features
        Forwarded to the cuML classifier; ``max_features`` is also passed
        to the scikit-learn reference model.
    """
    features, labels = small_clf
    features = features.astype(datatype)
    labels = labels.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, train_size=0.8, random_state=0)

    # Handle for the cuML model; the returned stream is not needed here.
    handle, _ = get_handle(True, n_streams=1)

    forest_params = dict(max_features=max_features,
                         rows_sample=rows_sample,
                         n_bins=16,
                         split_algo=split_algo,
                         split_criterion=0,
                         min_rows_per_node=2,
                         random_state=123,
                         n_streams=1,
                         n_estimators=40,
                         handle=handle,
                         max_leaves=-1,
                         max_depth=16)
    forest = curfc(**forest_params)
    forest.fit(X_train, y_train)

    gpu_predictions = forest.predict(X_test,
                                     predict_model="GPU",
                                     output_class=True,
                                     threshold=0.5,
                                     algo='auto')
    cpu_predictions = forest.predict(X_test, predict_model="CPU")
    # Give the GPU predictions the same shape as the CPU ones before
    # scoring.
    gpu_predictions = np.reshape(gpu_predictions, np.shape(cpu_predictions))
    cpu_accuracy = accuracy_score(y_test, cpu_predictions)
    gpu_accuracy = accuracy_score(y_test, gpu_predictions)

    if features.shape[0] < 500000:
        reference = skrfc(n_estimators=40,
                          max_depth=16,
                          min_samples_split=2,
                          max_features=max_features,
                          random_state=10)
        reference.fit(X_train, y_train)
        reference_accuracy = accuracy_score(y_test,
                                            reference.predict(X_test))
        assert gpu_accuracy >= (reference_accuracy - 0.07)
    assert gpu_accuracy >= (cpu_accuracy - 0.02)
Exemplo n.º 11
0
def test_rf_classification_float64(datatype, column_info, nrows,
                                   convert_dtype):
    """Check CPU/GPU prediction behaviour for float32 and float64 inputs.

    A default ``curfc()`` model is fitted on generated binary data and its
    CPU-path accuracy is compared with scikit-learn (tolerance 0.07, only
    when ``nrows`` is below 500000). If the training dtype is
    ``np.float32`` and ``convert_dtype`` is truthy, GPU-path accuracy must
    be within 0.02 of CPU-path accuracy; in every other case GPU
    prediction is expected to raise ``TypeError``.

    Parameters
    ----------
    datatype
        Indexable pair of dtypes: ``datatype[0]`` is applied to the whole
        feature matrix before splitting, ``datatype[1]`` to the test
        features afterwards.
    column_info
        Pair ``(n_features, n_informative)`` for the generated data.
    nrows
        Number of samples to generate.
    convert_dtype
        Forwarded to the GPU-path ``predict`` call.
    """
    n_features, n_informative = column_info
    features, labels = make_classification(n_samples=nrows,
                                           n_features=n_features,
                                           n_clusters_per_class=1,
                                           n_informative=n_informative,
                                           random_state=0,
                                           n_classes=2)
    features = features.astype(datatype[0])
    labels = labels.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, train_size=0.8, random_state=0)
    X_test = X_test.astype(datatype[1])

    # cuML model with default parameters; CPU-path prediction first.
    forest = curfc()
    forest.fit(X_train, y_train)
    cpu_predictions = forest.predict(X_test, predict_model="CPU")
    cpu_accuracy = accuracy_score(y_test, cpu_predictions)

    # scikit-learn reference, only for datasets below 500000 rows.
    if nrows < 500000:
        reference = skrfc(max_depth=16, random_state=10)
        reference.fit(X_train, y_train)
        reference_accuracy = accuracy_score(y_test,
                                            reference.predict(X_test))
        assert cpu_accuracy >= (reference_accuracy - 0.07)

    gpu_kwargs = dict(predict_model="GPU", convert_dtype=convert_dtype)
    gpu_path_expected = datatype[0] == np.float32 and convert_dtype
    if not gpu_path_expected:
        # GPU prediction must be rejected for this dtype combination.
        with pytest.raises(TypeError):
            forest.predict(X_test, **gpu_kwargs)
    else:
        gpu_predictions = forest.predict(X_test, **gpu_kwargs)
        gpu_predictions = np.reshape(gpu_predictions,
                                     np.shape(cpu_predictions))
        gpu_accuracy = accuracy_score(y_test, gpu_predictions)
        assert gpu_accuracy >= (cpu_accuracy - 0.02)
Exemplo n.º 12
0
def test_rf_predict_numpy(datatype, use_handle, split_algo, n_info, nrows,
                          ncols):
    """Check cuML RF prediction accuracy on a five-class numpy dataset.

    The first 80% of the generated rows are used for training and the
    remainder for testing. When ``nrows`` is below 500000 the cuML
    accuracy must be within 0.07 of a scikit-learn random forest; for
    larger datasets the model is only fitted and used for prediction.

    Parameters
    ----------
    datatype
        dtype the feature arrays are cast to.
    use_handle
        Forwarded to ``get_handle``.
    split_algo
        Forwarded to the cuML classifier.
    n_info, nrows, ncols
        Number of informative features, samples and features generated.
    """
    split_at = np.int32(nrows * 0.8)
    features, labels = make_classification(n_samples=nrows,
                                           n_features=ncols,
                                           n_clusters_per_class=1,
                                           n_informative=n_info,
                                           random_state=123,
                                           n_classes=5)
    # Positional split: leading rows train, trailing rows test.
    X_train = np.asarray(features[:split_at, :]).astype(datatype)
    y_train = np.asarray(labels[:split_at]).astype(np.int32)
    X_test = np.asarray(features[split_at:, :]).astype(datatype)
    y_test = np.asarray(labels[split_at:]).astype(np.int32)

    # Handle for the cuML model; the returned stream is not needed here.
    handle, _ = get_handle(use_handle)

    forest = curfc(max_features=1.0,
                   n_bins=8,
                   split_algo=split_algo,
                   min_rows_per_node=2,
                   n_estimators=30,
                   handle=handle,
                   max_leaves=-1)
    forest.fit(X_train, y_train)
    cuml_accuracy = accuracy_score(y_test, forest.predict(X_test))

    if nrows >= 500000:
        # No scikit-learn comparison for large datasets.
        return

    reference = skrfc(n_estimators=40,
                      max_depth=None,
                      min_samples_split=2,
                      max_features=1.0,
                      random_state=10)
    reference.fit(X_train, y_train)
    reference_accuracy = accuracy_score(y_test, reference.predict(X_test))

    # compare the accuracy of the two models
    assert cuml_accuracy >= (reference_accuracy - 0.07)
Exemplo n.º 13
0
def test_rf_classification(datatype, split_algo,
                           n_info, nrows, ncols,
                           max_depth, rows_sample):
    """Check cuML RF accuracy on five-class data for several depths.

    The test is marked xfail when ``split_algo == 1`` is combined with a
    negative ``max_depth``. Otherwise the first 80% of the generated rows
    train the model and the rest test it. When ``nrows`` is below 500000 a
    scikit-learn forest is trained as well, and for ``max_depth > 1`` the
    cuML accuracy must be within 0.07 of it.

    Parameters
    ----------
    datatype
        dtype the feature arrays are cast to.
    split_algo, rows_sample
        Forwarded to the cuML classifier.
    n_info, nrows, ncols
        Number of informative features, samples and features generated.
    max_depth
        Depth limit for the cuML model; non-positive values map to
        ``None`` for the scikit-learn model.
    """
    if split_algo == 1 and max_depth < 0:
        pytest.xfail("Unlimited depth not supported with quantile")

    split_at = np.int32(nrows*0.8)
    features, labels = make_classification(
        n_samples=nrows, n_features=ncols,
        n_clusters_per_class=1, n_informative=n_info,
        random_state=123, n_classes=5)
    # Positional split: leading rows train, trailing rows test.
    X_train = np.asarray(features[:split_at, :]).astype(datatype)
    y_train = np.asarray(labels[:split_at]).astype(np.int32)
    X_test = np.asarray(features[split_at:, :]).astype(datatype)
    y_test = np.asarray(labels[split_at:]).astype(np.int32)

    # Handle for the cuML model; the returned stream is not needed here.
    handle, _ = get_handle(True)

    forest = curfc(max_features=1.0, rows_sample=rows_sample,
                   n_bins=8, split_algo=split_algo, split_criterion=0,
                   min_rows_per_node=2,
                   n_estimators=40, handle=handle, max_leaves=-1,
                   max_depth=max_depth)
    forest.fit(X_train, y_train)
    cuml_accuracy = accuracy_score(y_test, forest.predict(X_test))

    if nrows < 500000:
        # scikit-learn expresses "no depth limit" as None.
        reference_depth = max_depth if max_depth > 0 else None
        reference = skrfc(n_estimators=40,
                          max_depth=reference_depth,
                          min_samples_split=2, max_features=1.0,
                          random_state=10)
        reference.fit(X_train, y_train)
        reference_accuracy = accuracy_score(y_test,
                                            reference.predict(X_test))

        # compare the accuracy of the two models
        if max_depth > 1:
            assert cuml_accuracy >= (reference_accuracy - 0.07)
Exemplo n.º 14
0
def test_rf_classification_multi_class(datatype, column_info, nrows, n_classes,
                                       type):
    """Check default cuML RF CPU-path accuracy on multi-class data.

    A default ``curfc()`` model is fitted either on cudf objects or on the
    arrays directly, and its CPU-path accuracy must be within 0.07 of a
    scikit-learn random forest when ``nrows`` is below 500000.

    Parameters
    ----------
    datatype
        Indexable pair of dtypes: ``datatype[0]`` is applied to the whole
        feature matrix before splitting, ``datatype[1]`` to the test
        features afterwards.
    column_info
        Pair ``(n_features, n_informative)`` for the generated data.
    nrows
        Number of samples to generate.
    n_classes
        Number of classes to generate.
    type
        ``'dataframe'`` feeds cudf objects to the cuML model; any other
        value feeds the arrays as they are. (The name shadows the builtin
        but is part of the test's signature.)
    """
    n_features, n_informative = column_info
    features, labels = make_classification(n_samples=nrows,
                                           n_features=n_features,
                                           n_clusters_per_class=1,
                                           n_informative=n_informative,
                                           random_state=0,
                                           n_classes=n_classes)
    features = features.astype(datatype[0])
    labels = labels.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, train_size=0.8, random_state=0)
    X_test = X_test.astype(datatype[1])

    forest = curfc()
    if type == 'dataframe':
        # Move the data to the device and wrap it in cudf containers.
        train_frame = cudf.DataFrame.from_gpu_matrix(
            rmm.to_device(X_train))
        train_labels = cudf.Series(y_train)
        test_frame = cudf.DataFrame.from_gpu_matrix(rmm.to_device(X_test))
        forest.fit(train_frame, train_labels)
        cpu_series = forest.predict(test_frame, predict_model="CPU")
        cpu_predictions = cpu_series.to_array()
    else:
        forest.fit(X_train, y_train)
        cpu_predictions = forest.predict(X_test, predict_model="CPU")

    cuml_accuracy = accuracy_score(y_test, cpu_predictions)

    # scikit-learn reference, only for datasets below 500000 rows.
    if nrows < 500000:
        reference = skrfc(max_depth=16, random_state=10)
        reference.fit(X_train, y_train)
        reference_accuracy = accuracy_score(y_test,
                                            reference.predict(X_test))
        assert cuml_accuracy >= (reference_accuracy - 0.07)
Exemplo n.º 15
0
def test_rf_classification_float64(small_clf, datatype, convert_dtype):
    """Check CPU/GPU prediction behaviour for float32 and float64 inputs.

    A default ``curfc()`` model is fitted and its CPU-path accuracy is
    compared with scikit-learn (tolerance 0.07, datasets below 500000 rows
    only). If the training dtype is ``np.float32`` and ``convert_dtype``
    is truthy, GPU-path accuracy must be within 0.02 of CPU-path accuracy;
    in every other case GPU prediction is expected to raise ``TypeError``.

    Parameters
    ----------
    small_clf
        Pair ``(X, y)`` holding the classification dataset.
    datatype
        Indexable pair of dtypes: ``datatype[0]`` is applied to the whole
        feature matrix before splitting, ``datatype[1]`` to the test
        features afterwards.
    convert_dtype
        Forwarded to the GPU-path ``predict`` call.
    """
    features, labels = small_clf
    features = features.astype(datatype[0])
    labels = labels.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, train_size=0.8, random_state=0)
    X_test = X_test.astype(datatype[1])

    # cuML model with default parameters; CPU-path prediction first.
    forest = curfc()
    forest.fit(X_train, y_train)
    cpu_predictions = forest.predict(X_test, predict_model="CPU")
    cpu_accuracy = accuracy_score(y_test, cpu_predictions)

    # scikit-learn reference, only for datasets below 500000 rows.
    if features.shape[0] < 500000:
        reference = skrfc(max_depth=16, random_state=10)
        reference.fit(X_train, y_train)
        reference_accuracy = accuracy_score(y_test,
                                            reference.predict(X_test))
        assert cpu_accuracy >= (reference_accuracy - 0.07)

    gpu_kwargs = dict(predict_model="GPU", convert_dtype=convert_dtype)
    gpu_path_expected = datatype[0] == np.float32 and convert_dtype
    if not gpu_path_expected:
        # GPU prediction must be rejected for this dtype combination.
        with pytest.raises(TypeError):
            forest.predict(X_test, **gpu_kwargs)
    else:
        gpu_predictions = forest.predict(X_test, **gpu_kwargs)
        gpu_predictions = np.reshape(gpu_predictions,
                                     np.shape(cpu_predictions))
        gpu_accuracy = accuracy_score(y_test, gpu_predictions)
        assert gpu_accuracy >= (cpu_accuracy - 0.02)
Exemplo n.º 16
0
def test_rf_classification_float64(small_clf, datatype, convert_dtype):
    """Exercise cuML random forest prediction with mixed train/test dtypes.

    ``datatype`` is indexed as a pair: the full feature matrix is cast to
    ``datatype[0]`` before splitting, and the held-out features are re-cast
    to ``datatype[1]`` afterwards.  CPU predictions are compared with
    scikit-learn, and the GPU predictions (requested with the given
    ``convert_dtype``) must stay within 0.07 of the CPU accuracy.
    """
    features, labels = small_clf
    features = features.astype(datatype[0])
    labels = labels.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, train_size=0.8, random_state=0)
    X_test = X_test.astype(datatype[1])

    # Fit cuML's random forest classifier and score its CPU predictions
    cuml_model = curfc()
    cuml_model.fit(X_train, y_train)
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cu_acc = accuracy_score(y_test, cu_preds)

    # Reference check against scikit-learn, only run when the dataset
    # has fewer than 500000 rows
    if features.shape[0] < 500000:
        sk_model = skrfc(max_depth=16, random_state=10)
        sk_model.fit(X_train, y_train)
        sk_acc = accuracy_score(y_test, sk_model.predict(X_test))
        assert cu_acc >= (sk_acc - 0.07)

    # Score cuML's GPU based prediction, reshaped to match the CPU output
    gpu_raw = cuml_model.predict(X_test,
                                 predict_model="GPU",
                                 convert_dtype=convert_dtype)
    fil_preds = np.reshape(gpu_raw, np.shape(cu_preds))
    fil_acc = accuracy_score(y_test, fil_preds)

    # Tolerance is to be changed to 0.02; see issue #3910:
    # https://github.com/rapidsai/cuml/issues/3910
    assert fil_acc >= (cu_acc - 0.07)
Exemplo n.º 17
0
def test_rf_classification_sparse(small_clf, datatype,
                                  fil_sparse_format, algo):
    """Check GPU prediction of a cuML random forest across FIL settings.

    A 50-tree classifier is trained on ``small_clf`` cast to ``datatype``.
    Combinations the test treats as unsupported (a falsy
    ``fil_sparse_format``, the ``'tree_reorg'`` / ``'batch_tree_reorg'``
    algos, or the ``'not_supported'`` format) must raise ``ValueError``.
    Otherwise the GPU predictions must score exactly like the converted
    FIL model, the treelite conversion must report the expected tree and
    feature counts, and accuracy must be within 0.07 of scikit-learn.
    """
    num_trees = 50

    X, y = small_clf
    X = X.astype(datatype)
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0)

    # Create a handle for the cuml model
    handle, _ = get_handle(True, n_streams=1)

    # Fit cuML's random forest classification model
    cuml_model = curfc(n_bins=16,
                       split_criterion=0,
                       min_samples_leaf=2,
                       random_state=123,
                       n_streams=1,
                       n_estimators=num_trees,
                       handle=handle,
                       max_leaves=-1,
                       max_depth=40)
    cuml_model.fit(X_train, y_train)

    # Keyword arguments shared by the failing and succeeding GPU predict
    gpu_predict_args = dict(predict_model="GPU",
                            output_class=True,
                            threshold=0.5,
                            fil_sparse_format=fil_sparse_format,
                            algo=algo)

    expect_value_error = (not fil_sparse_format
                          or algo in ('tree_reorg', 'batch_tree_reorg')
                          or fil_sparse_format == 'not_supported')
    if expect_value_error:
        with pytest.raises(ValueError):
            cuml_model.predict(X_test, **gpu_predict_args)
        return

    fil_preds = cuml_model.predict(X_test, **gpu_predict_args)
    fil_preds = np.reshape(fil_preds, np.shape(y_test))
    fil_acc = accuracy_score(y_test, fil_preds)

    # The standalone FIL model must reproduce the same accuracy
    fil_model = cuml_model.convert_to_fil_model()
    with cuml.using_output_type("numpy"):
        fil_model_preds = fil_model.predict(X_test)
        fil_model_acc = accuracy_score(y_test, fil_model_preds)
        assert fil_acc == fil_model_acc

    # The treelite conversion must keep the tree and feature counts
    tl_model = cuml_model.convert_to_treelite_model()
    assert num_trees == tl_model.num_trees
    assert X.shape[1] == tl_model.num_features

    # Reference check against scikit-learn, only run when the dataset
    # has fewer than 500000 rows
    if X.shape[0] < 500000:
        sk_model = skrfc(n_estimators=50,
                         max_depth=40,
                         min_samples_split=2,
                         random_state=10)
        sk_model.fit(X_train, y_train)
        sk_acc = accuracy_score(y_test, sk_model.predict(X_test))
        assert fil_acc >= (sk_acc - 0.07)
Exemplo n.º 18
0
def test_rf_classification_sparse(datatype, split_algo, rows_sample, nrows,
                                  column_info, max_features, fil_sparse_format,
                                  algo):
    """Check GPU prediction of a cuML random forest across FIL settings.

    A two-class dataset with ``nrows`` samples is generated from
    ``column_info`` (unpacked as feature count and informative-feature
    count) and a 50-tree classifier is trained on it.  Combinations the
    test treats as unsupported (a falsy ``fil_sparse_format``, the
    ``'tree_reorg'`` / ``'batch_tree_reorg'`` algos, or the
    ``'not_supported'`` format) must raise ``ValueError``.  Otherwise the
    GPU predictions must score exactly like the converted FIL model, the
    treelite conversion must report the expected tree and feature counts,
    and accuracy must be within 0.07 of scikit-learn and within 0.02 of
    cuML's own CPU predictions.
    """
    ncols, n_info = column_info
    num_trees = 50

    X, y = make_classification(n_samples=nrows,
                               n_features=ncols,
                               n_clusters_per_class=1,
                               n_informative=n_info,
                               random_state=123,
                               n_classes=2)
    X = X.astype(datatype)
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0)

    # Create a handle for the cuml model
    handle, _ = get_handle(True, n_streams=1)

    # Fit cuML's random forest classification model and score its
    # CPU predictions
    cuml_model = curfc(max_features=max_features,
                       rows_sample=rows_sample,
                       n_bins=16,
                       split_algo=split_algo,
                       split_criterion=0,
                       min_rows_per_node=2,
                       seed=123,
                       n_streams=1,
                       n_estimators=num_trees,
                       handle=handle,
                       max_leaves=-1,
                       max_depth=40)
    cuml_model.fit(X_train, y_train)
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cuml_acc = accuracy_score(y_test, cu_preds)

    # Keyword arguments shared by the failing and succeeding GPU predict
    gpu_predict_args = dict(predict_model="GPU",
                            output_class=True,
                            threshold=0.5,
                            fil_sparse_format=fil_sparse_format,
                            algo=algo)

    expect_value_error = (not fil_sparse_format
                          or algo in ('tree_reorg', 'batch_tree_reorg')
                          or fil_sparse_format == 'not_supported')
    if expect_value_error:
        with pytest.raises(ValueError):
            cuml_model.predict(X_test, **gpu_predict_args)
        return

    fil_preds = cuml_model.predict(X_test, **gpu_predict_args)
    fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
    fil_acc = accuracy_score(y_test, fil_preds)

    # The standalone FIL model must reproduce the same accuracy
    fil_model = cuml_model.convert_to_fil_model()
    fil_model_preds = fil_model.predict(X_test, output_type='numpy')
    fil_model_acc = accuracy_score(y_test, fil_model_preds)
    assert fil_acc == fil_model_acc

    # The treelite conversion must keep the tree and feature counts;
    # the treelite model is dropped before the scikit-learn comparison
    tl_model = cuml_model.convert_to_treelite_model()
    assert num_trees == tl_model.num_trees
    assert ncols == tl_model.num_features
    del tl_model

    # Reference check against scikit-learn, only run when the dataset
    # has fewer than 500000 rows
    if nrows < 500000:
        sk_model = skrfc(n_estimators=50,
                         max_depth=40,
                         min_samples_split=2,
                         max_features=max_features,
                         random_state=10)
        sk_model.fit(X_train, y_train)
        sk_acc = accuracy_score(y_test, sk_model.predict(X_test))
        assert fil_acc >= (sk_acc - 0.07)

    assert fil_acc >= (cuml_acc - 0.02)