Пример #1
0
def test_mbsgd_regressor_vs_skl(lrate, penalty, make_dataset):
    """Check `cumlMBSGRegressor` against `SGDRegressor` on one dataset.

    Both estimators get the same learning-rate schedule, ``eta0``,
    iteration budget, tolerance and penalty.  Their R^2 scores on the test
    split must differ by at most 0.02.  Nothing is checked when the dataset
    has 500000 rows or more.
    """
    nrows, datatype, X_train, X_test, y_train, y_test = make_dataset

    # The comparison is only run on datasets below 500000 rows.
    if nrows >= 500000:
        return

    cu_mbsgd_regressor = cumlMBSGRegressor(learning_rate=lrate,
                                           eta0=0.005,
                                           epochs=100,
                                           fit_intercept=True,
                                           batch_size=2,
                                           tol=0.0,
                                           penalty=penalty)
    cu_mbsgd_regressor.fit(X_train, y_train)
    cu_pred = cu_mbsgd_regressor.predict(X_test)

    skl_sgd_regressor = SGDRegressor(learning_rate=lrate,
                                     eta0=0.005,
                                     max_iter=100,
                                     fit_intercept=True,
                                     tol=0.0,
                                     penalty=penalty,
                                     random_state=0)
    # The reference estimator is fed host copies of the arrays.
    skl_sgd_regressor.fit(cp.asnumpy(X_train), cp.asnumpy(y_train).ravel())
    skl_pred = skl_sgd_regressor.predict(cp.asnumpy(X_test))

    # r2_score expects the ground truth first and the predictions second;
    # R^2 is not symmetric in its arguments.
    y_test_host = cp.asnumpy(y_test)
    cu_r2 = r2_score(y_test_host, cp.asnumpy(cu_pred),
                     convert_dtype=datatype)
    skl_r2 = r2_score(y_test_host, skl_pred, convert_dtype=datatype)
    assert abs(cu_r2 - skl_r2) <= 0.02
Пример #2
0
def test_rf_regression_float64(large_reg, datatype):
    """Score a default `curfr` model when train and test dtypes may differ.

    ``datatype[0]`` is applied to the training split and ``datatype[1]`` to
    the test split.  The CPU-predict R^2 is compared with `skrfr` (only
    below 500000 rows) and the GPU-predict R^2 is compared with the
    CPU-predict R^2.
    """
    features, target = large_reg
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, train_size=0.8, random_state=0)

    train_dtype = datatype[0]
    test_dtype = datatype[1]
    X_train = X_train.astype(train_dtype)
    y_train = y_train.astype(train_dtype)
    X_test = X_test.astype(test_dtype)
    y_test = y_test.astype(test_dtype)

    # Fit the cuML random forest regressor with default parameters and
    # score its CPU predictions.
    rf_model = curfr()
    rf_model.fit(X_train, y_train)
    cpu_preds = rf_model.predict(X_test, predict_model="CPU")
    cpu_r2 = r2_score(y_test, cpu_preds, convert_dtype=train_dtype)

    # Reference random forest regressor; skipped on large inputs.
    if features.shape[0] < 500000:
        reference = skrfr(max_depth=16, random_state=10)
        reference.fit(X_train, y_train)
        reference_r2 = r2_score(y_test, reference.predict(X_test),
                                convert_dtype=train_dtype)
        assert cpu_r2 >= (reference_r2 - 0.09)

    # GPU based prediction, reshaped to match the CPU prediction shape.
    gpu_preds = rf_model.predict(X_test,
                                 predict_model="GPU",
                                 convert_dtype=True)
    gpu_preds = np.reshape(gpu_preds, np.shape(cpu_preds))
    gpu_r2 = r2_score(y_test, gpu_preds, convert_dtype=train_dtype)
    assert gpu_r2 >= (cpu_r2 - 0.02)
Пример #3
0
def test_elastic_net(dtype, alpha, algorithm, nrows, column_info, n_parts,
                     client, delayed):
    """Fit `ElasticNet` on generated regression data and check its R^2.

    The model is fitted and evaluated on the same data.  The required R^2
    is 0.96 when ``alpha == 0.2`` and 0.80 otherwise.
    """
    ncols, n_info = column_info

    X, y = make_regression(n_samples=nrows, n_features=ncols,
                           n_informative=n_info, n_parts=n_parts,
                           client=client, dtype=dtype)

    model = ElasticNet(alpha=np.array([alpha]), fit_intercept=True,
                       normalize=False, max_iter=1000,
                       selection=algorithm, tol=1e-10,
                       client=client)
    model.fit(X, y)
    y_hat = model.predict(X, delayed=delayed)

    score = r2_score(y.compute(), y_hat.compute())

    # based on differences with scikit-learn 0.22
    min_r2 = 0.96 if alpha == 0.2 else 0.80
    assert score >= min_r2
Пример #4
0
def test_mbsgd_regressor(datatype, lrate, input_type, penalty,
                         nrows, column_info):
    """Check `cumlMBSGRegressor` against `SGDRegressor` on generated data.

    Data comes from ``make_regression`` with an 80/20 train/test split.
    Both estimators get the same learning-rate schedule, ``eta0``,
    iteration budget, tolerance and penalty; their test-split R^2 scores
    must differ by at most 0.02.  ``input_type`` is accepted but not used
    by the body.
    """
    ncols, n_info = column_info
    X, y = make_regression(n_samples=nrows, n_features=ncols,
                           n_informative=n_info, random_state=0)
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=0)

    cu_mbsgd_regressor = cumlMBSGRegressor(learning_rate=lrate, eta0=0.005,
                                           epochs=100, fit_intercept=True,
                                           batch_size=2, tol=0.0,
                                           penalty=penalty)
    cu_mbsgd_regressor.fit(X_train, y_train)
    cu_pred = cu_mbsgd_regressor.predict(X_test).to_array()

    skl_sgd_regressor = SGDRegressor(learning_rate=lrate, eta0=0.005,
                                     max_iter=100, fit_intercept=True,
                                     tol=0.0, penalty=penalty,
                                     random_state=0)
    skl_sgd_regressor.fit(X_train, y_train)
    skl_pred = skl_sgd_regressor.predict(X_test)

    # r2_score expects the ground truth first and the predictions second;
    # R^2 is not symmetric in its arguments.
    cu_r2 = r2_score(y_test, cu_pred, convert_dtype=datatype)
    skl_r2 = r2_score(y_test, skl_pred, convert_dtype=datatype)
    assert abs(cu_r2 - skl_r2) <= 0.02
Пример #5
0
def test_lasso_default(datatype, nrows, column_info):
    """Compare default-constructed `cuLasso` and `Lasso` on generated data.

    The `cuLasso` R^2 on the test split may be at most 0.07 below the
    `Lasso` R^2.
    """
    ncols, n_info = column_info
    X, y = make_regression(n_samples=nrows, n_features=ncols,
                           n_informative=n_info, random_state=0)
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0)

    gpu_model = cuLasso()
    gpu_model.fit(X_train, y_train)
    assert gpu_model.coef_ is not None
    gpu_r2 = r2_score(y_test, gpu_model.predict(X_test))

    reference = Lasso()
    reference.fit(X_train, y_train)
    reference_r2 = r2_score(y_test, reference.predict(X_test))

    assert gpu_r2 >= reference_r2 - 0.07
Пример #6
0
def test_lasso(datatype, X_type, alpha, algorithm,
               nrows, column_info):
    """Compare `cuLasso` and `Lasso` configured with identical parameters.

    The `cuLasso` model is always fitted; the comparison against `Lasso`
    only runs below 500000 rows and allows the `cuLasso` R^2 to be at most
    0.07 lower.  ``X_type`` is accepted but not used by the body.
    """
    ncols, n_info = column_info
    X, y = make_regression(n_samples=nrows, n_features=ncols,
                           n_informative=n_info, random_state=0)
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0)

    gpu_model = cuLasso(alpha=np.array([alpha]), fit_intercept=True,
                        normalize=False, max_iter=1000,
                        selection=algorithm, tol=1e-10)
    gpu_model.fit(X_train, y_train)
    assert gpu_model.coef_ is not None
    gpu_r2 = r2_score(y_test, gpu_model.predict(X_test))

    # The reference fit is skipped on large inputs.
    if nrows >= 500000:
        return

    reference = Lasso(alpha=np.array([alpha]), fit_intercept=True,
                      normalize=False, max_iter=1000,
                      selection=algorithm, tol=1e-10)
    reference.fit(X_train, y_train)
    reference_r2 = r2_score(y_test, reference.predict(X_test))
    assert gpu_r2 >= reference_r2 - 0.07
Пример #7
0
def test_mbsgd_regressor_default(datatype, nrows,
                                 column_info):
    """Compare default `cumlMBSGRegressor` and `SGDRegressor` R^2 scores.

    Data comes from ``make_regression`` with an 80/20 train/test split.
    When the two scores differ by more than 0.02 the test is reported as
    an expected failure instead of a failure.
    """
    ncols, n_info = column_info
    X, y = make_regression(n_samples=nrows, n_features=ncols,
                           n_informative=n_info, random_state=0)
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=0)

    cu_mbsgd_regressor = cumlMBSGRegressor()
    cu_mbsgd_regressor.fit(X_train, y_train)
    cu_pred = cu_mbsgd_regressor.predict(X_test).to_array()

    skl_sgd_regressor = SGDRegressor()
    skl_sgd_regressor.fit(X_train, y_train)
    skl_pred = skl_sgd_regressor.predict(X_test)

    # r2_score expects the ground truth first and the predictions second;
    # R^2 is not symmetric in its arguments.
    cu_r2 = r2_score(y_test, cu_pred, convert_dtype=datatype)
    skl_r2 = r2_score(y_test, skl_pred, convert_dtype=datatype)

    # Explicit condition instead of catching AssertionError: an `assert`
    # is removed under `python -O`, which would silently disable the check.
    # `not (... <= ...)` keeps NaN scores on the xfail path.
    if not abs(cu_r2 - skl_r2) <= 0.02:
        pytest.xfail("failed due to AssertionError error, "
                     "fix will be merged soon")
Пример #8
0
def test_rf_regression_sparse(special_reg, datatype, fil_sparse_format, algo):
    """Exercise GPU (FIL) prediction of `curfr` with sparse-format options.

    Combinations expected to be rejected (a falsy or ``'not_supported'``
    ``fil_sparse_format``, or a ``'tree_reorg'`` / ``'batch_tree_reorg'``
    ``algo``) must raise ``ValueError``.  Otherwise the FIL predictions
    are checked against the converted FIL model, the treelite model's
    metadata and, below 1000 rows, against `skrfr`.
    """
    n_trees = 50

    X, y = special_reg
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0)

    # Create a handle for the cuml model
    handle, _stream = get_handle(True, n_streams=1)

    # Initialize and fit using cuML's random forest regression model
    rf_model = curfr(n_bins=16, split_criterion=2,
                     min_rows_per_node=2, random_state=123, n_streams=1,
                     n_estimators=n_trees, handle=handle, max_leaves=-1,
                     max_depth=40, accuracy_metric='mse')
    rf_model.fit(X_train, y_train)

    expect_rejection = (
        not fil_sparse_format
        or fil_sparse_format == 'not_supported'
        or algo == 'tree_reorg'
        or algo == 'batch_tree_reorg'
    )
    if expect_rejection:
        with pytest.raises(ValueError):
            rf_model.predict(X_test, predict_model="GPU",
                             fil_sparse_format=fil_sparse_format,
                             algo=algo)
        return

    # predict using FIL
    target_shape = np.shape(y_test)
    gpu_preds = rf_model.predict(X_test, predict_model="GPU",
                                 fil_sparse_format=fil_sparse_format,
                                 algo=algo)
    gpu_preds = np.reshape(gpu_preds, target_shape)
    gpu_r2 = r2_score(y_test, gpu_preds, convert_dtype=datatype)

    # The standalone FIL model must score exactly the same.
    fil_model = rf_model.convert_to_fil_model()
    standalone_preds = fil_model.predict(X_test, output_type='numpy')
    standalone_preds = np.reshape(standalone_preds, target_shape)
    standalone_r2 = r2_score(y_test, standalone_preds,
                             convert_dtype=datatype)
    assert gpu_r2 == standalone_r2

    tl_model = rf_model.convert_to_treelite_model()
    assert n_trees == tl_model.num_trees
    assert X.shape[1] == tl_model.num_features

    # Initialize, fit and predict using
    # sklearn's random forest regression model
    if X.shape[0] < 1000:  # mode != "stress":
        reference = skrfr(n_estimators=50, max_depth=40,
                          min_samples_split=2,
                          random_state=10)
        reference.fit(X_train, y_train)
        reference_r2 = r2_score(y_test, reference.predict(X_test),
                                convert_dtype=datatype)
        assert gpu_r2 >= (reference_r2 - 0.07)
Пример #9
0
def test_rf_regression(
    special_reg, datatype, max_features, max_samples, n_bins
):
    """Compare GPU (FIL) and CPU predictions of a configured `curfr` model.

    The FIL R^2 may be at most 0.02 below the CPU R^2 and, for inputs
    below 1000 rows, at most 0.07 below the `skrfr` R^2.
    """
    X, y = special_reg
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )

    # Create a handle for the cuml model
    handle, _stream = get_handle(True, n_streams=1)

    # Initialize and fit using cuML's random forest regression model
    rf_params = dict(
        max_features=max_features,
        max_samples=max_samples,
        n_bins=n_bins,
        split_criterion=2,
        min_samples_leaf=2,
        random_state=123,
        n_streams=1,
        n_estimators=50,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
        accuracy_metric="mse",
    )
    rf_model = curfr(**rf_params)
    rf_model.fit(X_train, y_train)

    # predict using FIL, then on the CPU; FIL output is reshaped to match
    gpu_preds = rf_model.predict(X_test, predict_model="GPU")
    cpu_preds = rf_model.predict(X_test, predict_model="CPU")
    gpu_preds = np.reshape(gpu_preds, np.shape(cpu_preds))

    cpu_r2 = r2_score(y_test, cpu_preds, convert_dtype=datatype)
    gpu_r2 = r2_score(y_test, gpu_preds, convert_dtype=datatype)

    # Initialize, fit and predict using
    # sklearn's random forest regression model
    if X.shape[0] < 1000:  # mode != "stress"
        reference = skrfr(
            n_estimators=50,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        reference.fit(X_train, y_train)
        reference_r2 = r2_score(
            y_test, reference.predict(X_test), convert_dtype=datatype
        )
        assert gpu_r2 >= (reference_r2 - 0.07)
    assert gpu_r2 >= (cpu_r2 - 0.02)
Пример #10
0
def test_rf_regression_float64(large_reg, datatype):
    """Score a default `curfr` model when train and test dtypes may differ.

    ``datatype[0]`` is applied to the training split and ``datatype[1]`` to
    the test split.  CPU predictions are compared with `skrfr` (below
    500000 rows).  GPU prediction is scored when the model was trained on
    ``np.float32``; otherwise, for ``np.float64`` test data, the test
    checks that GPU predict emits the fallback-to-CPU warning.
    """
    X, y = large_reg
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )
    X_train = X_train.astype(datatype[0])
    y_train = y_train.astype(datatype[0])
    X_test = X_test.astype(datatype[1])
    y_test = y_test.astype(datatype[1])

    # Initialize, fit and predict using cuML's
    # random forest regression model
    cuml_model = curfr()
    cuml_model.fit(X_train, y_train)
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype[0])

    # sklearn random forest regression model
    # initialization, fit and predict
    if X.shape[0] < 500000:
        sk_model = skrfr(max_depth=16, random_state=10)
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype[0])
        assert cu_r2 >= (sk_r2 - 0.09)

    # predict using cuML's GPU based prediction
    if datatype[0] == np.float32:
        fil_preds = cuml_model.predict(
            X_test, predict_model="GPU", convert_dtype=True
        )
        fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
        fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype[0])
        assert fil_r2 >= (cu_r2 - 0.02)

    # because datatype[0] != np.float32 or datatype[0] != datatype[1]
    # display warning when GPU-predict cannot be used and revert to CPU-predict
    elif datatype[1] == np.float64:
        expected_message = ("GPU based predict only accepts "
                            "np.float32 data. The model was "
                            "trained on np.float64 data hence "
                            "cannot use GPU-based prediction! "
                            "\nDefaulting to CPU-based Prediction. "
                            "\nTo predict on float-64 data, set "
                            "parameter predict_model = 'CPU'")
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")
            cuml_model.predict(X_test, predict_model="GPU")

        # Look through every recorded warning rather than only the last
        # one: an empty list would raise IndexError, and an unrelated
        # warning emitted afterwards would hide the expected one.
        assert any(expected_message in str(item.message)
                   for item in caught)
Пример #11
0
def test_mbsgd_regressor_default(make_dataset):
    """Compare default `cumlMBSGRegressor` and `SGDRegressor` R^2 scores.

    The `cumlMBSGRegressor` model is always fitted and scored; the
    comparison against `SGDRegressor` only runs below 500000 rows and
    requires the two scores to differ by at most 0.02.
    """
    nrows, datatype, X_train, X_test, y_train, y_test = make_dataset

    cu_mbsgd_regressor = cumlMBSGRegressor()
    cu_mbsgd_regressor.fit(X_train, y_train)
    cu_pred = cu_mbsgd_regressor.predict(X_test)
    # r2_score expects the ground truth first and the predictions second;
    # R^2 is not symmetric in its arguments.
    cu_r2 = r2_score(y_test, cu_pred, convert_dtype=datatype)

    if nrows < 500000:
        skl_sgd_regressor = SGDRegressor()
        skl_sgd_regressor.fit(X_train, y_train)
        skl_pred = skl_sgd_regressor.predict(X_test)
        skl_r2 = r2_score(y_test, skl_pred, convert_dtype=datatype)
        assert abs(cu_r2 - skl_r2) <= 0.02
Пример #12
0
def test_lasso_default(dtype, nrows, column_info, n_parts, cluster):
    """Fit a default `Lasso` through a `Client` and require R^2 >= 0.99.

    The model is fitted and evaluated on the same generated data.  The
    client is closed even when the test body fails.
    """
    client = Client(cluster)
    ncols, n_info = column_info

    try:

        # Forward n_parts so the requested partitioning is actually
        # exercised, matching the other dask-based tests.
        X, y = make_regression(n_samples=nrows,
                               n_features=ncols,
                               n_informative=n_info,
                               n_parts=n_parts,
                               client=client,
                               dtype=dtype)

        wait(X)

        lasso = Lasso(client=client)

        lasso.fit(X, y)

        y_hat = lasso.predict(X)

        assert r2_score(y.compute(), y_hat.compute()) >= 0.99

    finally:
        client.close()
Пример #13
0
def test_lasso(dtype, alpha, algorithm,
               nrows, column_info, n_parts, delayed, cluster):
    """Fit a configured `Lasso` through a `Client` and require R^2 >= 0.99.

    The model is fitted and evaluated on the same generated data.  The
    client is closed even when the test body fails.
    """
    client = Client(cluster)
    ncols, n_info = column_info

    try:
        X, y = make_regression(n_samples=nrows, n_features=ncols,
                               n_informative=n_info, n_parts=n_parts,
                               client=client, dtype=dtype)
        wait(X)

        model = Lasso(alpha=np.array([alpha]),
                      fit_intercept=True,
                      normalize=False,
                      max_iter=1000,
                      selection=algorithm,
                      tol=1e-10,
                      client=client)
        model.fit(X, y)
        y_hat = model.predict(X, delayed=delayed)

        score = r2_score(y.compute(), y_hat.compute())
        assert score >= 0.99
    finally:
        client.close()
Пример #14
0
def test_elastic_net_default(dtype, nrows, column_info, n_parts, cluster):
    """Fit a default `ElasticNet` through a `Client`; require R^2 >= 0.96.

    The model is fitted and evaluated on the same generated data.  The
    client is closed even when the test body fails.
    """
    client = Client(cluster)
    ncols, n_info = column_info

    try:
        X, y = make_regression(n_samples=nrows, n_features=ncols,
                               n_informative=n_info, n_parts=n_parts,
                               client=client, dtype=dtype)
        wait(X)

        model = ElasticNet(client=client)
        model.fit(X, y)
        y_hat = model.predict(X)

        score = r2_score(y.compute(), y_hat.compute())
        assert score >= 0.96
    finally:
        client.close()
Пример #15
0
def test_rf_regression_float64(datatype, column_info, nrows, convert_dtype):
    """Check `curfr` CPU/GPU prediction for mixed train/test dtypes.

    ``datatype[0]`` is applied to the training split and ``datatype[1]`` to
    the test split.  GPU prediction is scored only when the model was
    trained on ``np.float32`` and ``convert_dtype`` is truthy; in every
    other case GPU predict must raise ``TypeError``.
    """
    ncols, n_info = column_info
    X, y = make_regression(n_samples=nrows, n_features=ncols,
                           n_informative=n_info, random_state=123)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0)
    train_dtype = datatype[0]
    test_dtype = datatype[1]
    X_train = X_train.astype(train_dtype)
    y_train = y_train.astype(train_dtype)
    X_test = X_test.astype(test_dtype)
    y_test = y_test.astype(test_dtype)

    # Fit the cuML random forest regressor with default parameters and
    # score its CPU predictions.
    rf_model = curfr()
    rf_model.fit(X_train, y_train)
    cpu_preds = rf_model.predict(X_test, predict_model="CPU")
    cpu_r2 = r2_score(y_test, cpu_preds, convert_dtype=train_dtype)

    # Reference random forest regressor; skipped on large inputs.
    if nrows < 500000:
        reference = skrfr(max_depth=16, random_state=10)
        reference.fit(X_train, y_train)
        reference_r2 = r2_score(y_test, reference.predict(X_test),
                                convert_dtype=train_dtype)
        assert cpu_r2 >= (reference_r2 - 0.09)

    # predict using cuML's GPU based prediction
    gpu_predict_allowed = train_dtype == np.float32 and convert_dtype
    if not gpu_predict_allowed:
        with pytest.raises(TypeError):
            rf_model.predict(X_test,
                             predict_model="GPU",
                             convert_dtype=convert_dtype)
        return

    gpu_preds = rf_model.predict(X_test,
                                 predict_model="GPU",
                                 convert_dtype=convert_dtype)
    gpu_preds = np.reshape(gpu_preds, np.shape(cpu_preds))
    gpu_r2 = r2_score(y_test, gpu_preds, convert_dtype=train_dtype)
    assert gpu_r2 >= (cpu_r2 - 0.02)
Пример #16
0
def test_rf_regression_default(datatype, column_info, nrows):
    """Check a default `curfr` model's CPU, GPU (FIL) and score outputs.

    The model's ``score`` with GPU predict must match
    ``mean_squared_error`` of the FIL predictions.  The FIL R^2 may be at
    most 0.02 below the CPU R^2 and, below 500000 rows, at most 0.08
    below the `skrfr` R^2.
    """
    ncols, n_info = column_info
    X, y = make_regression(n_samples=nrows, n_features=ncols,
                           n_informative=n_info, random_state=123)
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0)

    # Fit the cuML random forest regressor with default parameters.
    rf_model = curfr()
    rf_model.fit(X_train, y_train)

    cpu_preds = rf_model.predict(X_test, predict_model="CPU")
    cpu_r2 = r2_score(y_test, cpu_preds, convert_dtype=datatype)

    # predict using FIL
    gpu_preds = rf_model.predict(X_test, predict_model="GPU")
    gpu_preds = np.reshape(gpu_preds, np.shape(cpu_preds))
    gpu_r2 = r2_score(y_test, gpu_preds, convert_dtype=datatype)

    # score function should be equivalent
    model_mse = rf_model.score(X_test, y_test, predict_model="GPU")
    expected_mse = mean_squared_error(y_test, gpu_preds)
    assert expected_mse == pytest.approx(model_mse)

    # Initialize, fit and predict using
    # sklearn's random forest regression model
    if nrows < 500000:
        reference = skrfr(max_depth=16, random_state=10)
        reference.fit(X_train, y_train)
        reference_r2 = r2_score(y_test, reference.predict(X_test),
                                convert_dtype=datatype)
        # XXX Accuracy gap exists with default parameters, requires
        # further investigation for next release
        assert gpu_r2 >= (reference_r2 - 0.08)

    assert gpu_r2 >= (cpu_r2 - 0.02)
Пример #17
0
def test_mbsgd_regressor_default(make_dataset):
    """Fit `cumlMBSGRegressor` with default settings; require R^2 > 0.9.

    Only ``batch_size`` is overridden: it is set to one hundredth of the
    dataset's row count.
    """
    nrows, datatype, X_train, X_test, y_train, y_test = make_dataset

    # Floor division keeps the batch size an integer.
    cu_mbsgd_regressor = cumlMBSGRegressor(batch_size=nrows // 100)
    cu_mbsgd_regressor.fit(X_train, y_train)
    cu_pred = cu_mbsgd_regressor.predict(X_test)
    # r2_score expects the ground truth first and the predictions second;
    # R^2 is not symmetric in its arguments.
    cu_r2 = r2_score(cp.asnumpy(y_test),
                     cp.asnumpy(cu_pred),
                     convert_dtype=datatype)

    assert cu_r2 > 0.9
Пример #18
0
def test_elastic_net_default(datatype, nrows, column_info):
    """Compare default `cuElasticNet` and `ElasticNet` on generated data.

    The `cuElasticNet` R^2 on the test split may be at most 0.07 below the
    `ElasticNet` R^2.
    """
    ncols, n_info = column_info
    X, y = make_regression(n_samples=nrows, n_features=ncols,
                           n_informative=n_info, random_state=0)
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0)

    gpu_model = cuElasticNet()
    gpu_model.fit(X_train, y_train)
    gpu_r2 = r2_score(y_test, gpu_model.predict(X_test))

    reference = ElasticNet()
    reference.fit(X_train, y_train)
    reference_r2 = r2_score(y_test, reference.predict(X_test))

    assert gpu_r2 >= reference_r2 - 0.07
Пример #19
0
def test_mbsgd_regressor_default(datatype, nrows,
                                 column_info):
    """Compare default `cumlMBSGRegressor` and `SGDRegressor` R^2 scores.

    Data comes from ``make_regression`` with an 80/20 train/test split.
    The comparison against `SGDRegressor` only runs below 500000 rows and
    requires the two scores to differ by at most 0.02.
    """
    ncols, n_info = column_info
    X, y = make_regression(n_samples=nrows, n_features=ncols,
                           n_informative=n_info, random_state=0)
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=0)

    cu_mbsgd_regressor = cumlMBSGRegressor()
    cu_mbsgd_regressor.fit(X_train, y_train)
    cu_pred = cu_mbsgd_regressor.predict(X_test).to_array()
    # r2_score expects the ground truth first and the predictions second;
    # R^2 is not symmetric in its arguments.
    cu_r2 = r2_score(y_test, cu_pred, convert_dtype=datatype)

    if nrows < 500000:
        skl_sgd_regressor = SGDRegressor()
        skl_sgd_regressor.fit(X_train, y_train)
        skl_pred = skl_sgd_regressor.predict(X_test)
        skl_r2 = r2_score(y_test, skl_pred, convert_dtype=datatype)
        assert abs(cu_r2 - skl_r2) <= 0.02
Пример #20
0
def test_mbsgd_regressor(lrate, penalty, make_dataset):
    """Fit a configured `cumlMBSGRegressor` and require R^2 >= 0.9.

    The batch size is one hundredth of the dataset's row count.
    """
    nrows, datatype, X_train, X_test, y_train, y_test = make_dataset

    # Floor division keeps the batch size an integer.
    cu_mbsgd_regressor = cumlMBSGRegressor(learning_rate=lrate,
                                           eta0=0.005,
                                           epochs=100,
                                           fit_intercept=True,
                                           batch_size=nrows // 100,
                                           tol=0.0,
                                           penalty=penalty)

    cu_mbsgd_regressor.fit(X_train, y_train)
    cu_pred = cu_mbsgd_regressor.predict(X_test)
    # r2_score expects the ground truth first and the predictions second;
    # R^2 is not symmetric in its arguments.
    cu_r2 = r2_score(y_test, cu_pred, convert_dtype=datatype)

    assert cu_r2 >= 0.9
Пример #21
0
 def _r2(y_true, y_pred):
     """Return ``r2_score`` evaluated for ``y_true`` and ``y_pred``."""
     score = r2_score(y_true=y_true, y_pred=y_pred)
     return score
Пример #22
0
def test_rf_regression_sparse(datatype, split_algo, mode, column_info,
                              max_features, rows_sample, fil_sparse_format,
                              algo):
    """Exercise GPU (FIL) prediction of `curfr` with sparse-format options.

    ``mode`` selects the data: ``'unit'`` generates 500 samples,
    ``'quality'`` loads the California housing data, anything else
    generates 100000 samples.  Combinations expected to be rejected (a
    falsy or ``'not_supported'`` ``fil_sparse_format``, or a
    ``'tree_reorg'`` / ``'batch_tree_reorg'`` ``algo``) must raise
    ``ValueError``.  Otherwise the FIL predictions are checked against the
    converted FIL model, the treelite model's metadata, the CPU
    predictions and, outside ``'stress'`` mode, against `skrfr`.
    """
    ncols, n_info = column_info
    use_handle = True
    num_trees = 50

    if mode == 'unit':
        X, y = make_regression(n_samples=500,
                               n_features=ncols,
                               n_informative=n_info,
                               random_state=123)

    elif mode == 'quality':
        X, y = fetch_california_housing(return_X_y=True)

    else:
        X, y = make_regression(n_samples=100000,
                               n_features=ncols,
                               n_informative=n_info,
                               random_state=123)
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.8,
                                                        random_state=0)

    # Create a handle for the cuml model
    handle, stream = get_handle(use_handle, n_streams=1)

    # Initialize and fit using cuML's random forest regression model
    cuml_model = curfr(max_features=max_features,
                       rows_sample=rows_sample,
                       n_bins=16,
                       split_algo=split_algo,
                       split_criterion=2,
                       min_rows_per_node=2,
                       seed=123,
                       n_streams=1,
                       n_estimators=num_trees,
                       handle=handle,
                       max_leaves=-1,
                       max_depth=40,
                       accuracy_metric='mse')
    cuml_model.fit(X_train, y_train)
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype)
    # predict using FIL
    if ((not fil_sparse_format or algo == 'tree_reorg'
         or algo == 'batch_tree_reorg')
            or fil_sparse_format == 'not_supported'):
        with pytest.raises(ValueError):
            cuml_model.predict(X_test,
                               predict_model="GPU",
                               fil_sparse_format=fil_sparse_format,
                               algo=algo)
    else:
        fil_preds = cuml_model.predict(X_test,
                                       predict_model="GPU",
                                       fil_sparse_format=fil_sparse_format,
                                       algo=algo)
        fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
        fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype)

        fil_model = cuml_model.convert_to_fil_model()

        input_type = 'numpy'
        fil_model_preds = fil_model.predict(X_test, output_type=input_type)
        fil_model_preds = np.reshape(fil_model_preds, np.shape(cu_preds))
        fil_model_r2 = r2_score(y_test,
                                fil_model_preds,
                                convert_dtype=datatype)
        assert fil_r2 == fil_model_r2

        tl_model = cuml_model.convert_to_treelite_model()
        assert num_trees == tl_model.num_trees
        # Compare against the data actually used for training: in
        # 'quality' mode the feature count comes from the loaded dataset,
        # not from column_info.
        assert X.shape[1] == tl_model.num_features
        del tl_model

        # Initialize, fit and predict using
        # sklearn's random forest regression model
        if mode != "stress":
            sk_model = skrfr(n_estimators=50,
                             max_depth=40,
                             min_samples_split=2,
                             max_features=max_features,
                             random_state=10)
            sk_model.fit(X_train, y_train)
            sk_preds = sk_model.predict(X_test)
            sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype)
            assert fil_r2 >= (sk_r2 - 0.07)
        assert fil_r2 >= (cu_r2 - 0.02)
Пример #23
0
def _calc_score_cuml(y_true, y_preds, y_proba=None, metrics=('accuracy',), task=const.TASK_BINARY, pos_label=1,
                     classes=None, average=None):
    """Evaluate the requested metrics with ``cu_metrics`` and return a dict.

    ``y_proba`` falls back to ``y_preds`` when it is ``None``.  Inputs of
    shape ``(n, 1)`` are flattened and all three arrays are passed through
    ``_to_dtype(..., 'float64')``.  For regression tasks, ``cudf.Series``
    inputs are replaced by their ``.values``.

    Each entry of ``metrics`` is either a callable, invoked as
    ``metric(y_true, y_preds)`` and stored under its ``__name__``, or a
    metric name matched case-insensitively and stored under the name as
    given.  Unknown names are logged as a warning and skipped.

    ``pos_label``, ``classes`` and ``average`` are accepted but not used by
    the body.
    """

    def _flatten_single_column(values):
        # Collapse an (n, 1) array to (n,); leave everything else alone.
        if len(values.shape) == 2 and values.shape[-1] == 1:
            return values.reshape(-1)
        return values

    if y_proba is None:
        y_proba = y_preds
    y_proba = _flatten_single_column(y_proba)
    y_preds = _flatten_single_column(y_preds)

    y_true = _to_dtype(y_true, 'float64')
    y_preds = _to_dtype(y_preds, 'float64')
    y_proba = _to_dtype(y_proba, 'float64')

    if task == const.TASK_REGRESSION:
        if isinstance(y_true, cudf.Series):
            y_true = y_true.values
        if isinstance(y_preds, cudf.Series):
            y_preds = y_preds.values
        if isinstance(y_proba, cudf.Series):
            y_proba = y_proba.values

    scores = {}
    for metric in metrics:
        if callable(metric):
            scores[metric.__name__] = metric(y_true, y_preds)
            continue

        name = metric.lower()
        if name == 'auc':
            # With a 2-D probability array only column 1 is scored.
            auc_input = y_proba[:, 1] if len(y_proba.shape) == 2 else y_proba
            value = cu_metrics.roc_auc_score(y_true, auc_input)
        elif name == 'accuracy':
            # NOTE(review): y_preds.shape is read near the top of the
            # function, so the None case looks unreachable here - confirm.
            value = 0 if y_preds is None else cu_metrics.accuracy_score(y_true, y_preds)
        elif name == 'mse':
            value = cu_metrics.mean_squared_error(y_true, y_preds)
        elif name == 'mae':
            value = cu_metrics.mean_absolute_error(y_true, y_preds)
        elif name == 'msle':
            value = cu_metrics.mean_squared_log_error(y_true, y_preds)
        elif name in {'rmse', 'rootmeansquarederror', 'root_mean_squared_error'}:
            value = cu_metrics.mean_squared_error(y_true, y_preds, squared=False)
        elif name == 'r2':
            value = cu_metrics.r2_score(y_true, y_preds)
        elif name in {'logloss', 'log_loss'}:
            value = cu_metrics.log_loss(y_true, y_proba)
        else:
            logger.warning(f'unknown metric: {metric}')
            continue

        # Convert cupy array results to a plain Python float.
        if isinstance(value, cp.ndarray):
            value = float(cp.asnumpy(value))
        scores[metric] = value
    return scores
Пример #24
0
def test_rf_regression(datatype, split_algo, mode, column_info, max_features,
                       rows_sample):
    """Compare GPU (FIL) and CPU predictions of a configured `curfr` model.

    ``mode`` selects the data: ``'unit'`` generates 500 samples,
    ``'quality'`` loads the California housing data, anything else
    generates 100000 samples.  The FIL R^2 may be at most 0.02 below the
    CPU R^2 and, outside ``'stress'`` mode, at most 0.07 below the `skrfr`
    R^2.
    """
    ncols, n_info = column_info
    use_handle = True

    if mode == 'unit':
        X, y = make_regression(n_samples=500,
                               n_features=ncols,
                               n_informative=n_info,
                               random_state=123)

    elif mode == 'quality':
        X, y = fetch_california_housing(return_X_y=True)

    else:
        X, y = make_regression(n_samples=100000,
                               n_features=ncols,
                               n_informative=n_info,
                               random_state=123)
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.8,
                                                        random_state=0)

    # Create a handle for the cuml model
    handle, stream = get_handle(use_handle, n_streams=1)

    # Initialize and fit using cuML's random forest regression model
    cuml_model = curfr(max_features=max_features,
                       rows_sample=rows_sample,
                       n_bins=16,
                       split_algo=split_algo,
                       split_criterion=2,
                       min_rows_per_node=2,
                       seed=123,
                       n_streams=1,
                       n_estimators=50,
                       handle=handle,
                       max_leaves=-1,
                       max_depth=16,
                       accuracy_metric='mse')
    cuml_model.fit(X_train, y_train)
    # predict using FIL
    fil_preds = cuml_model.predict(X_test, predict_model="GPU")
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    # Give the FIL output the same shape as the CPU predictions before
    # scoring, so both scores are computed on identically shaped arrays.
    fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype)
    fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype)
    # Initialize, fit and predict using
    # sklearn's random forest regression model
    if mode != "stress":
        sk_model = skrfr(n_estimators=50,
                         max_depth=16,
                         min_samples_split=2,
                         max_features=max_features,
                         random_state=10)
        sk_model.fit(X_train, y_train)
        sk_predict = sk_model.predict(X_test)
        sk_r2 = r2_score(y_test, sk_predict, convert_dtype=datatype)
        assert fil_r2 >= (sk_r2 - 0.07)
    assert fil_r2 >= (cu_r2 - 0.02)