def test_mbsgd_regressor_vs_skl(lrate, penalty, make_dataset):
    """Compare cuML's mini-batch SGD regressor R^2 against sklearn's
    SGDRegressor on the shared fixture dataset (skipped for huge inputs)."""
    nrows, datatype, X_train, X_test, y_train, y_test = make_dataset

    # Only run the comparison on moderately sized datasets.
    if nrows < 500000:
        cu_model = cumlMBSGRegressor(learning_rate=lrate, eta0=0.005,
                                     epochs=100, fit_intercept=True,
                                     batch_size=2, tol=0.0, penalty=penalty)
        cu_model.fit(X_train, y_train)
        cu_pred = cu_model.predict(X_test)
        cu_r2 = r2_score(cp.asnumpy(cu_pred), cp.asnumpy(y_test),
                         convert_dtype=datatype)

        skl_model = SGDRegressor(learning_rate=lrate, eta0=0.005,
                                 max_iter=100, fit_intercept=True,
                                 tol=0.0, penalty=penalty, random_state=0)
        skl_model.fit(cp.asnumpy(X_train), cp.asnumpy(y_train).ravel())
        skl_pred = skl_model.predict(cp.asnumpy(X_test))
        skl_r2 = r2_score(skl_pred, cp.asnumpy(y_test),
                          convert_dtype=datatype)

        # The two implementations should agree closely.
        assert abs(cu_r2 - skl_r2) <= 0.02
def test_rf_regression_float64(large_reg, datatype):
    """Train cuML RF regression with train/test dtypes from *datatype*,
    compare CPU-predict R^2 to sklearn, then check GPU (FIL) predict."""
    X, y = large_reg
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=0)
    X_train = X_train.astype(datatype[0])
    y_train = y_train.astype(datatype[0])
    X_test = X_test.astype(datatype[1])
    y_test = y_test.astype(datatype[1])

    # Initialize, fit and predict using cuML's
    # random forest classification model
    cuml_model = curfr()
    cuml_model.fit(X_train, y_train)
    cpu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cpu_preds, convert_dtype=datatype[0])

    # sklearn random forest classification model
    # initialization, fit and predict
    if X.shape[0] < 500000:
        sk_model = skrfr(max_depth=16, random_state=10)
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype[0])
        assert cu_r2 >= (sk_r2 - 0.09)

    # predict using cuML's GPU based prediction
    gpu_preds = cuml_model.predict(X_test, predict_model="GPU",
                                   convert_dtype=True)
    gpu_preds = np.reshape(gpu_preds, np.shape(cpu_preds))
    fil_r2 = r2_score(y_test, gpu_preds, convert_dtype=datatype[0])
    assert fil_r2 >= (cu_r2 - 0.02)
def test_elastic_net(dtype, alpha, algorithm, nrows, column_info, n_parts,
                     client, delayed):
    """Fit dask cuML ElasticNet and check R^2 against fixed thresholds."""
    ncols, n_info = column_info
    X, y = make_regression(n_samples=nrows, n_features=ncols,
                           n_informative=n_info, n_parts=n_parts,
                           client=client, dtype=dtype)

    model = ElasticNet(alpha=np.array([alpha]), fit_intercept=True,
                       normalize=False, max_iter=1000,
                       selection=algorithm, tol=1e-10, client=client)
    model.fit(X, y)
    y_hat = model.predict(X, delayed=delayed)

    # based on differences with scikit-learn 0.22
    threshold = 0.96 if alpha == 0.2 else 0.80
    assert r2_score(y.compute(), y_hat.compute()) >= threshold
def test_mbsgd_regressor(datatype, lrate, input_type, penalty, nrows,
                         column_info):
    """cuML MBSGD regressor vs sklearn SGDRegressor on synthetic data."""
    ncols, n_info = column_info
    X, y = make_regression(n_samples=nrows, n_features=ncols,
                           n_informative=n_info, random_state=0)
    X, y = X.astype(datatype), y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=0)

    cu_model = cumlMBSGRegressor(learning_rate=lrate, eta0=0.005, epochs=100,
                                 fit_intercept=True, batch_size=2, tol=0.0,
                                 penalty=penalty)
    cu_model.fit(X_train, y_train)
    cu_pred = cu_model.predict(X_test).to_array()

    skl_model = SGDRegressor(learning_rate=lrate, eta0=0.005, max_iter=100,
                             fit_intercept=True, tol=0.0, penalty=penalty,
                             random_state=0)
    skl_model.fit(X_train, y_train)
    skl_pred = skl_model.predict(X_test)

    cu_r2 = r2_score(cu_pred, y_test, convert_dtype=datatype)
    skl_r2 = r2_score(skl_pred, y_test, convert_dtype=datatype)
    assert abs(cu_r2 - skl_r2) <= 0.02
def test_lasso_default(datatype, nrows, column_info):
    """Default-parameter cuML Lasso should be close to sklearn's Lasso."""
    ncols, n_info = column_info
    X, y = make_regression(n_samples=nrows, n_features=ncols,
                           n_informative=n_info, random_state=0)
    X, y = X.astype(datatype), y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=0)

    cu_model = cuLasso()
    cu_model.fit(X_train, y_train)
    assert cu_model.coef_ is not None
    cu_r2 = r2_score(y_test, cu_model.predict(X_test))

    sk_model = Lasso()
    sk_model.fit(X_train, y_train)
    sk_r2 = r2_score(y_test, sk_model.predict(X_test))

    assert cu_r2 >= sk_r2 - 0.07
def test_lasso(datatype, X_type, alpha, algorithm, nrows, column_info):
    """Parameterized cuML Lasso vs sklearn Lasso with matched settings."""
    ncols, n_info = column_info
    X, y = make_regression(n_samples=nrows, n_features=ncols,
                           n_informative=n_info, random_state=0)
    X, y = X.astype(datatype), y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=0)

    cu_model = cuLasso(alpha=np.array([alpha]), fit_intercept=True,
                       normalize=False, max_iter=1000,
                       selection=algorithm, tol=1e-10)
    cu_model.fit(X_train, y_train)
    assert cu_model.coef_ is not None
    cu_r2 = r2_score(y_test, cu_model.predict(X_test))

    # Skip the sklearn comparison for very large inputs.
    if nrows < 500000:
        sk_model = Lasso(alpha=np.array([alpha]), fit_intercept=True,
                         normalize=False, max_iter=1000,
                         selection=algorithm, tol=1e-10)
        sk_model.fit(X_train, y_train)
        sk_r2 = r2_score(y_test, sk_model.predict(X_test))
        assert cu_r2 >= sk_r2 - 0.07
def test_mbsgd_regressor_default(datatype, nrows, column_info):
    """Default cuML MBSGD regressor vs default sklearn SGDRegressor;
    a known accuracy gap is tolerated via xfail."""
    ncols, n_info = column_info
    X, y = make_regression(n_samples=nrows, n_features=ncols,
                           n_informative=n_info, random_state=0)
    X, y = X.astype(datatype), y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=0)

    cu_model = cumlMBSGRegressor()
    cu_model.fit(X_train, y_train)
    cu_pred = cu_model.predict(X_test).to_array()

    skl_model = SGDRegressor()
    skl_model.fit(X_train, y_train)
    skl_pred = skl_model.predict(X_test)

    cu_r2 = r2_score(cu_pred, y_test, convert_dtype=datatype)
    skl_r2 = r2_score(skl_pred, y_test, convert_dtype=datatype)
    try:
        assert abs(cu_r2 - skl_r2) <= 0.02
    except AssertionError:
        # Known flaky comparison; treat as expected failure for now.
        pytest.xfail("failed due to AssertionError error, "
                     "fix will be merged soon")
def test_rf_regression_sparse(special_reg, datatype, fil_sparse_format, algo):
    """RF regression with FIL predict across sparse/dense storage formats;
    unsupported format/algo combinations must raise ValueError."""
    use_handle = True
    num_trees = 50

    X, y = special_reg
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=0)

    # Create a handle for the cuml model
    handle, stream = get_handle(use_handle, n_streams=1)

    # Initialize and fit using cuML's random forest regression model
    cuml_model = curfr(n_bins=16, split_criterion=2,
                       min_rows_per_node=2, random_state=123, n_streams=1,
                       n_estimators=num_trees, handle=handle, max_leaves=-1,
                       max_depth=40, accuracy_metric='mse')
    cuml_model.fit(X_train, y_train)

    # predict using FIL; dense-only algos and unsupported formats must fail
    unsupported = (not fil_sparse_format
                   or algo in ('tree_reorg', 'batch_tree_reorg')
                   or fil_sparse_format == 'not_supported')
    if unsupported:
        with pytest.raises(ValueError):
            cuml_model.predict(X_test, predict_model="GPU",
                               fil_sparse_format=fil_sparse_format,
                               algo=algo)
    else:
        fil_preds = cuml_model.predict(X_test, predict_model="GPU",
                                       fil_sparse_format=fil_sparse_format,
                                       algo=algo)
        fil_preds = np.reshape(fil_preds, np.shape(y_test))
        fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype)

        # A standalone FIL model must agree exactly with the RF's GPU predict.
        fil_model = cuml_model.convert_to_fil_model()
        fil_model_preds = fil_model.predict(X_test, output_type='numpy')
        fil_model_preds = np.reshape(fil_model_preds, np.shape(y_test))
        fil_model_r2 = r2_score(y_test, fil_model_preds,
                                convert_dtype=datatype)
        assert fil_r2 == fil_model_r2

        tl_model = cuml_model.convert_to_treelite_model()
        assert num_trees == tl_model.num_trees
        assert X.shape[1] == tl_model.num_features

        # Initialize, fit and predict using
        # sklearn's random forest regression model
        if X.shape[0] < 1000:  # mode != "stress":
            sk_model = skrfr(n_estimators=50, max_depth=40,
                             min_samples_split=2, random_state=10)
            sk_model.fit(X_train, y_train)
            sk_preds = sk_model.predict(X_test)
            sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype)
            assert fil_r2 >= (sk_r2 - 0.07)
def test_rf_regression(special_reg, datatype, max_features, max_samples,
                       n_bins):
    """cuML RF regression (FIL + CPU predict) vs sklearn with varying
    sampling parameters."""
    use_handle = True
    X, y = special_reg
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=0)

    # Create a handle for the cuml model
    handle, stream = get_handle(use_handle, n_streams=1)

    # Initialize and fit using cuML's random forest regression model
    cuml_model = curfr(max_features=max_features, max_samples=max_samples,
                       n_bins=n_bins, split_criterion=2, min_samples_leaf=2,
                       random_state=123, n_streams=1, n_estimators=50,
                       handle=handle, max_leaves=-1, max_depth=16,
                       accuracy_metric="mse")
    cuml_model.fit(X_train, y_train)

    # predict using FIL
    fil_preds = cuml_model.predict(X_test, predict_model="GPU")
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype)
    fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype)

    # Initialize, fit and predict using
    # sklearn's random forest regression model
    if X.shape[0] < 1000:  # mode != "stress"
        sk_model = skrfr(n_estimators=50, max_depth=16, min_samples_split=2,
                         max_features=max_features, random_state=10)
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype)
        assert fil_r2 >= (sk_r2 - 0.07)
    assert fil_r2 >= (cu_r2 - 0.02)
def test_rf_regression_float64(large_reg, datatype):
    """Mixed train/test dtypes for cuML RF regression: float32 models may
    GPU-predict; float64 models must fall back to CPU with a warning."""
    X, y = large_reg
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=0)
    X_train = X_train.astype(datatype[0])
    y_train = y_train.astype(datatype[0])
    X_test = X_test.astype(datatype[1])
    y_test = y_test.astype(datatype[1])

    # Initialize, fit and predict using cuML's
    # random forest classification model
    cuml_model = curfr()
    cuml_model.fit(X_train, y_train)
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype[0])

    # sklearn random forest classification model
    # initialization, fit and predict
    if X.shape[0] < 500000:
        sk_model = skrfr(max_depth=16, random_state=10)
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype[0])
        assert cu_r2 >= (sk_r2 - 0.09)

    # predict using cuML's GPU based prediction
    if datatype[0] == np.float32:
        fil_preds = cuml_model.predict(X_test, predict_model="GPU",
                                       convert_dtype=True)
        fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
        fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype[0])
        assert fil_r2 >= (cu_r2 - 0.02)
    # because datatype[0] != np.float32 or datatype[0] != datatype[1]
    # display warning when GPU-predict cannot be used and revert to CPU-predict
    elif datatype[1] == np.float64:
        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter("always")
            cuml_model.predict(X_test, predict_model="GPU")
            assert("GPU based predict only accepts "
                   "np.float32 data. The model was "
                   "trained on np.float64 data hence "
                   "cannot use GPU-based prediction! "
                   "\nDefaulting to CPU-based Prediction. "
                   "\nTo predict on float-64 data, set "
                   "parameter predict_model = 'CPU'"
                   in str(w[-1].message))
def test_mbsgd_regressor_default(make_dataset):
    """Default cuML MBSGD regressor vs default sklearn SGDRegressor."""
    nrows, datatype, X_train, X_test, y_train, y_test = make_dataset

    cu_model = cumlMBSGRegressor()
    cu_model.fit(X_train, y_train)
    cu_pred = cu_model.predict(X_test)
    cu_r2 = r2_score(cu_pred, y_test, convert_dtype=datatype)

    # Only compare with sklearn on moderately sized datasets.
    if nrows < 500000:
        skl_model = SGDRegressor()
        skl_model.fit(X_train, y_train)
        skl_pred = skl_model.predict(X_test)
        skl_r2 = r2_score(skl_pred, y_test, convert_dtype=datatype)
        assert abs(cu_r2 - skl_r2) <= 0.02
def test_lasso_default(dtype, nrows, column_info, n_parts, cluster):
    """Dask cuML Lasso with default parameters should fit almost exactly."""
    client = Client(cluster)
    ncols, n_info = column_info
    try:
        X, y = make_regression(n_samples=nrows, n_features=ncols,
                               n_informative=n_info, client=client,
                               dtype=dtype)
        wait(X)

        model = Lasso(client=client)
        model.fit(X, y)
        y_hat = model.predict(X)

        assert r2_score(y.compute(), y_hat.compute()) >= 0.99
    finally:
        # Always release the client, even if the assertion fails.
        client.close()
def test_lasso(dtype, alpha, algorithm, nrows, column_info, n_parts,
               delayed, cluster):
    """Parameterized dask cuML Lasso should fit the training data near
    perfectly (R^2 >= 0.99)."""
    client = Client(cluster)
    ncols, n_info = column_info
    try:
        X, y = make_regression(n_samples=nrows, n_features=ncols,
                               n_informative=n_info, n_parts=n_parts,
                               client=client, dtype=dtype)
        wait(X)

        model = Lasso(alpha=np.array([alpha]), fit_intercept=True,
                      normalize=False, max_iter=1000,
                      selection=algorithm, tol=1e-10, client=client)
        model.fit(X, y)
        y_hat = model.predict(X, delayed=delayed)

        assert r2_score(y.compute(), y_hat.compute()) >= 0.99
    finally:
        # Always release the client, even if the assertion fails.
        client.close()
def test_elastic_net_default(dtype, nrows, column_info, n_parts, cluster):
    """Dask cuML ElasticNet with default parameters reaches R^2 >= 0.96."""
    client = Client(cluster)
    ncols, n_info = column_info
    try:
        X, y = make_regression(n_samples=nrows, n_features=ncols,
                               n_informative=n_info, n_parts=n_parts,
                               client=client, dtype=dtype)
        wait(X)

        model = ElasticNet(client=client)
        model.fit(X, y)
        y_hat = model.predict(X)

        assert r2_score(y.compute(), y_hat.compute()) >= 0.96
    finally:
        # Always release the client, even if the assertion fails.
        client.close()
def test_rf_regression_float64(datatype, column_info, nrows, convert_dtype):
    """Mixed-precision RF regression: GPU predict works only for float32
    models (or with convert_dtype); otherwise a TypeError is expected."""
    ncols, n_info = column_info
    X, y = make_regression(n_samples=nrows, n_features=ncols,
                           n_informative=n_info, random_state=123)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=0)
    X_train = X_train.astype(datatype[0])
    y_train = y_train.astype(datatype[0])
    X_test = X_test.astype(datatype[1])
    y_test = y_test.astype(datatype[1])

    # Initialize, fit and predict using cuML's
    # random forest classification model
    cuml_model = curfr()
    cuml_model.fit(X_train, y_train)
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype[0])

    # sklearn random forest classification model
    # initialization, fit and predict
    if nrows < 500000:
        sk_model = skrfr(max_depth=16, random_state=10)
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype[0])
        assert cu_r2 >= (sk_r2 - 0.09)

    # predict using cuML's GPU based prediction
    if datatype[0] == np.float32 and convert_dtype:
        fil_preds = cuml_model.predict(X_test, predict_model="GPU",
                                       convert_dtype=convert_dtype)
        fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
        fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype[0])
        assert fil_r2 >= (cu_r2 - 0.02)
    else:
        # Incompatible dtype combination must be rejected.
        with pytest.raises(TypeError):
            cuml_model.predict(X_test, predict_model="GPU",
                               convert_dtype=convert_dtype)
def test_rf_regression_default(datatype, column_info, nrows):
    """Default-parameter cuML RF regression: CPU vs FIL predict, and the
    score() method must match mean_squared_error on FIL predictions."""
    ncols, n_info = column_info
    X, y = make_regression(n_samples=nrows, n_features=ncols,
                           n_informative=n_info, random_state=123)
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=0)

    # Initialize, fit and predict using cuML's
    # random forest classification model
    cuml_model = curfr()
    cuml_model.fit(X_train, y_train)
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype)

    # predict using FIL
    fil_preds = cuml_model.predict(X_test, predict_model="GPU")
    fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
    fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype)

    # score function should be equivalent
    score_mse = cuml_model.score(X_test, y_test, predict_model="GPU")
    sk_mse = mean_squared_error(y_test, fil_preds)
    assert sk_mse == pytest.approx(score_mse)

    # Initialize, fit and predict using
    # sklearn's random forest regression model
    if nrows < 500000:
        sk_model = skrfr(max_depth=16, random_state=10)
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype)
        # XXX Accuracy gap exists with default parameters, requires
        # further investigation for next release
        assert fil_r2 >= (sk_r2 - 0.08)
    assert fil_r2 >= (cu_r2 - 0.02)
def test_mbsgd_regressor_default(make_dataset):
    """Default-ish MBSGD regressor; batch size scales with dataset size.

    Fix: the batch size was computed with true division (``nrows / 100``),
    which produces a float in Python 3. A batch size is a count of samples
    and must be an integer, so use floor division instead.
    """
    nrows, datatype, X_train, X_test, y_train, y_test = make_dataset
    cu_mbsgd_regressor = cumlMBSGRegressor(batch_size=nrows // 100)
    cu_mbsgd_regressor.fit(X_train, y_train)
    cu_pred = cu_mbsgd_regressor.predict(X_test)
    cu_r2 = r2_score(cp.asnumpy(cu_pred), cp.asnumpy(y_test),
                     convert_dtype=datatype)
    assert cu_r2 > 0.9
def test_elastic_net_default(datatype, nrows, column_info):
    """Default cuML ElasticNet should be close to sklearn's ElasticNet."""
    ncols, n_info = column_info
    X, y = make_regression(n_samples=nrows, n_features=ncols,
                           n_informative=n_info, random_state=0)
    X, y = X.astype(datatype), y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=0)

    cu_model = cuElasticNet()
    cu_model.fit(X_train, y_train)
    cu_r2 = r2_score(y_test, cu_model.predict(X_test))

    sk_model = ElasticNet()
    sk_model.fit(X_train, y_train)
    sk_r2 = r2_score(y_test, sk_model.predict(X_test))

    assert cu_r2 >= sk_r2 - 0.07
def test_mbsgd_regressor_default(datatype, nrows, column_info):
    """Default cuML MBSGD regressor vs default sklearn SGDRegressor."""
    ncols, n_info = column_info
    X, y = make_regression(n_samples=nrows, n_features=ncols,
                           n_informative=n_info, random_state=0)
    X, y = X.astype(datatype), y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=0)

    cu_model = cumlMBSGRegressor()
    cu_model.fit(X_train, y_train)
    cu_pred = cu_model.predict(X_test).to_array()
    cu_r2 = r2_score(cu_pred, y_test, convert_dtype=datatype)

    # Only compare with sklearn on moderately sized datasets.
    if nrows < 500000:
        skl_model = SGDRegressor()
        skl_model.fit(X_train, y_train)
        skl_pred = skl_model.predict(X_test)
        skl_r2 = r2_score(skl_pred, y_test, convert_dtype=datatype)
        assert abs(cu_r2 - skl_r2) <= 0.02
def test_mbsgd_regressor(lrate, penalty, make_dataset):
    """MBSGD regressor should reach R^2 >= 0.9 on the fixture dataset.

    Fix: the batch size was computed with true division (``nrows / 100``),
    which produces a float in Python 3. A batch size is a count of samples
    and must be an integer, so use floor division instead.
    """
    nrows, datatype, X_train, X_test, y_train, y_test = make_dataset
    cu_mbsgd_regressor = cumlMBSGRegressor(learning_rate=lrate, eta0=0.005,
                                           epochs=100, fit_intercept=True,
                                           batch_size=nrows // 100,
                                           tol=0.0, penalty=penalty)
    cu_mbsgd_regressor.fit(X_train, y_train)
    cu_pred = cu_mbsgd_regressor.predict(X_test)
    cu_r2 = r2_score(cu_pred, y_test, convert_dtype=datatype)
    assert cu_r2 >= 0.9
def _r2(y_true, y_pred):
    """Thin wrapper delegating to r2_score with explicit argument roles."""
    score = r2_score(y_true=y_true, y_pred=y_pred)
    return score
def test_rf_regression_sparse(datatype, split_algo, mode, column_info,
                              max_features, rows_sample, fil_sparse_format,
                              algo):
    """RF regression across unit/quality/stress modes with sparse FIL;
    unsupported format/algo combinations must raise ValueError."""
    ncols, n_info = column_info
    use_handle = True
    num_trees = 50

    # Build the dataset appropriate to the test mode.
    if mode == 'unit':
        X, y = make_regression(n_samples=500, n_features=ncols,
                               n_informative=n_info, random_state=123)
    elif mode == 'quality':
        X, y = fetch_california_housing(return_X_y=True)
    else:
        X, y = make_regression(n_samples=100000, n_features=ncols,
                               n_informative=n_info, random_state=123)

    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=0)

    # Create a handle for the cuml model
    handle, stream = get_handle(use_handle, n_streams=1)

    # Initialize and fit using cuML's random forest regression model
    cuml_model = curfr(max_features=max_features, rows_sample=rows_sample,
                       n_bins=16, split_algo=split_algo, split_criterion=2,
                       min_rows_per_node=2, seed=123, n_streams=1,
                       n_estimators=num_trees, handle=handle, max_leaves=-1,
                       max_depth=40, accuracy_metric='mse')
    cuml_model.fit(X_train, y_train)
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype)

    # predict using FIL; dense-only algos and unsupported formats must fail
    unsupported = (not fil_sparse_format
                   or algo in ('tree_reorg', 'batch_tree_reorg')
                   or fil_sparse_format == 'not_supported')
    if unsupported:
        with pytest.raises(ValueError):
            cuml_model.predict(X_test, predict_model="GPU",
                               fil_sparse_format=fil_sparse_format,
                               algo=algo)
    else:
        fil_preds = cuml_model.predict(X_test, predict_model="GPU",
                                       fil_sparse_format=fil_sparse_format,
                                       algo=algo)
        fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
        fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype)

        # A standalone FIL model must agree exactly with the RF's GPU predict.
        fil_model = cuml_model.convert_to_fil_model()
        fil_model_preds = fil_model.predict(X_test, output_type='numpy')
        fil_model_preds = np.reshape(fil_model_preds, np.shape(cu_preds))
        fil_model_r2 = r2_score(y_test, fil_model_preds,
                                convert_dtype=datatype)
        assert fil_r2 == fil_model_r2

        tl_model = cuml_model.convert_to_treelite_model()
        assert num_trees == tl_model.num_trees
        assert ncols == tl_model.num_features
        del tl_model

        # Initialize, fit and predict using
        # sklearn's random forest regression model
        if mode != "stress":
            sk_model = skrfr(n_estimators=50, max_depth=40,
                             min_samples_split=2, max_features=max_features,
                             random_state=10)
            sk_model.fit(X_train, y_train)
            sk_preds = sk_model.predict(X_test)
            sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype)
            assert fil_r2 >= (sk_r2 - 0.07)
        assert fil_r2 >= (cu_r2 - 0.02)
def _calc_score_cuml(y_true, y_preds, y_proba=None, metrics=('accuracy',), task=const.TASK_BINARY, pos_label=1, classes=None, average=None):
    """Compute the requested *metrics* with cuML metric functions.

    Parameters
    ----------
    y_true :
        Ground-truth labels or targets (converted to float64 below).
    y_preds :
        Predicted labels/values; an (n, 1) array is flattened to 1-D.
    y_proba : optional
        Predicted probabilities; defaults to ``y_preds`` when omitted.
        An (n, 1) array is flattened to 1-D.
    metrics : sequence
        Metric names (case-insensitive strings) or callables taking
        ``(y_true, y_preds)``. Unknown names are logged and skipped.
    task :
        Task constant; for regression, cudf Series inputs are unwrapped
        to their underlying values before scoring.
    pos_label, classes, average :
        Currently unused — kept for interface compatibility (see the
        commented-out multiclass/recall branches below).

    Returns
    -------
    dict
        Mapping of metric name (or callable ``__name__``) to its score.
    """
    if y_proba is None:
        y_proba = y_preds
    # Flatten single-column 2-D arrays to 1-D.
    if len(y_proba.shape) == 2 and y_proba.shape[-1] == 1:
        y_proba = y_proba.reshape(-1)
    if len(y_preds.shape) == 2 and y_preds.shape[-1] == 1:
        y_preds = y_preds.reshape(-1)

    # Normalize all inputs to float64 before scoring.
    y_true = _to_dtype(y_true, 'float64')
    y_preds = _to_dtype(y_preds, 'float64')
    y_proba = _to_dtype(y_proba, 'float64')

    if task == const.TASK_REGRESSION:
        # cuML regression metrics expect raw arrays, not cudf Series.
        if isinstance(y_true, cudf.Series):
            y_true = y_true.values
        if isinstance(y_preds, cudf.Series):
            y_preds = y_preds.values
        if isinstance(y_proba, cudf.Series):
            y_proba = y_proba.values

    scores = {}
    for metric in metrics:
        if callable(metric):
            # Custom metric: keyed by the callable's name.
            scores[metric.__name__] = metric(y_true, y_preds)
        else:
            metric_lower = metric.lower()
            if metric_lower == 'auc':
                if len(y_proba.shape) == 2:
                    # NOTE(review): multiclass AUC is disabled; only the
                    # positive-class column is used — confirm intended.
                    # if task == const.TASK_MULTICLASS:
                    #     s = cu_metrics.roc_auc_score(y_true, y_proba, multi_class='ovo', labels=classes)
                    # else:
                    #     s = cu_metrics.roc_auc_score(y_true, y_proba[:, 1])
                    s = cu_metrics.roc_auc_score(y_true, y_proba[:, 1])
                else:
                    s = cu_metrics.roc_auc_score(y_true, y_proba)
            elif metric_lower == 'accuracy':
                # NOTE(review): y_preds cannot be None here (its .shape was
                # read above), so this guard looks unreachable — confirm.
                if y_preds is None:
                    s = 0
                else:
                    s = cu_metrics.accuracy_score(y_true, y_preds)
            # elif metric_lower == 'recall':
            #     s = cu_metrics.recall_score(y_true, y_preds, **recall_options)
            # elif metric_lower == 'precision':
            #     s = cu_metrics.precision_score(y_true, y_preds, **recall_options)
            # elif metric_lower == 'f1':
            #     s = cu_metrics.f1_score(y_true, y_preds, **recall_options)
            elif metric_lower == 'mse':
                s = cu_metrics.mean_squared_error(y_true, y_preds)
            elif metric_lower == 'mae':
                s = cu_metrics.mean_absolute_error(y_true, y_preds)
            elif metric_lower == 'msle':
                s = cu_metrics.mean_squared_log_error(y_true, y_preds)
            elif metric_lower in {'rmse', 'rootmeansquarederror', 'root_mean_squared_error'}:
                # RMSE via mean_squared_error with squared=False.
                s = cu_metrics.mean_squared_error(y_true, y_preds, squared=False)
            elif metric_lower == 'r2':
                s = cu_metrics.r2_score(y_true, y_preds)
            elif metric_lower in {'logloss', 'log_loss'}:
                # s = cu_metrics.log_loss(y_true, y_proba, labels=classes)
                s = cu_metrics.log_loss(y_true, y_proba)
            else:
                # Unknown metric names are skipped, not raised.
                logger.warning(f'unknown metric: {metric}')
                continue
            # Unwrap cupy scalars to plain Python floats.
            if isinstance(s, cp.ndarray):
                s = float(cp.asnumpy(s))
            scores[metric] = s
    return scores
def test_rf_regression(datatype, split_algo, mode, column_info,
                       max_features, rows_sample):
    """cuML RF regression (FIL + CPU predict) vs sklearn across test modes."""
    ncols, n_info = column_info
    use_handle = True

    # Build the dataset appropriate to the test mode.
    if mode == 'unit':
        X, y = make_regression(n_samples=500, n_features=ncols,
                               n_informative=n_info, random_state=123)
    elif mode == 'quality':
        X, y = fetch_california_housing(return_X_y=True)
    else:
        X, y = make_regression(n_samples=100000, n_features=ncols,
                               n_informative=n_info, random_state=123)

    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=0)

    # Create a handle for the cuml model
    handle, stream = get_handle(use_handle, n_streams=1)

    # Initialize and fit using cuML's random forest regression model
    cuml_model = curfr(max_features=max_features, rows_sample=rows_sample,
                       n_bins=16, split_algo=split_algo, split_criterion=2,
                       min_rows_per_node=2, seed=123, n_streams=1,
                       n_estimators=50, handle=handle, max_leaves=-1,
                       max_depth=16, accuracy_metric='mse')
    cuml_model.fit(X_train, y_train)

    # predict using FIL
    fil_preds = cuml_model.predict(X_test, predict_model="GPU")
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype)
    fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype)

    # Initialize, fit and predict using
    # sklearn's random forest regression model
    if mode != "stress":
        sk_model = skrfr(n_estimators=50, max_depth=16, min_samples_split=2,
                         max_features=max_features, random_state=10)
        sk_model.fit(X_train, y_train)
        sk_predict = sk_model.predict(X_test)
        sk_r2 = r2_score(y_test, sk_predict, convert_dtype=datatype)
        assert fil_r2 >= (sk_r2 - 0.07)
    assert fil_r2 >= (cu_r2 - 0.02)