def test_xgb_with_multiclass_classification_datasets(data, num_actors, modin_type_y):
    """Compare plain and Modin XGBoost on a multiclass classification dataset.

    Both boosters are trained for the same number of rounds with the
    ``multi:softprob`` objective; the per-round ``mlogloss`` values and the
    final accuracy on the training data must agree.

    Parameters
    ----------
    data : tuple
        ``(dataset, params)`` pair; ``dataset`` exposes ``.data`` and
        ``.target``, ``params`` are extra booster parameters.
        Presumably supplied by pytest parametrization (not visible here).
    num_actors : object
        Forwarded unchanged to ``xgb.train(..., num_actors=...)``.
    modin_type_y : callable
        Converts the raw target into the label object given to ``xgb.DMatrix``.
    """
    dataset, base_params = data
    boost_rounds = 10
    # The objective/metric entries override anything with the same key in base_params.
    params = {
        **base_params,
        "objective": "multi:softprob",
        "eval_metric": "mlogloss",
    }

    features = dataset.data
    target = dataset.target

    plain_dmatrix = xgboost.DMatrix(features, label=target)

    modin_features = pd.DataFrame(features)
    modin_target = modin_type_y(target)
    modin_dmatrix = xgb.DMatrix(modin_features, label=modin_target)

    plain_history = {}
    modin_history = {}
    verbose_eval = False

    plain_booster = xgboost.train(
        params,
        plain_dmatrix,
        boost_rounds,
        evals_result=plain_history,
        evals=[(plain_dmatrix, "train")],
        verbose_eval=verbose_eval,
    )
    modin_booster = xgb.train(
        params,
        modin_dmatrix,
        boost_rounds,
        evals_result=modin_history,
        evals=[(modin_dmatrix, "train")],
        num_actors=num_actors,
        verbose_eval=verbose_eval,
    )

    # Per-round training loss must match between the two implementations.
    plain_losses = plain_history["train"]["mlogloss"]
    modin_losses = modin_history["train"]["mlogloss"]
    assert len(plain_losses) == len(modin_losses)
    for plain_loss, modin_loss in zip(plain_losses, modin_losses):
        np.testing.assert_allclose(plain_loss, modin_loss, atol=0.009)

    plain_probs = plain_booster.predict(plain_dmatrix)
    modin_probs = modin_booster.predict(modin_dmatrix)

    # Pick the most probable class for every row of the probability output.
    plain_classes = np.asarray([np.argmax(row) for row in plain_probs])
    modin_classes = np.asarray([np.argmax(row) for row in modin_probs.to_numpy()])

    plain_accuracy = accuracy_score(target, plain_classes)
    modin_accuracy = accuracy_score(modin_target, modin_classes)

    np.testing.assert_allclose(plain_accuracy, modin_accuracy)
def test_xgb_with_binary_classification_datasets(data, num_actors, modin_type_y):
    """Compare plain and Modin XGBoost on a binary classification dataset.

    Both boosters are trained for the same number of rounds; every metric
    listed in ``param["eval_metric"]`` must have the same number of recorded
    rounds and close per-round values, and the accuracy of the rounded
    predictions on the training data must agree.

    Parameters
    ----------
    data : tuple
        ``(dataset, param)`` pair; ``dataset`` exposes ``.data`` and
        ``.target``, ``param`` holds the booster parameters including
        ``"eval_metric"``. Presumably supplied by pytest parametrization
        (not visible here).
    num_actors : object
        Forwarded unchanged to ``xgb.train(..., num_actors=...)``.
    modin_type_y : callable
        Converts the raw target into the label object given to ``xgb.DMatrix``.
    """
    dataset, param = data
    num_round = 10

    X = dataset.data
    y = dataset.target
    xgb_dmatrix = xgboost.DMatrix(X, label=y)

    modin_X = pd.DataFrame(X)
    modin_y = modin_type_y(y)
    mxgb_dmatrix = xgb.DMatrix(modin_X, label=modin_y)

    evals_result_xgb = {}
    evals_result_mxgb = {}
    verbose_eval = False

    bst = xgboost.train(
        param,
        xgb_dmatrix,
        num_round,
        evals_result=evals_result_xgb,
        evals=[(xgb_dmatrix, "train")],
        verbose_eval=verbose_eval,
    )
    modin_bst = xgb.train(
        param,
        mxgb_dmatrix,
        num_round,
        evals_result=evals_result_mxgb,
        evals=[(mxgb_dmatrix, "train")],
        num_actors=num_actors,
        verbose_eval=verbose_eval,
    )

    # NOTE(review): iterating assumes param["eval_metric"] is a sequence of
    # metric names rather than a single string — confirm against the fixtures.
    for par in param["eval_metric"]:
        xgb_scores = evals_result_xgb["train"][par]
        mxgb_scores = evals_result_mxgb["train"][par]
        # Compare the plain history against the Modin one; the previous
        # assertion compared the plain history with itself and could never fail.
        assert len(xgb_scores) == len(mxgb_scores)
        for xgb_score, mxgb_score in zip(xgb_scores, mxgb_scores):
            np.testing.assert_allclose(xgb_score, mxgb_score, atol=0.011)

    predictions = bst.predict(xgb_dmatrix)
    modin_predictions = modin_bst.predict(mxgb_dmatrix)

    # Round the predicted scores to the nearest integer class label.
    preds = pd.DataFrame(predictions).apply(lambda x: round(x))
    modin_preds = modin_predictions.apply(lambda x: round(x))

    val = accuracy_score(y, preds)
    modin_val = accuracy_score(modin_y, modin_preds)

    np.testing.assert_allclose(val, modin_val, atol=0.002, rtol=0.002)
def test_xgb_with_regression_datasets(data, num_actors, modin_type_y):
    """Compare plain and Modin XGBoost on a regression dataset.

    The dataset is split into train and test parts, both boosters are trained
    with the train and test matrices as evaluation sets, and the per-round
    ``rmse`` values for both sets as well as the mean squared error of the
    predictions on the training part must agree.

    Parameters
    ----------
    data : tuple
        ``(dataset, param)`` pair; ``dataset`` exposes ``.data`` and
        ``.target``, ``param`` holds the booster parameters.
        Presumably supplied by pytest parametrization (not visible here).
    num_actors : object
        Forwarded unchanged to ``xgb.train(..., num_actors=...)``.
    modin_type_y : callable
        Converts the raw target into the label object that gets split.
    """
    dataset, booster_params = data
    boost_rounds = 10

    features_df = pd.DataFrame(dataset.data)
    target_df = modin_type_y(dataset.target)

    # Features and target are split by two separate calls, features first.
    X_train, X_test = train_test_split(features_df)
    y_train, y_test = train_test_split(target_df)

    plain_train = xgboost.DMatrix(X_train, label=y_train)
    plain_test = xgboost.DMatrix(X_test, label=y_test)

    modin_train = xgb.DMatrix(X_train, label=y_train)
    modin_test = xgb.DMatrix(X_test, label=y_test)

    plain_history = {}
    modin_history = {}
    verbose_eval = False

    plain_booster = xgboost.train(
        booster_params,
        plain_train,
        boost_rounds,
        evals_result=plain_history,
        evals=[(plain_train, "train"), (plain_test, "test")],
        verbose_eval=verbose_eval,
    )
    modin_booster = xgb.train(
        booster_params,
        modin_train,
        boost_rounds,
        evals_result=modin_history,
        evals=[(modin_train, "train"), (modin_test, "test")],
        num_actors=num_actors,
        verbose_eval=verbose_eval,
    )

    # Per-round RMSE must match for both evaluation sets.
    for eval_name in ("train", "test"):
        plain_rmse = plain_history[eval_name]["rmse"]
        modin_rmse = modin_history[eval_name]["rmse"]
        assert len(plain_rmse) == len(modin_rmse)
        for plain_value, modin_value in zip(plain_rmse, modin_rmse):
            np.testing.assert_allclose(plain_value, modin_value, rtol=0.0007)

    plain_predictions = plain_booster.predict(plain_train)
    modin_predictions = modin_booster.predict(modin_train)

    plain_mse = mean_squared_error(y_train, plain_predictions)
    modin_mse = mean_squared_error(y_train, modin_predictions)

    np.testing.assert_allclose(plain_mse, modin_mse, rtol=1.25e-05)
def test_invalid_input():
    """Check that DMatrix, train and predict reject inputs of the wrong type.

    Each entry point is expected to raise ``AssertionError`` when it is handed
    a plain nested list instead of the object it requires.
    """
    raw_rows = [[1, 2.0, True], [2, 3.0, False]]

    # DMatrix must only accept a DataFrame as data.
    with pytest.raises(AssertionError):
        xgb.DMatrix(raw_rows, label=pd.Series([1, 2]))

    booster_params = {}
    boost_rounds = 2

    # train must only accept a DMatrix as training data.
    with pytest.raises(AssertionError):
        xgb.train(booster_params, raw_rows, boost_rounds)

    # Build a valid booster so that predict can be exercised.
    frame = pd.DataFrame(
        [[1, 2.0, True], [2, 3.0, False]],
        columns=["a", "b", "c"],
    )
    train_dmatrix = xgb.DMatrix(frame, label=pd.Series([1, 2]))
    booster = xgb.train(booster_params, train_dmatrix, boost_rounds)

    plain_rows = [[1, 2.0, 3.3], [2, 3.0, 4.4]]

    # predict must only accept a DMatrix.
    with pytest.raises(AssertionError):
        booster.predict(plain_rows)
def test_feature_names():
    """Check that custom feature names are honoured by both DMatrix flavours.

    Boosters are trained on the breast cancer dataset with explicit feature
    names; their rounded predictions must give close accuracies, and
    predicting on a DMatrix built without those names must raise
    ``ValueError``.
    """
    # NOTE(review): this test refers to plain XGBoost as ``xgb`` and to the
    # Modin implementation as ``mxgb`` — confirm these aliases match the
    # imports at the top of the file.
    dataset = load_breast_cancer()
    features = dataset.data
    target = dataset.target
    feature_names = [f"feat{idx}" for idx in range(features.shape[1])]

    check_dmatrix(
        features,
        target,
        feature_names=feature_names,
    )

    plain_dmatrix = xgb.DMatrix(features, label=target, feature_names=feature_names)
    modin_dmatrix = mxgb.DMatrix(
        pd.DataFrame(features),
        label=pd.Series(target),
        feature_names=feature_names,
    )

    params = {
        "objective": "binary:logistic",
        "eval_metric": "mlogloss",
    }

    plain_booster = xgb.train(params, plain_dmatrix, num_boost_round=10)
    modin_booster = mxgb.train(params, modin_dmatrix, num_boost_round=10)

    plain_scores = plain_booster.predict(plain_dmatrix)
    modin_scores = modin_booster.predict(modin_dmatrix)

    # Round the predicted scores to class labels before scoring.
    plain_labels = pandas.DataFrame(plain_scores).apply(np.round, axis=0)
    modin_labels = modin_scores.apply(np.round, axis=0)

    plain_accuracy = accuracy_score(target, plain_labels)
    modin_accuracy = accuracy_score(target, modin_labels)

    np.testing.assert_allclose(plain_accuracy, modin_accuracy, atol=0.005, rtol=0.002)

    # Different feature_names (default) must raise error in this case
    default_dmatrix = xgb.DMatrix(features)
    default_modin_dmatrix = mxgb.DMatrix(pd.DataFrame(features))

    with pytest.raises(ValueError):
        plain_booster.predict(default_dmatrix)

    # presumably repr() is needed to force evaluation of the Modin result — TODO confirm
    with pytest.raises(ValueError):
        repr(modin_booster.predict(default_modin_dmatrix))
def test_backend():
    """Run a minimal DMatrix + train call, tolerating ``ValueError``.

    The test passes both when the call succeeds and when it raises
    ``ValueError``; any other exception fails the test.
    """
    try:
        # Everything stays inside the try block: either step may raise.
        features = pd.DataFrame([0])
        label = pd.DataFrame([0])
        dmatrix = xgb.DMatrix(features, label)
        xgb.train({}, dmatrix)
    except ValueError:
        # NOTE(review): presumably the error expected on an unsupported
        # execution backend — confirm against the implementation.
        pass