def test_xgb_with_regression_datasets(data, num_actors, modin_type_y):
    """Compare Modin XGBoost against vanilla XGBoost on regression datasets.

    Trains both implementations on the same split, then checks that the
    per-round train/test RMSE histories are close (rtol=7e-4) and that the
    final train-set MSE of the two models agrees (rtol=1.25e-5).
    """
    dataset, param = data
    num_round = 10

    X_df = pd.DataFrame(dataset.data)
    y_df = modin_type_y(dataset.target)
    # NOTE(review): X and y are split by two independent train_test_split
    # calls; if that helper shuffles, the label split may not correspond to
    # the feature split — confirm the helper splits deterministically.
    X_train, X_test = train_test_split(X_df)
    y_train, y_test = train_test_split(y_df)

    train_xgb_dmatrix = xgboost.DMatrix(X_train, label=y_train)
    test_xgb_dmatrix = xgboost.DMatrix(X_test, label=y_test)

    train_mxgb_dmatrix = xgb.DMatrix(X_train, label=y_train)
    test_mxgb_dmatrix = xgb.DMatrix(X_test, label=y_test)

    evals_result_xgb = {}
    evals_result_mxgb = {}
    verbose_eval = False
    bst = xgboost.train(
        param,
        train_xgb_dmatrix,
        num_round,
        evals_result=evals_result_xgb,
        evals=[(train_xgb_dmatrix, "train"), (test_xgb_dmatrix, "test")],
        verbose_eval=verbose_eval,
    )
    modin_bst = xgb.train(
        param,
        train_mxgb_dmatrix,
        num_round,
        evals_result=evals_result_mxgb,
        evals=[(train_mxgb_dmatrix, "train"), (test_mxgb_dmatrix, "test")],
        num_actors=num_actors,
        verbose_eval=verbose_eval,
    )

    # BUGFIX: the original loop variable was named `param`, shadowing the
    # training-parameter dict above; use a dedicated name instead.
    for eval_set in ["train", "test"]:
        assert len(evals_result_xgb[eval_set]["rmse"]) == len(
            evals_result_mxgb[eval_set]["rmse"]
        )
        for i in range(len(evals_result_xgb[eval_set]["rmse"])):
            np.testing.assert_allclose(
                evals_result_xgb[eval_set]["rmse"][i],
                evals_result_mxgb[eval_set]["rmse"][i],
                rtol=0.0007,
            )

    predictions = bst.predict(train_xgb_dmatrix)
    modin_predictions = modin_bst.predict(train_mxgb_dmatrix)

    val = mean_squared_error(y_train, predictions)
    modin_val = mean_squared_error(y_train, modin_predictions)

    np.testing.assert_allclose(val, modin_val, rtol=1.25e-05)
def test_xgb_with_multiclass_classification_datasets(data, num_actors, modin_type_y):
    """Compare Modin XGBoost against vanilla XGBoost on multiclass datasets.

    Both models are trained with the ``multi:softprob`` objective; the
    per-round mlogloss histories must be close (atol=0.009) and the
    accuracies of the argmax-decoded predictions must match.
    """
    dataset, param_ = data
    num_round = 10
    # Force the multiclass objective/metric on top of the supplied params.
    param = {**param_, "objective": "multi:softprob", "eval_metric": "mlogloss"}

    X, y = dataset.data, dataset.target
    xgb_dmatrix = xgboost.DMatrix(X, label=y)
    modin_X = pd.DataFrame(X)
    modin_y = modin_type_y(y)
    mxgb_dmatrix = xgb.DMatrix(modin_X, label=modin_y)

    evals_result_xgb = {}
    evals_result_mxgb = {}
    bst = xgboost.train(
        param,
        xgb_dmatrix,
        num_round,
        evals_result=evals_result_xgb,
        evals=[(xgb_dmatrix, "train")],
        verbose_eval=False,
    )
    modin_bst = xgb.train(
        param,
        mxgb_dmatrix,
        num_round,
        evals_result=evals_result_mxgb,
        evals=[(mxgb_dmatrix, "train")],
        num_actors=num_actors,
        verbose_eval=False,
    )

    history = evals_result_xgb["train"]["mlogloss"]
    modin_history = evals_result_mxgb["train"]["mlogloss"]
    assert len(history) == len(modin_history)
    for reference, actual in zip(history, modin_history):
        np.testing.assert_allclose(reference, actual, atol=0.009)

    predictions = bst.predict(xgb_dmatrix)
    modin_predictions = modin_bst.predict(mxgb_dmatrix)

    # Collapse per-class probabilities to predicted class labels.
    array_preds = np.argmax(predictions, axis=1)
    modin_array_preds = np.argmax(modin_predictions.to_numpy(), axis=1)

    val = accuracy_score(y, array_preds)
    modin_val = accuracy_score(modin_y, modin_array_preds)

    np.testing.assert_allclose(val, modin_val)
def test_feature_names():
    """Explicit feature names must behave identically in xgboost and Modin.

    Trains both implementations with named features, checks accuracy parity,
    and verifies that predicting on a DMatrix built without those names
    raises ``ValueError`` in both.
    """
    dataset = load_breast_cancer()
    X, y = dataset.data, dataset.target
    feature_names = [f"feat{i}" for i in range(X.shape[1])]

    check_dmatrix(X, y, feature_names=feature_names)

    dmatrix = xgb.DMatrix(X, label=y, feature_names=feature_names)
    md_dmatrix = mxgb.DMatrix(
        pd.DataFrame(X), label=pd.Series(y), feature_names=feature_names
    )

    params = {"objective": "binary:logistic", "eval_metric": "mlogloss"}

    booster = xgb.train(params, dmatrix, num_boost_round=10)
    md_booster = mxgb.train(params, md_dmatrix, num_boost_round=10)

    predictions = booster.predict(dmatrix)
    modin_predictions = md_booster.predict(md_dmatrix)

    # Round probabilities to hard 0/1 labels before scoring.
    preds = pandas.DataFrame(predictions).apply(np.round, axis=0)
    modin_preds = modin_predictions.apply(np.round, axis=0)

    accuracy = accuracy_score(y, preds)
    md_accuracy = accuracy_score(y, modin_preds)
    np.testing.assert_allclose(accuracy, md_accuracy, atol=0.005, rtol=0.002)

    # Matrices built WITHOUT feature names must be rejected at predict time,
    # since the boosters were trained with explicit names.
    unnamed_dm = xgb.DMatrix(X)
    unnamed_md_dm = mxgb.DMatrix(pd.DataFrame(X))
    with pytest.raises(ValueError):
        booster.predict(unnamed_dm)
    with pytest.raises(ValueError):
        # repr() presumably forces evaluation of the deferred Modin result
        # so the error actually surfaces — confirm against Modin semantics.
        repr(md_booster.predict(unnamed_md_dm))
def test_xgb_with_binary_classification_datasets(data, num_actors, modin_type_y):
    """Compare Modin XGBoost against vanilla XGBoost on binary classification.

    For every metric listed in ``param["eval_metric"]`` the per-round train
    histories must be close (atol=0.011), and the accuracies computed from
    rounded predictions must agree (atol/rtol=0.002).
    """
    dataset, param = data
    num_round = 10

    X = dataset.data
    y = dataset.target
    xgb_dmatrix = xgboost.DMatrix(X, label=y)
    modin_X = pd.DataFrame(X)
    modin_y = modin_type_y(y)
    mxgb_dmatrix = xgb.DMatrix(modin_X, label=modin_y)

    evals_result_xgb = {}
    evals_result_mxgb = {}
    verbose_eval = False
    bst = xgboost.train(
        param,
        xgb_dmatrix,
        num_round,
        evals_result=evals_result_xgb,
        evals=[(xgb_dmatrix, "train")],
        verbose_eval=verbose_eval,
    )
    modin_bst = xgb.train(
        param,
        mxgb_dmatrix,
        num_round,
        evals_result=evals_result_mxgb,
        evals=[(mxgb_dmatrix, "train")],
        num_actors=num_actors,
        verbose_eval=verbose_eval,
    )

    for par in param["eval_metric"]:
        # BUGFIX: the original compared the xgboost history length to itself,
        # making the assertion vacuous; compare against the Modin history.
        assert len(evals_result_xgb["train"][par]) == len(
            evals_result_mxgb["train"][par]
        )
        for i in range(len(evals_result_xgb["train"][par])):
            np.testing.assert_allclose(
                evals_result_xgb["train"][par][i],
                evals_result_mxgb["train"][par][i],
                atol=0.011,
            )

    predictions = bst.predict(xgb_dmatrix)
    modin_predictions = modin_bst.predict(mxgb_dmatrix)

    # Round probabilities to hard 0/1 labels before scoring.
    preds = pd.DataFrame(predictions).apply(lambda x: round(x))
    modin_preds = modin_predictions.apply(lambda x: round(x))

    val = accuracy_score(y, preds)
    modin_val = accuracy_score(modin_y, modin_preds)

    np.testing.assert_allclose(val, modin_val, atol=0.002, rtol=0.002)
def check_dmatrix(data, label=None, **kwargs):
    """Verify that Modin's DMatrix mirrors xgboost's DMatrix for one input.

    If xgboost rejects the input, Modin must raise a compatible exception
    type; otherwise both matrices must agree on row/column counts, feature
    names, and feature types.
    """
    md_data = pd.DataFrame(data)
    md_label = label if label is None else pd.Series(label)
    try:
        reference_dm = xgb.DMatrix(data, label=label, **kwargs)
    except Exception as xgb_exception:
        with pytest.raises(Exception) as mxgb_exception:
            mxgb.DMatrix(md_data, label=md_label, **kwargs)
        # xgboost raises `XGBoostError` (a `ValueError` subclass) while Modin
        # raises `ValueError`, so `xgb_exception` is deliberately the first
        # `isinstance` argument so the subclass relationship passes.
        assert isinstance(
            xgb_exception, type(mxgb_exception.value)
        ), "Got Modin Exception type {}, but xgboost Exception type {} was expected".format(
            type(mxgb_exception.value), type(xgb_exception)
        )
    else:
        md_dm = mxgb.DMatrix(md_data, label=md_label, **kwargs)
        for accessor in ("num_row", "num_col"):
            assert getattr(md_dm, accessor)() == getattr(reference_dm, accessor)()
        assert md_dm.feature_names == reference_dm.feature_names
        assert md_dm.feature_types == reference_dm.feature_types
def test_invalid_input():
    """The Modin XGBoost API must reject non-Modin inputs with AssertionError."""
    raw_rows = [[1, 2.0, True], [2, 3.0, False]]

    # DMatrix must be built from a Modin DataFrame, not a plain list.
    with pytest.raises(AssertionError):
        xgb.DMatrix(raw_rows, label=pd.Series([1, 2]))

    param = {}
    num_round = 2
    # train accepts only a Modin DMatrix.
    with pytest.raises(AssertionError):
        xgb.train(param, raw_rows, num_round)

    frame = pd.DataFrame(raw_rows, columns=["a", "b", "c"])
    booster = xgb.train(param, xgb.DMatrix(frame, label=pd.Series([1, 2])), num_round)

    # predict likewise accepts only a Modin DMatrix.
    with pytest.raises(AssertionError):
        booster.predict([[1, 2.0, 3.3], [2, 3.0, 4.4]])
def test_feature_weights():
    """Feature weights set via ``set_info`` must round-trip identically in
    xgboost and Modin, including the empty-vector case."""
    n_rows, n_cols = 10, 50
    weights = rng.uniform(size=n_cols)
    X = rng.randn(n_rows, n_cols)
    dm = xgb.DMatrix(X)
    md_dm = mxgb.DMatrix(pd.DataFrame(X))

    for matrix in (dm, md_dm):
        matrix.set_info(feature_weights=weights)
    np.testing.assert_allclose(
        dm.get_float_info("feature_weights"),
        md_dm.get_float_info("feature_weights"),
    )

    # An empty weight vector must be accepted and stored as length 0 by both.
    for matrix in (dm, md_dm):
        matrix.set_info(feature_weights=np.empty((0,)))
    assert (
        dm.get_float_info("feature_weights").shape[0]
        == md_dm.get_float_info("feature_weights").shape[0]
        == 0
    )
def test_backend():
    """Smoke test: training on a trivial one-row frame must either succeed or
    fail with ``ValueError`` (e.g. an unsupported-backend rejection)."""
    try:
        matrix = xgb.DMatrix(pd.DataFrame([0]), pd.DataFrame([0]))
        xgb.train({}, matrix)
    except ValueError:
        pass