def test_xgb_regression_learner():
    """Fit a weighted xgb regression learner and check prediction/SHAP columns."""
    df_train = pd.DataFrame({
        'id': ["id1", "id2", "id3", "id4"],
        'x1': [10.0, 13.0, 10.0, 13.0],
        "x2": [0, 1, 1, 0],
        "w": [2, 1, 2, 0.5],
        'y': [2.3, 4.0, 100.0, -3.9]
    })

    df_test = pd.DataFrame({
        'id': ["id4", "id4", "id5", "id6"],
        'x1': [12.0, 1000.0, -4.0, 0.0],
        "x2": [1, 1, 0, 1],
        "w": [1, 2, 0, 0.5],
        'y': [1.3, -4.0, 0.0, 49]
    })

    feature_cols = ["x1", "x2"]

    learner = xgb_regression_learner(
        features=feature_cols,
        target="y",
        learning_rate=0.1,
        num_estimators=20,
        extra_params={"max_depth": 2, "seed": 42},
        prediction_column="prediction",
        weight_column="w",
    )

    predict_fn, scored_train, log = learner(df_train)
    scored_test = predict_fn(df_test)

    # The scored frames should carry every original column plus "prediction".
    train_cols_expected = list(df_train.columns) + ["prediction"]
    test_cols_expected = list(df_test.columns) + ["prediction"]

    assert Counter(scored_train.columns.tolist()) == Counter(train_cols_expected)
    assert Counter(scored_test.columns.tolist()) == Counter(test_cols_expected)
    assert (scored_test.columns == scored_train.columns).all()
    assert "prediction" in scored_test.columns

    # SHAP test: shap columns appear and values match (rows, n_features).
    scored_shap = predict_fn(df_test, apply_shap=True)
    assert "shap_values" in scored_shap.columns
    assert "shap_expected_value" in scored_shap.columns
    assert np.vstack(scored_shap["shap_values"]).shape == (4, 2)
def test_build_pipeline(has_repeated_learners):
    """Build an imputer -> categorizer -> xgb pipeline and verify that SHAP
    columns are additive: predicting with apply_shap=True adds exactly
    {"shap_values", "shap_expected_value"} and leaves all other columns equal
    to the plain prediction output.
    """
    df_train = pd.DataFrame({
        'id': ["id1", "id2", "id3", "id4", "id3", "id4"],
        'x1': [10.0, 13.0, 10.0, 13.0, None, 13.0],
        "x2": [0, 1, 1, 0, 1, 0],
        "cat": ["c1", "c1", "c2", None, "c2", "c4"],
        'y': [2.3, 4.0, 100.0, -3.9, 100.0, -3.9]
    })

    df_test = pd.DataFrame({
        'id': ["id4", "id4", "id5", "id6", "id5", "id6"],
        'x1': [12.0, 1000.0, -4.0, 0.0, -4.0, 0.0],
        "x2": [1, 1, 0, None, 0, 1],
        "cat": ["c1", "c2", "c5", None, "c2", "c3"],
        'y': [1.3, -4.0, 0.0, 49, 0.0, 49]
    })

    features = ["x1", "x2", "cat"]
    target = "y"

    train_fn = build_pipeline(
        placeholder_imputer(columns_to_impute=features, placeholder_value=-999),
        count_categorizer(columns_to_categorize=["cat"]),
        xgb_regression_learner(features=features,
                               target=target,
                               num_estimators=20,
                               extra_params={"seed": 42}),
        has_repeated_learners=has_repeated_learners)

    predict_fn, pred_train, log = train_fn(df_train)

    pred_test_with_shap = predict_fn(df_test, apply_shap=True)
    # SHAP prediction adds exactly the two shap columns.
    assert set(pred_test_with_shap.columns) - set(pred_train.columns) == {
        "shap_values", "shap_expected_value"
    }

    pred_test_without_shap = predict_fn(df_test)
    assert set(pred_test_without_shap.columns) == set(pred_train.columns)

    # Fix: pd.util.testing was deprecated in pandas 0.25 and removed in 2.0;
    # the public API is pd.testing.assert_frame_equal.
    pd.testing.assert_frame_equal(
        pred_test_with_shap[pred_test_without_shap.columns],
        pred_test_without_shap)
def test_build_pipeline_with_onehotencoder(has_repeated_learners):
    """Pipeline with a one-hot categorizer: the scored test frame should carry
    the encoded feature columns (including the hardcoded NaN dummy) plus id,
    target and prediction."""
    df_train = pd.DataFrame({
        'id': ["id1", "id2", "id3", "id4", "id3", "id4"],
        'x1': [10.0, 13.0, 10.0, 13.0, None, 13.0],
        "x2": [0, 1, 1, 0, 1, 0],
        "cat": ["c1", "c1", "c2", None, "c2", "c4"],
        'y': [2.3, 4.0, 100.0, -3.9, 100.0, -3.9]
    })

    df_test = pd.DataFrame({
        'id': ["id4", "id4", "id5", "id6", "id5", "id6"],
        'x1': [12.0, 1000.0, -4.0, 0.0, -4.0, 0.0],
        "x2": [1, 1, 0, None, 0, 1],
        "cat": ["c1", "c2", "c5", None, "c2", "c3"],
        'y': [1.3, -4.0, 0.0, 49, 0.0, 49]
    })

    feature_cols = ["x1", "x2", "cat"]
    target = "y"

    pipeline_fn = build_pipeline(
        placeholder_imputer(columns_to_impute=["x1", "x2"],
                            placeholder_value=-999),
        onehot_categorizer(columns_to_categorize=["cat"], hardcode_nans=True),
        xgb_regression_learner(features=feature_cols,
                               target=target,
                               num_estimators=20,
                               extra_params={"seed": 42}),
        has_repeated_learners=has_repeated_learners,
    )

    predict_fn, scored_train, log = pipeline_fn(df_train)
    scored_test = predict_fn(df_test)

    # Encoded columns come from the training categories (c1, c2, c4) plus the
    # hardcoded NaN dummy; unseen test categories get no column of their own.
    encoded_feature_cols = [
        "x1",
        "x2",
        "fklearn_feat__cat==c1",
        "fklearn_feat__cat==c2",
        "fklearn_feat__cat==c4",
        "fklearn_feat__cat==nan",
    ]
    expected_cols = set(encoded_feature_cols + ["id", target, "prediction"])
    assert set(scored_test.columns) == expected_cols