def test_count_categorizer():
    """Check ``count_categorizer`` on a training frame and on a held-out frame.

    The expected frames encode each categorical value as the number of times
    it appears in the training data, keep NaN entries as NaN, leave the
    numeric column untouched, and map a category never seen during training
    to the ``replace_unseen`` constant.
    """
    categorical_columns = ["feat2_cat", "feat3_cat"]
    train_numeric = [1, 0.5, nan, 100]
    test_numeric = [2, 20, 200, 2000]

    train_frame = pd.DataFrame({
        "feat1_num": train_numeric,
        "feat2_cat": ["a", "a", "a", "b"],
        "feat3_cat": ["c", "c", "c", nan],
    })
    train_expected = pd.DataFrame({
        "feat1_num": train_numeric,
        "feat2_cat": [3, 3, 3, 1],
        "feat3_cat": [3, 3, 3, nan],
    })

    test_frame = pd.DataFrame({
        "feat1_num": test_numeric,
        "feat2_cat": ["a", "b", "b", "d"],
        "feat3_cat": [nan, nan, "c", "c"],
    })
    test_expected = pd.DataFrame({
        "feat1_num": test_numeric,
        "feat2_cat": [3, 1, 1, 1],  # replace unseen vars with constant (1)
        "feat3_cat": [nan, nan, 3, 3],
    })

    learner = count_categorizer(
        columns_to_categorize=categorical_columns,
        replace_unseen=1,
    )

    # The learner returns (prediction function, transformed training data, log).
    predict, train_actual, _ = learner(train_frame)
    test_actual = predict(test_frame)

    assert train_actual.equals(train_expected)
    assert test_actual.equals(test_expected)
def test_build_pipeline(has_repeated_learners):
    """Check that a three-step pipeline predicts consistently with and without SHAP.

    The pipeline chains a placeholder imputer, a count categorizer and an
    XGBoost regression learner. The test asserts that:

    * predicting with ``apply_shap=True`` adds exactly the columns
      ``shap_values`` and ``shap_expected_value`` on top of the training
      prediction columns;
    * predicting without SHAP yields the same columns as the training
      prediction;
    * the columns shared by both predictions hold identical values.

    Args:
        has_repeated_learners: Forwarded unchanged to ``build_pipeline``.
            NOTE(review): presumably supplied by a pytest parametrization or
            fixture that is not visible in this chunk - confirm.
    """
    df_train = pd.DataFrame({
        "id": ["id1", "id2", "id3", "id4", "id3", "id4"],
        "x1": [10.0, 13.0, 10.0, 13.0, None, 13.0],
        "x2": [0, 1, 1, 0, 1, 0],
        "cat": ["c1", "c1", "c2", None, "c2", "c4"],
        "y": [2.3, 4.0, 100.0, -3.9, 100.0, -3.9],
    })

    df_test = pd.DataFrame({
        "id": ["id4", "id4", "id5", "id6", "id5", "id6"],
        "x1": [12.0, 1000.0, -4.0, 0.0, -4.0, 0.0],
        "x2": [1, 1, 0, None, 0, 1],
        "cat": ["c1", "c2", "c5", None, "c2", "c3"],
        "y": [1.3, -4.0, 0.0, 49, 0.0, 49],
    })

    features = ["x1", "x2", "cat"]
    target = "y"

    train_fn = build_pipeline(
        placeholder_imputer(columns_to_impute=features, placeholder_value=-999),
        count_categorizer(columns_to_categorize=["cat"]),
        xgb_regression_learner(
            features=features,
            target=target,
            num_estimators=20,
            extra_params={"seed": 42},
        ),
        has_repeated_learners=has_repeated_learners,
    )

    predict_fn, pred_train, _ = train_fn(df_train)

    pred_test_with_shap = predict_fn(df_test, apply_shap=True)
    shap_only_columns = set(pred_test_with_shap.columns) - set(pred_train.columns)
    assert shap_only_columns == {"shap_values", "shap_expected_value"}

    pred_test_without_shap = predict_fn(df_test)
    assert set(pred_test_without_shap.columns) == set(pred_train.columns)

    # Use the public ``pd.testing`` namespace: ``pd.util.testing`` was
    # deprecated in pandas 1.0 and removed in pandas 2.0.
    pd.testing.assert_frame_equal(
        pred_test_with_shap[pred_test_without_shap.columns],
        pred_test_without_shap,
    )
def test_count_categorizer():
    """Check ``count_categorizer`` alone and with its column-duplication options.

    Four learners are fitted on the same training frame: one with no extra
    options, one with ``suffix``, one with ``prefix`` and one with
    ``columns_mapping``. For the plain learner the expected output is the
    count-encoded frame. For the other three the expected output is that same
    frame with the raw categorical columns of the input appended under the
    renamed labels (``*_suffix``, ``prefix_*`` and ``*_raw`` respectively).
    """
    categorized = ["feat2_cat", "feat3_cat"]

    input_df_train = pd.DataFrame({
        "feat1_num": [1, 0.5, nan, 100],
        "feat2_cat": ["a", "a", "a", "b"],
        "feat3_cat": ["c", "c", "c", nan],
    })
    expected_output_train = pd.DataFrame({
        "feat1_num": [1, 0.5, nan, 100],
        "feat2_cat": [3, 3, 3, 1],
        "feat3_cat": [3, 3, 3, nan],
    })

    input_df_test = pd.DataFrame({
        "feat1_num": [2, 20, 200, 2000],
        "feat2_cat": ["a", "b", "b", "d"],
        "feat3_cat": [nan, nan, "c", "c"],
    })
    expected_output_test = pd.DataFrame({
        "feat1_num": [2, 20, 200, 2000],
        "feat2_cat": [3, 1, 1, 1],  # replace unseen vars with constant (1)
        "feat3_cat": [nan, nan, 3, 3],
    })

    def make_learner(**duplication_options):
        """Build a categorizer over both categorical columns with the given extras."""
        return count_categorizer(
            columns_to_categorize=list(categorized),
            replace_unseen=1,
            **duplication_options,
        )

    def with_raw_columns(expected, source, rename):
        """Append the renamed raw categorical columns of ``source`` to ``expected``."""
        renamed_raw = rename(source[categorized].copy())
        return pd.concat([expected, renamed_raw], axis=1)

    plain_learner = make_learner()
    suffix_learner = make_learner(suffix="_suffix")
    prefix_learner = make_learner(prefix="prefix_")
    mapping_learner = make_learner(
        columns_mapping={
            "feat2_cat": "feat2_cat_raw",
            "feat3_cat": "feat3_cat_raw",
        },
    )

    # Each learner returns (prediction function, transformed training data, log).
    predict_plain, train_plain, _ = plain_learner(input_df_train)
    predict_suffix, train_suffix, _ = suffix_learner(input_df_train)
    predict_prefix, train_prefix, _ = prefix_learner(input_df_train)
    predict_mapping, train_mapping, _ = mapping_learner(input_df_train)

    assert expected_output_train.equals(train_plain)
    assert expected_output_test.equals(predict_plain(input_df_test))

    duplication_cases = [
        (train_suffix, predict_suffix, lambda raw: raw.add_suffix("_suffix")),
        (train_prefix, predict_prefix, lambda raw: raw.add_prefix("prefix_")),
        (train_mapping, predict_mapping, lambda raw: raw.add_suffix("_raw")),
    ]
    for train_actual, predict, rename in duplication_cases:
        assert with_raw_columns(
            expected_output_train, input_df_train, rename
        ).equals(train_actual)
        assert with_raw_columns(
            expected_output_test, input_df_test, rename
        ).equals(predict(input_df_test))