def test_capper():
    """Fit ``capper`` on one frame and check both of its outputs.

    ``capper`` is called with the columns ``feat1`` and ``feat2`` and a
    precomputed cap of ``9.0`` for ``feat1`` only.  The test then checks

    * the dataframe returned by the fit, and
    * the result of applying the returned prediction function to a second,
      unseen frame.

    The expected frames have ``feat1`` limited to 9 and, on the unseen frame,
    the ``feat2`` value 200 replaced by 75 (presumably the largest ``feat2``
    value seen during the fit); ``None`` entries are expected to stay missing.
    """
    # NOTE(review): another ``test_capper`` is defined later in this source;
    # if both live in the same module the later one shadows this one and this
    # test is never collected -- confirm and rename one of them.
    train_frame = pd.DataFrame(
        {
            "feat1": [10, 13, 50],
            "feat2": [50, 75, None],
        }
    )
    unseen_frame = pd.DataFrame(
        {
            "feat1": [7, 15],
            "feat2": [200, None],
        }
    )

    want_train = pd.DataFrame(
        {
            "feat1": [9, 9, 9],
            "feat2": [50, 75, None],
        }
    )
    want_unseen = pd.DataFrame(
        {
            "feat1": [7, 9],
            "feat2": [75, None],
        }
    )

    # The third element of the result (the log) is not inspected here.
    apply_caps, capped_train, _ = capper(
        train_frame,
        ["feat1", "feat2"],
        {"feat1": 9.0},
    )

    assert want_train.equals(capped_train)

    capped_unseen = apply_caps(unseen_frame)
    assert want_unseen.equals(capped_unseen)
def test_capper():
    """Check ``capper`` with and without options that keep the raw columns.

    Four fits are made on the same training frame, all with columns
    ``feat1``/``feat2`` and a precomputed cap of 9 for ``feat1``:

    1. no extra option,
    2. ``suffix="_suffix"``,
    3. ``prefix="prefix_"``,
    4. ``columns_mapping`` sending each column to ``<name>_raw``.

    For the plain fit, the output must equal the capped frame.  For the other
    three, the output must equal the capped frame with the untouched input
    columns placed beside it under the renamed labels.  Each expectation is
    checked both on the fit output and on the prediction function applied to
    a second frame.
    """
    train_df = pd.DataFrame({"feat1": [10, 13, 50], "feat2": [50, 75, None]})
    new_df = pd.DataFrame({"feat1": [7, 15], "feat2": [200, None]})

    capped_train = pd.DataFrame({"feat1": [9, 9, 9], "feat2": [50, 75, None]})
    capped_new = pd.DataFrame({"feat1": [7, 9], "feat2": [75, None]})

    def fit(**naming_options):
        # Build the column list and caps dict afresh on every call, just as
        # four separate call sites with their own literals would.
        return capper(
            train_df, ["feat1", "feat2"], {"feat1": 9}, **naming_options
        )

    def beside_raw(capped, raw, rename):
        # Capped columns first, then a renamed copy of the raw input columns.
        renamed_raw = rename(raw.copy())
        return pd.concat([capped, renamed_raw], axis=1)

    # All four fits happen before any assertion runs.
    plain_fn, plain_out, _ = fit()
    suffix_fn, suffix_out, _ = fit(suffix="_suffix")
    prefix_fn, prefix_out, _ = fit(prefix="prefix_")
    mapped_fn, mapped_out, _ = fit(
        columns_mapping={"feat1": "feat1_raw", "feat2": "feat2_raw"}
    )

    assert capped_train.equals(plain_out)
    assert capped_new.equals(plain_fn(new_df))

    # (prediction function, fit output, how the raw columns get relabelled)
    renamed_cases = [
        (suffix_fn, suffix_out, lambda frame: frame.add_suffix("_suffix")),
        (prefix_fn, prefix_out, lambda frame: frame.add_prefix("prefix_")),
        (mapped_fn, mapped_out, lambda frame: frame.add_suffix("_raw")),
    ]
    for predict, fitted_out, rename in renamed_cases:
        assert beside_raw(capped_train, train_df, rename).equals(fitted_out)
        assert beside_raw(capped_new, new_df, rename).equals(predict(new_df))
# Shape the test data into a two-column frame: `income` (the feature) and
# `bill_amount` (the regression target).  `df` and `data_bill_amount` are
# defined earlier in the script.
df.columns = ["income"]
df["bill_amount"] = data_bill_amount * 10000
# Scale with plain column arithmetic (same result as a per-element
# `apply(lambda x: x * 1000)`, and consistent with the line above).
df["income"] = df["income"] * 1000
print(f"turned our test data into an income dataframe...\n {df.head()}")

# ---------------------------------------------------------------------------
# Get to the actual work.
from fklearn.training.pipeline import build_pipeline
from fklearn.training.regression import linear_regression_learner
from fklearn.training.transformation import capper, floorer, prediction_ranger

# Initialize several learner functions.  Each call below leaves out the
# dataframe, so it yields a function that is still waiting for the data
# (fklearn learners are curried -- see the fklearn docs).
#   1. cap the input `income` column (precomputed cap: 500) to ignore outliers,
#   2. then a usual linear regression of `bill_amount` on `income`,
#   3. finally clamp the regression output to the range [0.0, 200.0].
capper_fn = capper(columns_to_cap=["income"], precomputed_caps={"income": 500})
regression_fn = linear_regression_learner(features=["income"], target="bill_amount")
ranger_fn = prediction_ranger(prediction_min=0.0, prediction_max=200.0)

# Chain the three learners into one pipeline and fit it on the data.
# The call returns three values; `df` is rebound to the returned dataframe.
learner = build_pipeline(capper_fn, regression_fn, ranger_fn)
p, df, log = learner(df)

print(
    f" the returned dataframe now contains our capped prediction:\n {df.head(5)}"
)