def test_label_encoder_with_unseen_values():
    """Check the code LabelEncoder assigns to a category absent at fit time.

    Each ``unseen`` policy is fitted on the same training frame (three
    distinct labels plus two missing values) and then applied to a test
    frame holding only the never-fitted label ``"x"``. An empty
    ``output_suffix`` is used, and the output is expected to keep the single
    column name ``"col"``.
    """
    train = pd.DataFrame({"col": ["b", "a", "c", np.nan, np.nan]})
    test = pd.DataFrame({"col": ["x"]})

    # (value passed as ``unseen``, code expected for the unseen label "x")
    cases = [("minus_one", -1), ("n_unique", 3)]

    for policy, unseen_code in cases:
        encoder = LabelEncoder(
            sort_category=True,
            output_suffix="",
            unseen=policy,
        )

        # Training codes are the same for both policies; NaN maps to -1.
        train_encoded = encoder.fit_transform(train)
        assert allclose(train_encoded["col"], np.array([1, 0, 2, -1, -1]))
        assert train_encoded.columns.tolist() == ["col"]

        test_encoded = encoder.transform(test)
        assert allclose(test_encoded["col"], np.array([unseen_code]))
        assert test_encoded.columns.tolist() == ["col"]
def test_aggregation(dataframes):
    """Check the column names and values produced by ``aggregation``.

    For every frame supplied by the ``dataframes`` fixture (defined outside
    this view), columns ``a`` and ``c`` are aggregated with ``max`` per group
    of ``b``. The returned column list and the resulting values are compared
    against fixed expectations.
    """
    for frame in dataframes:
        key = "b"
        values = ["a", "c"]
        methods = ["max"]

        # Insertion order matters: it mirrors the expected ``new_cols`` order.
        expected = {
            "agg_max_a_grpby_b": np.array([3, 3, 3, 5, 5]),
            "agg_max_c_grpby_b": np.array([1, 1, 1, 1, 1]),
        }

        result, created = aggregation(frame, key, values, methods)

        assert created == list(expected)
        for name in expected:
            assert name in result.columns
        for name, column_values in expected.items():
            assert allclose(result[name], column_values)
def test_target_encoder_with_categorical_values(dataframes):
    """Check TargetEncoder output on the frames from the ``dataframes`` fixture.

    Verifies that the encoder keeps the supplied two-split fold, holds one
    entry per input column in ``_target_encoders``, yields the expected
    ``col1_te`` values, appends the ``*_te`` columns after the originals, and
    leaves the input frame's columns unchanged.
    """
    source_columns = ["col1", "col2", "target"]
    added_columns = ["col1_te", "col2_te"]

    for frame in dataframes:
        splitter = KFold(n_splits=2, shuffle=False)
        encoder = TargetEncoder(input_cols=["col1", "col2"], fold=splitter)
        encoded = encoder.fit_transform(frame)

        assert encoder.fold.get_n_splits() == 2

        # sorted() already returns a list, so no extra list() is needed.
        fitted_keys = sorted(encoder._target_encoders.keys())
        assert fitted_keys == ["col1", "col2"]

        expected_col1 = np.array([0.0, 0.0, 0.0, 0.66666667, 1.0, 1.0, 1.0])
        assert allclose(encoded["col1_te"], expected_col1)

        # The input frame must not have gained the encoded columns.
        assert frame.columns.tolist() == source_columns
        assert encoded.columns.tolist() == source_columns + added_columns
def test_pipeline(dataframes):
    """Check that Pipeline applies its transformers in the order given.

    Two throwaway transformers each add one constant column. After running
    the pipeline, the frame is expected to hold the original ``var1`` column
    followed by ``new1`` and then ``new2``.
    """

    class DummyTransformer1(TransformerMixin):
        """Add a constant column ``new1`` filled with 1."""

        def transform(self, input_df: XDataFrame) -> XDataFrame:
            input_df["new1"] = 1
            return input_df

    class DummyTransformer2(TransformerMixin):
        """Add a constant column ``new2`` filled with 2."""

        def transform(self, input_df: XDataFrame) -> XDataFrame:
            input_df["new2"] = 2
            return input_df

    for frame in dataframes:
        steps = [DummyTransformer1(), DummyTransformer2()]
        pipeline = Pipeline(steps)
        transformed = pipeline.transform(frame)

        assert transformed.columns.tolist() == ["var1", "new1", "new2"]

        ones = np.array([1, 1, 1])
        assert allclose(transformed["new1"], ones)
        twos = np.array([2, 2, 2])
        assert allclose(transformed["new2"], twos)
def test_count_encoder_with_cat_cols(dataframes):
    """Check CountEncoder with default arguments on the fixture frames.

    The encoded frame is expected to contain ``col1`` followed by a
    ``col1_ce`` column holding the values ``[1, 2, 2, 1]``.
    """
    for frame in dataframes:
        encoded = CountEncoder().fit_transform(frame)

        expected_counts = np.array([1, 2, 2, 1])
        assert allclose(encoded["col1_ce"], expected_counts)

        assert encoded.columns.tolist() == ["col1", "col1_ce"]
def test_label_encoder_cat_cols(dataframes):
    """Check LabelEncoder with default arguments on the fixture frames.

    The encoded frame is expected to contain ``col`` followed by a ``col_le``
    column holding the codes ``[0, 0, 1]``.
    """
    for frame in dataframes:
        encoded = LabelEncoder().fit_transform(frame)

        expected_codes = np.array([0, 0, 1])
        assert allclose(encoded["col_le"], expected_codes)

        assert encoded.columns.tolist() == ["col", "col_le"]
def test_target_encoder(dataframes_targetencoder):
    """Check TargetEncoder on paired train/test frames.

    The ``dataframes_targetencoder`` fixture (defined outside this view) is
    iterated as ``(train, test)`` pairs. The encoder is fitted on the train
    frame and then applied to the test frame. In both phases the ``*_te``
    columns must be appended to the output while the input frame's columns
    stay unchanged.
    """
    feature_columns = ["col1", "col2"]
    encoded_columns = ["col1_te", "col2_te"]

    for train, test in dataframes_targetencoder:
        splitter = KFold(n_splits=2, shuffle=False)
        encoder = TargetEncoder(input_cols=["col1", "col2"], fold=splitter)

        # --- fit phase ---
        train_encoded = encoder.fit_transform(train)

        expected_train_col1 = np.array([0.0, 0.0, 0.0, 0.66666667, 1.0, 1.0, 1.0])
        assert allclose(train_encoded["col1_te"], expected_train_col1)

        train_columns = feature_columns + ["target"]
        assert train_encoded.columns.tolist() == train_columns + encoded_columns
        assert train.columns.tolist() == train_columns

        # --- transform phase on a frame without a target column ---
        test_encoded = encoder.transform(test)

        expected_test_col1 = np.array([0.333333, 0.833333])
        assert allclose(test_encoded["col1_te"], expected_test_col1)
        expected_test_col2 = np.array([0.5, 0.5])
        assert allclose(test_encoded["col2_te"], expected_test_col2)

        assert test_encoded.columns.tolist() == feature_columns + encoded_columns
        assert test.columns.tolist() == feature_columns
def test_lambda_encoder(dataframes):
    """Check LambdaEncoder applying ``x + 1`` with ``fillna=0``.

    For every frame from the ``dataframes`` fixture (defined outside this
    view), the encoded frame is expected to contain ``col1`` followed by a
    ``col1_lmd`` column holding ``[2, 3, 4]``.

    The diagnostics that used to be printed unconditionally (values, dtype
    and container type of the encoded column) are attached to the value
    assertion instead, so they appear only when the assertion fails.
    """
    for df in dataframes:
        encoder = LambdaEncoder(lambda x: x + 1, fillna=0)
        df_encoded = encoder.fit_transform(df)

        assert df_encoded.columns.tolist() == ["col1", "col1_lmd"]

        actual = df_encoded["col1_lmd"]
        expected = np.array([2, 3, 4])
        # The message is only evaluated on failure.
        assert allclose(actual, expected), (
            f"col1_lmd mismatch: values={actual.values!r}, "
            f"dtype={actual.dtype}, type={type(actual)}, "
            f"expected={expected!r}"
        )
def test_label_encoder_with_missing_values():
    """Check that LabelEncoder encodes missing values as -1.

    With ``sort_category=True`` the labels ``b, a, c`` are expected to
    receive the codes ``1, 0, 2``, and both NaN entries the code ``-1``.
    """
    frame = pd.DataFrame({"col": ["b", "a", "c", np.nan, np.nan]})

    encoded = LabelEncoder(sort_category=True).fit_transform(frame)

    expected_codes = np.array([1, 0, 2, -1, -1])
    assert allclose(encoded["col_le"], expected_codes)
    assert encoded.columns.tolist() == ["col", "col_le"]
def test_label_encoder_sort_category_before_factorize():
    """Check that ``sort_category=True`` codes follow sorted label order.

    ``"b"`` appears first in the data but is expected to receive code 1,
    while ``"a"`` receives code 0.
    """
    frame = pd.DataFrame({"col": ["b", "b", "a"]})

    encoded = LabelEncoder(sort_category=True).fit_transform(frame)

    expected_codes = np.array([1, 1, 0])
    assert allclose(encoded["col_le"], expected_codes)
    assert encoded.columns.tolist() == ["col", "col_le"]
def test_count_encoder_with_num_cols(dataframes_num):
    """Check CountEncoder on explicitly selected columns.

    For every frame from the ``dataframes_num`` fixture (defined outside this
    view), ``col1`` and ``col2`` are encoded. The ``*_ce`` columns must be
    appended after the original columns, and ``col1_ce`` must hold the
    expected values.
    """
    original_columns = ["col1", "col2", "target"]
    count_columns = ["col1_ce", "col2_ce"]

    for frame in dataframes_num:
        encoder = CountEncoder(input_cols=["col1", "col2"])
        encoded = encoder.fit_transform(frame)

        expected_col1 = np.array([3, 3, 3, 4, 4, 4, 4])
        assert allclose(encoded["col1_ce"], expected_col1)

        assert encoded.columns.tolist() == original_columns + count_columns