示例#1
0
def test_label_encoder_with_unseen_values():
    """Exercise ``LabelEncoder`` on a test category absent from training.

    The encoder is fitted on ``["b", "a", "c", NaN, NaN]`` and then applied
    to ``["x"]``. For each ``unseen`` strategy the training codes must be
    ``[1, 0, 2, -1, -1]``; the unseen value must come out as ``-1`` for
    ``"minus_one"`` and as ``3`` for ``"n_unique"``. ``output_suffix`` is
    the empty string and every result is asserted to contain only ``"col"``.
    """
    train_frame = pd.DataFrame({"col": ["b", "a", "c", np.nan, np.nan]})
    test_frame = pd.DataFrame({"col": ["x"]})

    # (unseen strategy, code expected for the unseen test value)
    scenarios = (("minus_one", -1), ("n_unique", 3))

    for strategy, unseen_code in scenarios:
        encoder = LabelEncoder(
            sort_category=True,
            output_suffix="",
            unseen=strategy,
        )

        train_encoded = encoder.fit_transform(train_frame)
        assert allclose(train_encoded["col"], np.array([1, 0, 2, -1, -1]))
        assert train_encoded.columns.tolist() == ["col"]

        test_encoded = encoder.transform(test_frame)
        assert allclose(test_encoded["col"], np.array([unseen_code]))
        assert test_encoded.columns.tolist() == ["col"]
示例#2
0
def test_aggregation(dataframes):
    """Verify ``aggregation`` for a ``max`` of columns ``a`` and ``c`` grouped by ``b``.

    For every frame supplied by the ``dataframes`` fixture (defined outside
    this view), the returned column list must be exactly the two
    ``agg_max_*_grpby_b`` names, both columns must be present in the
    returned frame, and their values must match the expected arrays.
    """
    for frame in dataframes:
        result_df, created_cols = aggregation(frame, "b", ["a", "c"], ["max"])

        # The reported new-column names follow the agg_<method>_<value>_grpby_<key> form.
        assert created_cols == ["agg_max_a_grpby_b", "agg_max_c_grpby_b"]

        result_columns = result_df.columns
        assert "agg_max_a_grpby_b" in result_columns
        assert "agg_max_c_grpby_b" in result_columns

        expected_max_a = np.array([3, 3, 3, 5, 5])
        assert allclose(result_df["agg_max_a_grpby_b"], expected_max_a)
        expected_max_c = np.array([1, 1, 1, 1, 1])
        assert allclose(result_df["agg_max_c_grpby_b"], expected_max_c)
def test_target_encoder_with_categorical_values(dataframes):
    """Check ``TargetEncoder.fit_transform`` with a two-split, unshuffled ``KFold``.

    For each frame from the ``dataframes`` fixture (defined outside this
    view) the test asserts that:

    * the encoder still reports two splits on its ``fold`` attribute,
    * one internal target encoder is registered per input column,
    * ``col1_te`` holds the expected encoded values,
    * the input frame's columns are unchanged, and
    * the encoded frame gains ``col1_te`` and ``col2_te`` after the
      original columns.
    """
    source_columns = ["col1", "col2", "target"]

    for df in dataframes:
        splitter = KFold(n_splits=2, shuffle=False)
        encoder = TargetEncoder(input_cols=["col1", "col2"], fold=splitter)
        df_encoded = encoder.fit_transform(df)

        assert encoder.fold.get_n_splits() == 2

        fitted_names = sorted(encoder._target_encoders.keys())
        assert fitted_names == ["col1", "col2"]

        expected_col1_te = np.array([0.0, 0.0, 0.0, 0.66666667, 1.0, 1.0, 1.0])
        assert allclose(df_encoded["col1_te"], expected_col1_te)

        # The input frame must come through fit_transform without new columns.
        assert df.columns.tolist() == source_columns
        assert df_encoded.columns.tolist() == source_columns + [
            "col1_te",
            "col2_te",
        ]
示例#4
0
def test_pipeline(dataframes):
    """Check that ``Pipeline.transform`` yields the columns added by both steps.

    Two throwaway transformers each add one constant column (``new1`` = 1,
    ``new2`` = 2). After running the pipeline, the result must have the
    columns ``var1``, ``new1``, ``new2`` in that order with the expected
    constant values.
    """

    class DummyTransformer1(TransformerMixin):
        def transform(self, input_df: XDataFrame) -> XDataFrame:
            # Writes into the frame it was given and hands the same object back.
            input_df["new1"] = 1
            return input_df

    class DummyTransformer2(TransformerMixin):
        def transform(self, input_df: XDataFrame) -> XDataFrame:
            input_df["new2"] = 2
            return input_df

    for frame in dataframes:
        steps = [DummyTransformer1(), DummyTransformer2()]
        transformed = Pipeline(steps).transform(frame)

        assert transformed.columns.tolist() == ["var1", "new1", "new2"]

        ones = np.array([1, 1, 1])
        assert allclose(transformed["new1"], ones)
        twos = np.array([2, 2, 2])
        assert allclose(transformed["new2"], twos)
示例#5
0
def test_count_encoder_with_cat_cols(dataframes):
    """Check ``CountEncoder`` with default arguments.

    For each frame from the ``dataframes`` fixture (defined outside this
    view), ``col1_ce`` must equal ``[1, 2, 2, 1]`` and the encoded frame
    must consist of exactly ``col1`` followed by ``col1_ce``.
    """
    for frame in dataframes:
        encoded = CountEncoder().fit_transform(frame)

        expected_counts = np.array([1, 2, 2, 1])
        assert allclose(encoded["col1_ce"], expected_counts)

        column_names = encoded.columns.tolist()
        assert column_names == ["col1", "col1_ce"]
示例#6
0
def test_label_encoder_cat_cols(dataframes):
    """Check ``LabelEncoder`` with default arguments.

    For each frame from the ``dataframes`` fixture (defined outside this
    view), ``col_le`` must equal ``[0, 0, 1]`` and the encoded frame must
    consist of exactly ``col`` followed by ``col_le``.
    """
    for frame in dataframes:
        encoded = LabelEncoder().fit_transform(frame)

        expected_codes = np.array([0, 0, 1])
        assert allclose(encoded["col_le"], expected_codes)

        column_names = encoded.columns.tolist()
        assert column_names == ["col", "col_le"]
def test_target_encoder(dataframes_targetencoder):
    """Check ``TargetEncoder`` on a train frame and then on a separate test frame.

    The ``dataframes_targetencoder`` fixture (defined outside this view)
    yields ``(train, test)`` pairs. With a two-split, unshuffled ``KFold``:

    * ``fit_transform`` on the train frame must give the expected
      ``col1_te`` values and append ``col1_te`` / ``col2_te`` after the
      original columns, while the train frame keeps its original columns;
    * ``transform`` on the test frame must give the expected ``col1_te``
      and ``col2_te`` values and append both columns, while the test frame
      keeps its original columns.
    """
    feature_cols = ["col1", "col2"]
    encoded_cols = ["col1_te", "col2_te"]

    for train_df, holdout_df in dataframes_targetencoder:
        splitter = KFold(n_splits=2, shuffle=False)
        encoder = TargetEncoder(input_cols=["col1", "col2"], fold=splitter)

        # --- fit on the training frame -------------------------------------
        train_encoded = encoder.fit_transform(train_df)

        expected_train_col1 = np.array([0.0, 0.0, 0.0, 0.66666667, 1.0, 1.0, 1.0])
        assert allclose(train_encoded["col1_te"], expected_train_col1)
        assert train_encoded.columns.tolist() == (
            feature_cols + ["target"] + encoded_cols
        )
        assert train_df.columns.tolist() == feature_cols + ["target"]

        # --- apply the fitted encoder to the held-out frame ----------------
        holdout_encoded = encoder.transform(holdout_df)

        expected_holdout_col1 = np.array([0.333333, 0.833333])
        assert allclose(holdout_encoded["col1_te"], expected_holdout_col1)
        expected_holdout_col2 = np.array([0.5, 0.5])
        assert allclose(holdout_encoded["col2_te"], expected_holdout_col2)
        assert holdout_encoded.columns.tolist() == feature_cols + encoded_cols
        assert holdout_df.columns.tolist() == feature_cols
示例#8
0
def test_lambda_encoder(dataframes):
    """Check ``LambdaEncoder`` applying ``x + 1`` with ``fillna=0``.

    For each frame from the ``dataframes`` fixture (defined outside this
    view), the encoded frame must consist of exactly ``col1`` followed by
    ``col1_lmd``, and ``col1_lmd`` must equal ``[2, 3, 4]``.

    The schema assertion runs before the column is indexed, so a missing
    ``col1_lmd`` column is reported by that assertion rather than as a
    ``KeyError``. Diagnostic detail (values and dtype) is attached to the
    value assertion's failure message instead of being printed on every run.
    """
    for df in dataframes:
        encoder = LambdaEncoder(lambda x: x + 1, fillna=0)
        df_encoded = encoder.fit_transform(df)

        assert df_encoded.columns.tolist() == ["col1", "col1_lmd"]

        encoded_col = df_encoded["col1_lmd"]
        expected = np.array([2, 3, 4])
        # The message is only built when the assertion fails.
        assert allclose(encoded_col, expected), (
            f"col1_lmd mismatch: got {encoded_col.values!r} "
            f"(dtype={encoded_col.dtype}, type={type(encoded_col)}), "
            f"expected {expected!r}"
        )
示例#9
0
def test_label_encoder_with_missing_values():
    """Check ``LabelEncoder(sort_category=True)`` on a column containing NaN.

    Fitting on ``["b", "a", "c", NaN, NaN]`` must give ``col_le`` equal to
    ``[1, 0, 2, -1, -1]``, and the encoded frame must consist of exactly
    ``col`` followed by ``col_le``.
    """
    raw_values = ["b", "a", "c", np.nan, np.nan]
    frame = pd.DataFrame({"col": raw_values})

    encoded = LabelEncoder(sort_category=True).fit_transform(frame)

    expected_codes = np.array([1, 0, 2, -1, -1])
    assert allclose(encoded["col_le"], expected_codes)

    column_names = encoded.columns.tolist()
    assert column_names == ["col", "col_le"]
示例#10
0
def test_label_encoder_sort_category_before_factorize():
    """Check that ``sort_category=True`` gives codes that do not follow first-appearance order.

    Fitting on ``["b", "b", "a"]`` must give ``col_le`` equal to
    ``[1, 1, 0]`` (``"a"`` gets 0 even though ``"b"`` appears first), and
    the encoded frame must consist of exactly ``col`` followed by ``col_le``.
    """
    raw_values = ["b", "b", "a"]
    frame = pd.DataFrame({"col": raw_values})

    encoded = LabelEncoder(sort_category=True).fit_transform(frame)

    expected_codes = np.array([1, 1, 0])
    assert allclose(encoded["col_le"], expected_codes)

    column_names = encoded.columns.tolist()
    assert column_names == ["col", "col_le"]
示例#11
0
def test_count_encoder_with_num_cols(dataframes_num):
    """Check ``CountEncoder`` restricted to ``col1`` and ``col2``.

    For each frame from the ``dataframes_num`` fixture (defined outside
    this view), ``col1_ce`` must equal ``[3, 3, 3, 4, 4, 4, 4]`` and the
    encoded frame must list the original columns followed by ``col1_ce``
    and ``col2_ce``.
    """
    for frame in dataframes_num:
        counter = CountEncoder(input_cols=["col1", "col2"])
        encoded = counter.fit_transform(frame)

        expected_col1_counts = np.array([3, 3, 3, 4, 4, 4, 4])
        assert allclose(encoded["col1_ce"], expected_col1_counts)

        column_names = encoded.columns.tolist()
        assert column_names == ["col1", "col2", "target", "col1_ce", "col2_ce"]