Exemplo n.º 1
0
def test_logratio_error_if_numerator_probability_zero():
    """``log_ratio`` fitting must raise when the numerator probability is zero.

    Every row with ``var_A == "A"`` has ``target == 0`` in the data below, so
    that category has no positive observations. Only applies to ``log_ratio``.
    """
    data = pd.DataFrame(
        {
            "var_A": ["A"] * 6 + ["B"] * 10 + ["C"] * 4,
            "var_B": ["A"] * 10 + ["B"] * 6 + ["C"] * 4,
            "target": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0],
        }
    )
    encoder = PRatioEncoder(encoding_method="log_ratio")

    # Keep only the call under test inside the context manager so that a
    # ValueError raised while building the fixture cannot make the test pass.
    with pytest.raises(ValueError):
        encoder.fit(data[["var_A", "var_B"]], data["target"])
Exemplo n.º 2
0
def test_ratio_error_if_denominator_probability_zero():
    """``ratio`` fitting must raise when the denominator probability is zero.

    Every row with ``var_A == "A"`` has ``target == 1`` in the data below, so
    that category has no negative observations.
    """
    data = pd.DataFrame(
        {
            "var_A": ["A"] * 6 + ["B"] * 10 + ["C"] * 4,
            "var_B": ["A"] * 10 + ["B"] * 6 + ["C"] * 4,
            "target": [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0],
        }
    )
    encoder = PRatioEncoder(encoding_method="ratio")

    # Keep only the call under test inside the context manager so that a
    # ValueError raised while building the fixture cannot make the test pass.
    with pytest.raises(ValueError):
        encoder.fit(data[["var_A", "var_B"]], data["target"])
def test_ratio_with_one_variable(df_enc):
    """Encode a single variable (``var_A``) with the ``ratio`` method.

    Checks the init parameters, the fitted attributes and the transformed
    frame; ``var_B`` is passed through because it is not in ``variables``.
    """
    features = ["var_A", "var_B"]
    # Expected ratio per category of var_A.
    ratio_a, ratio_b, ratio_c = 0.49999999999999994, 0.25, 1.0

    encoder = PRatioEncoder(encoding_method="ratio", variables=["var_A"])
    encoder.fit(df_enc[features], df_enc["target"])
    X = encoder.transform(df_enc[features])

    # Expected output: 6 rows of "A", 10 rows of "B", 4 rows of "C".
    expected = df_enc.copy()
    expected["var_A"] = [ratio_a] * 6 + [ratio_b] * 10 + [ratio_c] * 4

    # init params
    assert encoder.encoding_method == "ratio"
    assert encoder.variables == ["var_A"]
    # fit params
    assert encoder.variables_ == ["var_A"]
    expected_mapping = {"var_A": {"A": ratio_a, "B": ratio_b, "C": ratio_c}}
    assert encoder.encoder_dict_ == expected_mapping
    assert encoder.n_features_in_ == 2
    # transform params
    pd.testing.assert_frame_equal(X, expected[features])
Exemplo n.º 4
0
def test_logratio_and_automaticcally_select_variables(df_enc):
    """Encode with ``log_ratio`` while letting the encoder pick the variables.

    With ``variables=None`` both ``var_A`` and ``var_B`` are expected to be
    selected and encoded. Also checks that an unknown ``encoding_method`` is
    rejected.
    """
    features = ["var_A", "var_B"]
    # The three distinct log-ratio values that appear in the expected output.
    log_half = -0.6931471805599454
    log_quarter = -1.3862943611198906
    log_one = 0.0

    encoder = PRatioEncoder(encoding_method="log_ratio", variables=None)
    encoder.fit(df_enc[features], df_enc["target"])
    X = encoder.transform(df_enc[features])

    # Expected output, built from the run lengths of each category.
    expected = df_enc.copy()
    expected["var_A"] = [log_half] * 6 + [log_quarter] * 10 + [log_one] * 4
    expected["var_B"] = [log_quarter] * 10 + [log_half] * 6 + [log_one] * 4

    # init params
    assert encoder.encoding_method == "log_ratio"
    assert encoder.variables is None
    # fit params
    assert encoder.variables_ == ["var_A", "var_B"]
    expected_mapping = {
        "var_A": {"A": log_half, "B": log_quarter, "C": log_one},
        "var_B": {"A": log_quarter, "B": log_half, "C": log_one},
    }
    assert encoder.encoder_dict_ == expected_mapping
    assert encoder.n_features_in_ == 2
    # transform params
    pd.testing.assert_frame_equal(X, expected[features])

    # an unsupported encoding method must be rejected
    with pytest.raises(ValueError):
        PRatioEncoder(encoding_method="other")
Exemplo n.º 5
0
def test_error_if_rare_labels_not_permitted_value():
    """An unsupported value for ``errors`` must raise ``ValueError``."""
    invalid_option = "empanada"
    with pytest.raises(ValueError):
        PRatioEncoder(errors=invalid_option)
Exemplo n.º 6
0
def test_warn_if_transform_df_contains_categories_not_seen_in_fit(df_enc, df_enc_rare):
    """Unseen categories at transform time warn or raise depending on ``errors``.

    The encoder is fitted on ``df_enc`` and applied to ``df_enc_rare``; the
    same message is expected for the warning (``errors="ignore"``) and for the
    exception (``errors="raise"``).
    """
    features = ["var_A", "var_B"]
    expected_msg = (
        "During the encoding, NaN values were introduced in the feature(s) var_A."
    )

    # errors="ignore": a UserWarning is expected
    with pytest.warns(UserWarning) as caught_warnings:
        encoder = PRatioEncoder(errors="ignore")
        encoder.fit(df_enc[features], df_enc["target"])
        encoder.transform(df_enc_rare[features])

    # exactly one warning, carrying the expected text
    assert len(caught_warnings) == 1
    assert caught_warnings[0].message.args[0] == expected_msg

    # errors="raise": a ValueError is expected
    with pytest.raises(ValueError) as raised:
        encoder = PRatioEncoder(errors="raise")
        encoder.fit(df_enc[features], df_enc["target"])
        encoder.transform(df_enc_rare[features])

    # the exception carries the same text as the warning
    assert str(raised.value) == expected_msg
Exemplo n.º 7
0
def test_logratio_on_numerical_variables(df_enc_numeric):
    """Encode numerically-labelled variables with ``ignore_format=True``.

    Checks the init parameters, the fitted mapping (keyed by the integer
    categories 1, 2 and 3) and the transformed frame. Also checks that an
    unknown ``encoding_method`` is rejected.
    """
    features = ["var_A", "var_B"]
    # The three distinct log-ratio values that appear in the expected output.
    log_half = -0.6931471805599454
    log_quarter = -1.3862943611198906
    log_one = 0.0

    encoder = PRatioEncoder(
        encoding_method="log_ratio", variables=None, ignore_format=True
    )
    encoder.fit(df_enc_numeric[features], df_enc_numeric["target"])
    X = encoder.transform(df_enc_numeric[features])

    # Expected output, built from the run lengths of each category.
    expected = df_enc_numeric.copy()
    expected["var_A"] = [log_half] * 6 + [log_quarter] * 10 + [log_one] * 4
    expected["var_B"] = [log_quarter] * 10 + [log_half] * 6 + [log_one] * 4

    # init params
    assert encoder.encoding_method == "log_ratio"
    assert encoder.variables is None
    # fit params
    assert encoder.variables_ == ["var_A", "var_B"]
    expected_mapping = {
        "var_A": {1: log_half, 2: log_quarter, 3: log_one},
        "var_B": {1: log_quarter, 2: log_half, 3: log_one},
    }
    assert encoder.encoder_dict_ == expected_mapping
    assert encoder.n_features_in_ == 2
    # transform params
    pd.testing.assert_frame_equal(X, expected[features])

    # an unsupported encoding method must be rejected
    with pytest.raises(ValueError):
        PRatioEncoder(encoding_method="other")
Exemplo n.º 8
0
def test_error_if_df_contains_na_in_transform(df_enc, df_enc_na):
    """``transform`` must raise ``ValueError`` when the input contains NA.

    The encoder is fitted on ``df_enc`` and then applied to ``df_enc_na``.
    """
    encoder = PRatioEncoder(encoding_method="ratio")
    encoder.fit(df_enc[["var_A", "var_B"]], df_enc["target"])

    # Fitting happens outside the context manager: a ValueError raised by
    # fit() must fail the test instead of being mistaken for the expected
    # error from transform().
    with pytest.raises(ValueError):
        encoder.transform(df_enc_na)
def test_raises_non_fitted_error(df_enc):
    """Calling ``transform`` before ``fit`` must raise ``NotFittedError``."""
    encoder = PRatioEncoder()

    # Only the transform call is guarded, so the assertion is about
    # transform() itself and not about constructing the encoder.
    with pytest.raises(NotFittedError):
        encoder.transform(df_enc)
# encoding
# One test case is generated per (estimator, check) pair for the encoders
# listed below. NOTE(review): parametrize_with_checks is presumably
# sklearn.utils.estimator_checks.parametrize_with_checks - confirm against the
# file's imports.
# Every encoder is created with ignore_format=True. NOTE(review): presumably
# so that the numerical data used by the checks is accepted - confirm.
@parametrize_with_checks([
    CountFrequencyEncoder(ignore_format=True),
    DecisionTreeEncoder(regression=False, ignore_format=True),
    MeanEncoder(ignore_format=True),
    OneHotEncoder(ignore_format=True),
    OrdinalEncoder(ignore_format=True),
    # NOTE(review): the extreme tol / n_categories values look chosen so that
    # no category is grouped as rare during the checks - confirm.
    RareLabelEncoder(
        tol=0.00000000001,
        n_categories=100000000000,
        replace_with=10,
        ignore_format=True,
    ),
    WoEEncoder(ignore_format=True),
    PRatioEncoder(ignore_format=True),
])
def test_sklearn_compatible_encoder(estimator, check):
    """Run one generated compatibility ``check`` against one encoder instance."""
    check(estimator)


# outliers
# One test case is generated per (estimator, check) pair for the outlier
# handlers listed below. NOTE(review): parametrize_with_checks is presumably
# sklearn.utils.estimator_checks.parametrize_with_checks - confirm against the
# file's imports.
@parametrize_with_checks([
    # NOTE(review): the key "0" presumably names the first column of the data
    # generated by the checks - confirm.
    ArbitraryOutlierCapper(max_capping_dict={"0": 10}),
    OutlierTrimmer(),
    Winsorizer(),
])
def test_sklearn_compatible_outliers(estimator, check):
    """Run one generated compatibility ``check`` against one outlier handler."""
    check(estimator)