def test_numerical_variables_roc_auc(df_test):
    """Select among all-numerical features with ROC-AUC and 5 equal-width bins.

    Fits the selector on every column of the ``df_test`` fixture, then checks
    that the init parameters are stored as given, that the fitted attributes
    classify every column as numerical, and that ``transform`` keeps only the
    expected columns.
    """
    X, y = df_test

    scalar_params = {
        "scoring": "roc_auc_score",
        "threshold": 0.6,
        "bins": 5,
        "strategy": "equal_width",
        "cv": 3,
        "random_state": 1,
    }
    selector = SelectByTargetMeanPerformance(variables=None, **scalar_params)
    selector.fit(X, y)

    all_columns = list(X.columns)
    kept_columns = ["var_0", "var_4", "var_6", "var_7", "var_9"]
    dropped_columns = [
        "var_1",
        "var_2",
        "var_3",
        "var_5",
        "var_8",
        "var_10",
        "var_11",
    ]
    expected_frame = X[kept_columns]

    # Reference ROC-AUC per feature, rounded to 3 decimals. The comparison
    # against ``feature_performance_`` is currently disabled in this test:
    #   var_0 0.628 | var_1 0.548 | var_2 0.513 | var_3 0.474
    #   var_4 0.973 | var_5 0.496 | var_6 0.97  | var_7 0.992
    #   var_8 0.536 | var_9 0.931 | var_10 0.466 | var_11 0.517

    # init parameters are stored as passed
    assert selector.variables is None
    for param_name, param_value in scalar_params.items():
        assert getattr(selector, param_name) == param_value

    # attributes learned during fit
    assert selector.variables_ == all_columns
    assert selector.variables_categorical_ == []
    assert selector.variables_numerical_ == all_columns
    assert selector.features_to_drop_ == dropped_columns

    # transform output
    pd.testing.assert_frame_equal(selector.transform(X), expected_frame)
示例#2
0
def test_categorical_variables_roc_auc(df_test_num_cat):
    """Select between two categorical features using ROC-AUC.

    Only ``var_A`` and ``var_B`` of the ``df_test_num_cat`` fixture are used.
    With a 0.78 threshold the test expects ``var_B`` to be dropped and
    ``var_A`` to be the single column returned by ``transform``.
    """
    X, y = df_test_num_cat
    X = X[["var_A", "var_B"]]

    scalar_params = {
        "scoring": "roc_auc_score",
        "threshold": 0.78,
        "cv": 2,
        "random_state": 1,
    }
    selector = SelectByTargetMeanPerformance(variables=None, **scalar_params)
    selector.fit(X, y)

    column_names = list(X.columns)
    expected_frame = X["var_A"].to_frame()

    # Reference ROC-AUC (rounded): var_A 0.841, var_B 0.776. The comparison
    # against ``feature_performance_`` is currently disabled in this test.

    # init parameters; ``variables`` is expected to hold the column names here
    assert selector.variables == column_names
    for param_name, param_value in scalar_params.items():
        assert getattr(selector, param_name) == param_value

    # attributes learned during fit
    assert selector.variables_categorical_ == column_names
    assert selector.variables_numerical_ == []
    assert selector.features_to_drop_ == ["var_B"]

    # transform output
    pd.testing.assert_frame_equal(selector.transform(X), expected_frame)
def test_raises_error_if_evaluating_single_variable_and_threshold_is_None(
        df_test):
    """Fitting on a single variable with ``threshold=None`` raises ValueError."""
    features, target = df_test
    selector = SelectByTargetMeanPerformance(threshold=None, variables=["var_1"])

    with pytest.raises(ValueError):
        selector.fit(features, target)
def test_regression():
    """Check selection on the regression toy data with two scoring metrics.

    The first run uses ``scoring="r2"`` with ``threshold=None``; the second
    uses ``scoring="neg_root_mean_squared_error"`` with a threshold of -0.2.
    Each run verifies the dropped features, the per-feature scores and the
    frame returned by ``transform``.

    Scores are compared through ``pytest.approx``: they are floating-point
    results of cross-validation, so bit-for-bit equality would make the test
    depend on platform and library rounding.
    """

    X, y = df_regression()

    sel = SelectByTargetMeanPerformance(
        variables=None,
        bins=2,
        scoring="r2",
        regression=True,
        cv=2,
        strategy="equal_width",
        threshold=None,
    )

    sel.fit(X, y)

    # expected result
    Xtransformed = X[["cat_var_A", "cat_var_B", "num_var_A"]]
    performance_dict = {
        "cat_var_A": 1.0,
        "cat_var_B": 0.8533333333333333,
        "num_var_A": 0.8,
        "num_var_B": 0.512,
    }

    assert sel.features_to_drop_ == ["num_var_B"]
    # tolerant comparison: keys must match, values may differ by rounding noise
    assert sel.feature_performance_ == pytest.approx(performance_dict)
    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)

    # second scenario starts from a fresh copy of the data
    X, y = df_regression()

    sel = SelectByTargetMeanPerformance(
        variables=["cat_var_A", "cat_var_B", "num_var_A", "num_var_B"],
        bins=2,
        scoring="neg_root_mean_squared_error",
        regression=True,
        cv=2,
        strategy="equal_width",
        threshold=-0.2,
    )

    sel.fit(X, y)

    # expected result
    Xtransformed = X["cat_var_A"].to_frame()
    performance_dict = {
        "cat_var_A": 0.0,
        "cat_var_B": -0.42817441928883765,
        "num_var_A": -0.5,
        "num_var_B": -0.7810249675906654,
    }

    assert sel.features_to_drop_ == ["cat_var_B", "num_var_A", "num_var_B"]
    assert sel.feature_performance_ == pytest.approx(performance_dict)
    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)
def test_classification():
    """Check selection on the classification toy data with two metrics.

    The first run uses ``scoring="accuracy"`` with ``threshold=None`` and
    equal-width bins; the second uses ``scoring="roc_auc"`` with a 0.9
    threshold and equal-frequency bins. Both runs are fitted on the same
    ``X`` and ``y``.

    Scores are compared through ``pytest.approx`` because they are
    floating-point cross-validation results; exact equality would be brittle.
    """

    X, y = df_classification()

    sel = SelectByTargetMeanPerformance(
        variables=None,
        scoring="accuracy",
        threshold=None,
        bins=2,
        strategy="equal_width",
        cv=2,
    )

    sel.fit(X, y)

    # expected result
    Xtransformed = X[["cat_var_A", "num_var_A"]]

    performance_dict = {
        "cat_var_A": 1.0,
        "cat_var_B": 0.8,
        "num_var_A": 1.0,
        "num_var_B": 0.8,
    }
    features_to_drop = ["cat_var_B", "num_var_B"]

    assert sel.features_to_drop_ == features_to_drop
    # tolerant comparison: keys must match, values may differ by rounding noise
    assert sel.feature_performance_ == pytest.approx(performance_dict)
    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)

    sel = SelectByTargetMeanPerformance(
        variables=["cat_var_A", "cat_var_B", "num_var_A", "num_var_B"],
        scoring="roc_auc",
        threshold=0.9,
        bins=2,
        strategy="equal_frequency",
        cv=2,
    )

    sel.fit(X, y)

    # expected result
    Xtransformed = X[["cat_var_A", "cat_var_B", "num_var_A"]]

    performance_dict = {
        "cat_var_A": 1.0,
        "cat_var_B": 0.92,
        "num_var_A": 1.0,
        "num_var_B": 0.8,
    }
    features_to_drop = ["num_var_B"]

    assert sel.features_to_drop_ == features_to_drop
    assert sel.feature_performance_ == pytest.approx(performance_dict)
    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)
def test_error_wrong_params():
    """Each invalid init argument below must be rejected with ValueError."""
    invalid_kwargs = (
        {"scoring": "mean_squared"},
        {"scoring": 1},
        {"threshold": "hola"},
        {"bins": "hola"},
        {"strategy": "hola"},
    )
    for kwargs in invalid_kwargs:
        with pytest.raises(ValueError):
            SelectByTargetMeanPerformance(**kwargs)
def test_test_selector_with_one_variable():
    """Check the selector when it evaluates a single variable.

    Both runs use ``neg_root_mean_squared_error`` with a threshold of -0.2 on
    the regression toy data. ``cat_var_A`` is expected to be kept, so
    ``transform`` returns the frame unchanged; ``cat_var_B`` is expected to be
    dropped.

    Scores are compared through ``pytest.approx`` because they are
    floating-point cross-validation results; exact equality would be brittle.
    """

    X, y = df_regression()

    sel = SelectByTargetMeanPerformance(
        variables=["cat_var_A"],
        bins=2,
        scoring="neg_root_mean_squared_error",
        regression=True,
        cv=2,
        strategy="equal_width",
        threshold=-0.2,
    )

    sel.fit(X, y)

    # expected result
    performance_dict = {"cat_var_A": 0.0}

    assert sel.features_to_drop_ == []
    # tolerant comparison: keys must match, values may differ by rounding noise
    assert sel.feature_performance_ == pytest.approx(performance_dict)
    pd.testing.assert_frame_equal(sel.transform(X), X)

    # second scenario starts from a fresh copy of the data
    X, y = df_regression()

    sel = SelectByTargetMeanPerformance(
        variables=["cat_var_B"],
        bins=2,
        scoring="neg_root_mean_squared_error",
        regression=True,
        cv=2,
        strategy="equal_width",
        threshold=-0.2,
    )

    sel.fit(X, y)

    # expected result
    Xtransformed = X.drop(columns=["cat_var_B"])
    performance_dict = {"cat_var_B": -0.42817441928883765}

    assert sel.features_to_drop_ == ["cat_var_B"]
    assert sel.feature_performance_ == pytest.approx(performance_dict)
    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)
def test_df_cat_and_num_variables_r2(df_test_num_cat):
    """Select among mixed categorical/numerical features using ``r2_score``.

    Uses 3 equal-frequency bins and a 0.1 threshold. The test expects the
    categorical ``var_A``/``var_B`` to be kept and the numerical
    ``var_C``/``var_D`` to be dropped.
    """
    X, y = df_test_num_cat

    scalar_params = {
        "scoring": "r2_score",
        "threshold": 0.1,
        "bins": 3,
        "strategy": "equal_frequency",
        "cv": 2,
        "random_state": 1,
    }
    selector = SelectByTargetMeanPerformance(variables=None, **scalar_params)
    selector.fit(X, y)

    categorical = ["var_A", "var_B"]
    numerical = ["var_C", "var_D"]
    expected_frame = X[categorical]

    # Reference r2 per feature (rounded): var_A 0.392, var_B 0.250,
    # var_C -0.004, var_D -0.052. The comparison against
    # ``feature_performance_`` is currently disabled in this test.

    # init parameters are stored as passed
    assert selector.variables is None
    for param_name, param_value in scalar_params.items():
        assert getattr(selector, param_name) == param_value

    # attributes learned during fit
    assert selector.variables_ == list(X.columns)
    assert selector.variables_categorical_ == categorical
    assert selector.variables_numerical_ == numerical
    assert selector.features_to_drop_ == numerical

    # transform output
    pd.testing.assert_frame_equal(selector.transform(X), expected_frame)
示例#9
0
def test_df_cat_and_num_variables_roc_auc(df_test_num_cat):
    """Select among mixed categorical/numerical features using ROC-AUC.

    Uses 3 equal-width bins and a 0.6 threshold. Besides the stored
    parameters and fitted attributes, every score in ``feature_performance_``
    is compared, rounded to 3 decimals, with its reference value.
    """
    X, y = df_test_num_cat

    selector = SelectByTargetMeanPerformance(
        variables=None,
        scoring="roc_auc_score",
        threshold=0.6,
        bins=3,
        strategy="equal_width",
        cv=2,
        random_state=1,
    )
    selector.fit(X, y)

    categorical = ["var_A", "var_B"]
    numerical = ["var_C", "var_D"]
    expected_frame = X[categorical]
    reference_scores = {
        "var_A": 0.841,
        "var_B": 0.776,
        "var_C": 0.481,
        "var_D": 0.496,
    }

    # init parameters; ``variables`` is expected to hold the column names here
    assert selector.variables == list(X.columns)
    assert selector.scoring == "roc_auc_score"
    assert selector.threshold == 0.60
    assert selector.cv == 2
    assert selector.random_state == 1

    # attributes learned during fit
    assert selector.variables_categorical_ == categorical
    assert selector.variables_numerical_ == numerical
    assert selector.features_to_drop_ == numerical
    for feature, score in selector.feature_performance_.items():
        assert np.round(score, 3) == reference_scores[feature]

    # transform output
    pd.testing.assert_frame_equal(selector.transform(X), expected_frame)
示例#10
0
def test_error_wrong_params():
    """Each invalid init argument must raise the exception paired with it."""
    invalid_cases = (
        (ValueError, {"scoring": "mean_squared"}),
        (ValueError, {"scoring": 1}),
        # roc_auc_score combined with a threshold of 0.4 is expected to fail
        (ValueError, {"scoring": "roc_auc_score", "threshold": 0.4}),
        (ValueError, {"threshold": -1}),
        (ValueError, {"threshold": "hola"}),
        (TypeError, {"bins": "hola"}),
        (ValueError, {"strategy": "hola"}),
        (ValueError, {"cv": "hola"}),
        (ValueError, {"cv": 1}),
    )
    for expected_error, kwargs in invalid_cases:
        with pytest.raises(expected_error):
            SelectByTargetMeanPerformance(**kwargs)
示例#11
0
def test_not_fitted_error(df_test):
    """Calling ``transform`` before ``fit`` must raise NotFittedError."""
    # The fixture yields an (X, y) pair; transform takes only the features,
    # so pass X rather than the whole tuple.
    X, _ = df_test
    transformer = SelectByTargetMeanPerformance()

    # Only the call under test sits inside the raises block.
    with pytest.raises(NotFittedError):
        transformer.transform(X)
示例#12
0
def test_error_if_fit_input_not_dataframe(df_test):
    """``fit`` must raise TypeError when X is a dict instead of a DataFrame."""
    # Pass a real target: calling ``fit`` with X alone can raise TypeError for
    # the missing ``y`` argument, which would let this test pass without ever
    # exercising the DataFrame check.
    _, y = df_test
    with pytest.raises(TypeError):
        SelectByTargetMeanPerformance().fit({"Name": ["Karthik"]}, y)
示例#13
0
def test_error_if_input_not_df(df_test):
    """``fit`` must raise TypeError when X is passed as a plain dict."""
    features, target = df_test
    with pytest.raises(TypeError):
        selector = SelectByTargetMeanPerformance()
        selector.fit(features.to_dict(), target)
示例#14
0
def test_error_if_y_not_passed(df_test):
    """``fit`` called with X only (no target) must raise TypeError."""
    features, _unused_target = df_test
    with pytest.raises(TypeError):
        selector = SelectByTargetMeanPerformance()
        selector.fit(features)
示例#15
0
    SelectBySingleFeaturePerformance,
    SelectByTargetMeanPerformance,
    SmartCorrelatedSelection,
)

# Single LogisticRegression instance shared by every model-based selector in
# the lists below. NOTE(review): C=0.0001 and max_iter=2 presumably keep the
# fits cheap rather than accurate — confirm.
_logreg = LogisticRegression(C=0.0001, max_iter=2, random_state=1)

# One pre-configured instance of each selector.
_estimators = [
    DropFeatures(features_to_drop=["0"]),
    DropConstantFeatures(missing_values="ignore"),
    DropDuplicateFeatures(),
    DropCorrelatedFeatures(),
    DropHighPSIFeatures(bins=5),
    SmartCorrelatedSelection(),
    SelectByShuffling(estimator=_logreg, scoring="accuracy"),
    SelectByTargetMeanPerformance(bins=3, regression=False),
    SelectBySingleFeaturePerformance(estimator=_logreg, scoring="accuracy"),
    RecursiveFeatureAddition(estimator=_logreg, scoring="accuracy"),
    RecursiveFeatureElimination(estimator=_logreg, scoring="accuracy", threshold=-100),
]

# Fresh instances of a subset of the selectors above, configured identically.
# NOTE(review): presumably the selectors that need more than one input
# feature — confirm where this list is consumed.
_multivariate_estimators = [
    DropDuplicateFeatures(),
    DropCorrelatedFeatures(),
    SmartCorrelatedSelection(),
    SelectByShuffling(estimator=_logreg, scoring="accuracy"),
    RecursiveFeatureAddition(estimator=_logreg, scoring="accuracy"),
    RecursiveFeatureElimination(estimator=_logreg, scoring="accuracy", threshold=-100),
]

_univariate_estimators = [
    RecursiveFeatureAddition,
    RecursiveFeatureElimination,
    SelectByShuffling,
    SelectBySingleFeaturePerformance,
    SelectByTargetMeanPerformance,
    SmartCorrelatedSelection,
)


@pytest.mark.parametrize(
    "Estimator",
    [
        DropFeatures(features_to_drop=["0"]),
        DropConstantFeatures(),
        DropDuplicateFeatures(),
        DropCorrelatedFeatures(),
        SmartCorrelatedSelection(),
        SelectByShuffling(RandomForestClassifier(random_state=1),
                          scoring="accuracy"),
        SelectBySingleFeaturePerformance(
            RandomForestClassifier(random_state=1), scoring="accuracy"),
        RecursiveFeatureAddition(RandomForestClassifier(random_state=1),
                                 scoring="accuracy"),
        RecursiveFeatureElimination(RandomForestClassifier(random_state=1),
                                    scoring="accuracy"),
        SelectByTargetMeanPerformance(scoring="r2_score", bins=3),
    ],
)
def test_all_transformers(Estimator):
    """Run ``check_estimator`` on each configured selector instance.

    ``Estimator`` receives one already-constructed object from the
    parametrize list above, not a class.
    """
    # A test must not return a value: pytest warns when a test function
    # returns something other than None. Failures surface as exceptions
    # raised by the check itself.
    check_estimator(Estimator)
    DropCorrelatedFeatures(),
    SmartCorrelatedSelection(),
    DropHighPSIFeatures(bins=5),
    SelectByShuffling(LogisticRegression(max_iter=2, random_state=1),
                      scoring="accuracy"),
    SelectBySingleFeaturePerformance(LogisticRegression(max_iter=2,
                                                        random_state=1),
                                     scoring="accuracy"),
    RecursiveFeatureAddition(LogisticRegression(max_iter=2, random_state=1),
                             scoring="accuracy"),
    RecursiveFeatureElimination(
        LogisticRegression(max_iter=2, random_state=1),
        scoring="accuracy",
        threshold=-100,
    ),
    SelectByTargetMeanPerformance(scoring="roc_auc", bins=3, regression=False),
])
def test_sklearn_compatible_selectors(estimator, check):
    """Apply one compatibility ``check`` callable to one selector instance.

    NOTE(review): the ``estimator``/``check`` pairs presumably come from the
    decorator preceding this function — confirm.
    """
    check(estimator)


# wrappers
@parametrize_with_checks([SklearnTransformerWrapper(SimpleImputer())])
def test_sklearn_compatible_wrapper(estimator, check):
    """Apply each generated ``check`` to a wrapper around ``SimpleImputer``."""
    check(estimator)


# test_forecasting
@parametrize_with_checks([LagFeatures(missing_values="ignore")])
def test_sklearn_compatible_forecasters(estimator, check):
    """Apply each generated ``check`` to ``LagFeatures(missing_values="ignore")``."""
    check(estimator)