def test_cardinality_2_correlated_groups(df_double):
    """Check ``selection_method="cardinality"`` on the ``df_double`` fixture.

    Four columns are cast to ``int`` before fitting. The test asserts that
    exactly those four columns are reported in ``features_to_drop_`` and
    that the transformed frame equals the expected column selection.
    """
    X, y = df_double

    # NOTE(review): the int cast presumably lowers the number of unique
    # values in these columns so they lose the cardinality comparison.
    int_columns = ["var_0", "var_6", "var_7", "var_9"]
    X[int_columns] = X[int_columns].astype(int)

    selector_params = dict(
        variables=None,
        method="pearson",
        threshold=0.8,
        missing_values="raise",
        selection_method="cardinality",
        estimator=None,
    )
    transformer = SmartCorrelatedSelection(**selector_params)
    Xt = transformer.fit_transform(X, y)

    # columns expected to survive the selection
    kept_columns = [
        "var_1",
        "var_2",
        "var_3",
        "var_4",
        "var_5",
        "var_8",
        "var_10",
        "var_11",
    ]
    expected = X[kept_columns].copy()

    # fitted attribute
    assert transformer.features_to_drop_ == int_columns
    # transform output
    pd.testing.assert_frame_equal(Xt, expected)
def test_model_performance_2_correlated_groups(df_double):
    """Check ``selection_method="model_performance"`` on ``df_double``.

    A 10-tree random forest scored with ``roc_auc`` over 3-fold
    cross-validation is used as the estimator. The test asserts the
    correlated feature sets found, the features dropped, and the
    resulting transformed frame.
    """
    X, y = df_double

    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    selector_params = dict(
        variables=None,
        method="pearson",
        threshold=0.8,
        missing_values="raise",
        selection_method="model_performance",
        estimator=forest,
        scoring="roc_auc",
        cv=3,
    )
    transformer = SmartCorrelatedSelection(**selector_params)
    Xt = transformer.fit_transform(X, y)

    # columns expected to survive the selection
    kept_columns = [
        "var_0",
        "var_1",
        "var_2",
        "var_3",
        "var_5",
        "var_7",
        "var_10",
        "var_11",
    ]
    expected = X[kept_columns].copy()

    # fitted attributes
    expected_groups = [
        {"var_0", "var_8"},
        {"var_4", "var_6", "var_7", "var_9"},
    ]
    assert transformer.correlated_feature_sets_ == expected_groups
    assert transformer.features_to_drop_ == ["var_4", "var_6", "var_8", "var_9"]

    # transform output
    pd.testing.assert_frame_equal(Xt, expected)
def test_error_if_select_model_performance_and_y_is_none(df_single):
    """``fit`` without ``y`` must raise ``ValueError`` for model-based selection."""
    X, _ = df_single

    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    selector = SmartCorrelatedSelection(
        selection_method="model_performance",
        estimator=forest,
        scoring="roc_auc",
    )

    # no target is passed to fit
    with pytest.raises(ValueError):
        selector.fit(X)
def test_raises_param_errors():
    """Each invalid constructor argument set must raise ``ValueError``."""
    # one entry per invalid configuration, checked in this order
    invalid_configurations = (
        {"threshold": None},
        {"missing_values": None},
        {"selection_method": "random"},
        {"selection_method": "missing_values", "missing_values": "raise"},
    )

    for kwargs in invalid_configurations:
        with pytest.raises(ValueError):
            SmartCorrelatedSelection(**kwargs)
def test_callable_method(df_test, random_uniform_method):
    """Fit with a callable passed as ``method`` and check the fitted state."""
    X, _ = df_test

    selector = SmartCorrelatedSelection(method=random_uniform_method)
    transformed = selector.fit_transform(X)

    # the transformed frame must not be empty
    assert not transformed.empty

    # fitted attributes must all be populated
    for attribute in ("correlated_feature_sets_", "features_to_drop_", "variables_"):
        assert len(getattr(selector, attribute)) > 0
    assert selector.n_features_in_ == len(X.columns)
def test_error_if_select_model_performance_and_y_is_none(df_single):
    """``fit`` without ``y`` must raise ``ValueError`` for model-based selection.

    This variant spells out every constructor argument explicitly.
    """
    X, _ = df_single

    selector_params = dict(
        variables=None,
        method="pearson",
        threshold=0.8,
        missing_values="raise",
        selection_method="model_performance",
        estimator=RandomForestClassifier(n_estimators=10, random_state=1),
        scoring="roc_auc",
        cv=3,
    )
    selector = SmartCorrelatedSelection(**selector_params)

    # no target is passed to fit
    with pytest.raises(ValueError):
        selector.fit(X)
def test_error_method_supplied(df_test):
    """An unknown ``method`` string must raise ``ValueError`` with the exact message."""
    X, _ = df_test
    method = "hola"
    expected_message = (
        "method must be either 'pearson', 'spearman', 'kendall', or a callable,"
        + f" '{method}' was supplied"
    )

    selector = SmartCorrelatedSelection(method=method)

    with pytest.raises(ValueError) as excinfo:
        selector.fit_transform(X)

    # the first exception argument carries the message text
    assert excinfo.value.args[0] == expected_message
def test_automatic_variable_selection(df_double):
    """Check selection with ``variables=None`` when string columns are present.

    Four columns are cast to ``int`` and two constant string columns are
    appended before fitting. The test asserts that the four int columns are
    dropped and that the string columns are still in the transformed frame.
    """
    X, y = df_double

    int_columns = ["var_0", "var_6", "var_7", "var_9"]
    X[int_columns] = X[int_columns].astype(int)

    # two extra categorical variables; the selector should not evaluate them
    for column, value in (("cat_1", "cat1"), ("cat_2", "cat2")):
        X[column] = value

    selector_params = dict(
        variables=None,
        method="pearson",
        threshold=0.8,
        missing_values="raise",
        selection_method="cardinality",
        estimator=None,
    )
    transformer = SmartCorrelatedSelection(**selector_params)
    Xt = transformer.fit_transform(X, y)

    # columns expected to survive the selection
    kept_columns = [
        "var_1", "var_2", "var_3", "var_4", "var_5", "var_8", "var_10",
        "var_11", "cat_1", "cat_2",
    ]
    expected = X[kept_columns].copy()

    # fitted attribute
    assert transformer.features_to_drop_ == int_columns
    # transform output
    pd.testing.assert_frame_equal(Xt, expected)
def test_callable_method(df_test, random_uniform_method):
    """Fit with a callable ``method`` and variance-based selection.

    Checks that the transformed frame is not empty and that the fitted
    attributes are populated.
    """
    X, _ = df_test

    selector_params = dict(
        variables=None,
        method=random_uniform_method,
        threshold=0.8,
        missing_values="raise",
        selection_method="variance",
    )
    selector = SmartCorrelatedSelection(**selector_params)
    transformed = selector.fit_transform(X)

    # the transformed frame must not be empty
    assert not transformed.empty

    # fitted attributes must all be populated
    for attribute in ("correlated_feature_sets_", "features_to_drop_", "variables_"):
        assert len(getattr(selector, attribute)) > 0
    assert selector.n_features_in_ == len(X.columns)
def test_error_method_supplied(df_test):
    """An unknown ``method`` string must raise ``ValueError`` with the exact message.

    This variant spells out the remaining constructor arguments explicitly.
    """
    X, _ = df_test
    method = "hola"
    expected_message = (
        "method must be either 'pearson', 'spearman', 'kendall', or a callable,"
        + f" '{method}' was supplied"
    )

    selector_params = dict(
        variables=None,
        method=method,
        threshold=0.8,
        missing_values="raise",
        selection_method="variance",
    )
    selector = SmartCorrelatedSelection(**selector_params)

    with pytest.raises(ValueError) as excinfo:
        selector.fit_transform(X)

    # the first exception argument carries the message text
    assert excinfo.value.args[0] == expected_message
def test_non_fitted_error(df_single):
    """``transform`` on a selector that was never fitted must raise ``NotFittedError``."""
    X, _ = df_single
    transformer = SmartCorrelatedSelection()

    # Only the call under test sits inside the context manager, so the
    # expectation can be satisfied by ``transform`` alone.
    with pytest.raises(NotFittedError):
        transformer.transform(X)
def test_non_fitted_error(df_single):
    """``transform`` on a selector that was never fitted must raise ``NotFittedError``."""
    X, _ = df_single
    transformer = SmartCorrelatedSelection()

    # Only the call under test sits inside the context manager, so the
    # expectation can be satisfied by ``transform`` alone.
    with pytest.raises(NotFittedError):
        transformer.transform(X)
def test_error_if_fit_input_not_dataframe():
    """``fit`` must raise ``TypeError`` when given a plain dict."""
    not_a_dataframe = {"Name": [1]}

    with pytest.raises(TypeError):
        selector = SmartCorrelatedSelection()
        selector.fit(not_a_dataframe)
# Example no. 14
# 0
    RecursiveFeatureElimination,
    SelectByShuffling,
    SelectBySingleFeaturePerformance,
    SelectByTargetMeanPerformance,
    SmartCorrelatedSelection,
)

# Logistic regression configured with C=0.0001 and max_iter=2. This single
# instance is passed as `estimator` to every selector below that takes one.
# NOTE(review): presumably kept this small so the tests run quickly - confirm.
_logreg = LogisticRegression(C=0.0001, max_iter=2, random_state=1)

# One configured instance of each selector class listed here.
# NOTE(review): presumably consumed by parametrized checks that are not
# visible in this chunk - verify against the callers.
_estimators = [
    DropFeatures(features_to_drop=["0"]),
    DropConstantFeatures(missing_values="ignore"),
    DropDuplicateFeatures(),
    DropCorrelatedFeatures(),
    DropHighPSIFeatures(bins=5),
    SmartCorrelatedSelection(),
    SelectByShuffling(estimator=_logreg, scoring="accuracy"),
    SelectByTargetMeanPerformance(bins=3, regression=False),
    SelectBySingleFeaturePerformance(estimator=_logreg, scoring="accuracy"),
    RecursiveFeatureAddition(estimator=_logreg, scoring="accuracy"),
    RecursiveFeatureElimination(estimator=_logreg, scoring="accuracy", threshold=-100),
]

# Same configurations as six of the entries in `_estimators`, but built as
# separate instances rather than reusing the objects from that list.
# NOTE(review): "multivariate" presumably marks selectors that need more
# than one feature to operate - confirm.
_multivariate_estimators = [
    DropDuplicateFeatures(),
    DropCorrelatedFeatures(),
    SmartCorrelatedSelection(),
    SelectByShuffling(estimator=_logreg, scoring="accuracy"),
    RecursiveFeatureAddition(estimator=_logreg, scoring="accuracy"),
    RecursiveFeatureElimination(estimator=_logreg, scoring="accuracy", threshold=-100),
]
def test_KFold_generators(df_test):
    """Fit with different ``cv`` arguments and check the fitted state.

    The selector is fitted once per cross-validation setting: a ``KFold``
    splitter, a ``StratifiedKFold`` splitter, and ``None``. The same checks
    are applied after each fit.
    """
    X, y = df_test

    cv_options = (
        KFold(n_splits=3),
        StratifiedKFold(n_splits=3),
        None,
    )

    for cv in cv_options:
        sel = SmartCorrelatedSelection(
            variables=None,
            method="pearson",
            threshold=0.8,
            missing_values="raise",
            selection_method="model_performance",
            estimator=RandomForestClassifier(n_estimators=10, random_state=1),
            scoring="roc_auc",
            cv=cv,
        )
        sel.fit(X, y)
        Xtransformed = sel.transform(X)

        # test fit attrs
        assert isinstance(sel.features_to_drop_, list)
        # Test membership for every element. Filtering first and then calling
        # ``all`` on the surviving names would pass even for unknown columns.
        assert all(x in X.columns for x in sel.features_to_drop_)
        assert len(sel.features_to_drop_) < X.shape[1]
        assert not Xtransformed.empty
        # no dropped feature may remain in the transformed frame
        assert all(
            x not in sel.features_to_drop_ for x in Xtransformed.columns)