Example #1
def test_automatic_variable_selection(load_diabetes_dataset):
    X, y = load_diabetes_dataset

    # add 2 additional categorical variables; these should not be evaluated
    # by the selector
    X["cat_1"] = "cat1"
    X["cat_2"] = "cat2"

    sel = RecursiveFeatureElimination(
        estimator=DecisionTreeRegressor(random_state=0),
        scoring="neg_mean_squared_error",
        cv=2,
        threshold=10,
    )
    # fit transformer
    sel.fit(X, y)

    # expected output
    Xtransformed = X[[0, 2, 3, 5, 6, 7, 8, 9, "cat_1", "cat_2"]].copy()

    # expected features, ordered by importance
    ordered_features = [1, 0, 4, 6, 9, 3, 7, 5, 8, 2]

    # test init params
    assert sel.variables == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    # fit params
    assert np.round(sel.initial_model_performance_, 0) == -5836.0
    assert sel.features_to_drop_ == [1, 4]
    assert list(sel.performance_drifts_.keys()) == ordered_features
    # test transform output
    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)
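
These tests rely on a load_diabetes_dataset fixture that is not shown in the snippets. Below is a minimal sketch of what such a fixture could look like, assuming it wraps scikit-learn's diabetes data in a DataFrame with integer column labels; the project's actual conftest may build it differently.

import pandas as pd
import pytest
from sklearn.datasets import load_diabetes


@pytest.fixture(scope="module")
def load_diabetes_dataset():
    # hypothetical fixture: X gets integer column labels 0..9 and y is the
    # numeric target, matching the column references asserted above
    data = load_diabetes()
    X = pd.DataFrame(data.data)
    y = pd.Series(data.target)
    return X, y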
Example #2
def test_regression_cv_3_and_r2(load_diabetes_dataset):
    # test for regression using cv=3 and r2 as the metric.
    X, y = load_diabetes_dataset
    sel = RecursiveFeatureElimination(estimator=LinearRegression(),
                                      scoring="r2",
                                      cv=3)
    sel.fit(X, y)

    # expected output
    Xtransformed = X[[1, 2, 3, 4, 5, 8]].copy()

    # expected features, ordered by importance
    ordered_features = [0, 9, 6, 7, 1, 3, 5, 2, 8, 4]

    # test init params
    assert sel.cv == 3
    assert sel.variables == list(X.columns)
    assert sel.scoring == "r2"
    assert sel.threshold == 0.01
    # fit params
    assert np.round(sel.initial_model_performance_, 3) == 0.489
    assert sel.features_to_drop_ == [0, 6, 7, 9]
    assert list(sel.performance_drifts_.keys()) == ordered_features
    # test transform output
    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)
Example #3
def test_classification_threshold_parameters(df_test):
    X, y = df_test
    sel = RecursiveFeatureElimination(RandomForestClassifier(random_state=1),
                                      threshold=0.001)
    sel.fit(X, y)

    # expected result
    Xtransformed = X[["var_0", "var_6"]].copy()

    # expected features, ordered by importance
    ordered_features = [
        "var_3",
        "var_2",
        "var_11",
        "var_5",
        "var_10",
        "var_1",
        "var_8",
        "var_0",
        "var_9",
        "var_6",
        "var_4",
        "var_7",
    ]

    # test init params
    assert sel.variables == [
        "var_0",
        "var_1",
        "var_2",
        "var_3",
        "var_4",
        "var_5",
        "var_6",
        "var_7",
        "var_8",
        "var_9",
        "var_10",
        "var_11",
    ]
    assert sel.threshold == 0.001
    assert sel.cv == 3
    assert sel.scoring == "roc_auc"
    # test fit attrs
    assert np.round(sel.initial_model_performance_, 3) == 0.997
    assert sel.features_to_drop_ == [
        "var_1",
        "var_2",
        "var_3",
        "var_4",
        "var_5",
        "var_7",
        "var_8",
        "var_9",
        "var_10",
        "var_11",
    ]
    assert list(sel.performance_drifts_.keys()) == ordered_features
    # test transform output
    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)
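
This example, and several below, also depend on a df_test fixture that is not reproduced here. A rough sketch follows, under the assumption that it builds 12 numeric predictors named var_0 to var_11 with make_classification; the real fixture's parameters are not visible in these snippets.

import pandas as pd
import pytest
from sklearn.datasets import make_classification


@pytest.fixture(scope="module")
def df_test():
    # hypothetical fixture: 12 predictors named var_0 ... var_11 plus a
    # binary target, matching the column names asserted above
    X, y = make_classification(n_samples=1000, n_features=12, random_state=1)
    X = pd.DataFrame(X, columns=[f"var_{i}" for i in range(12)])
    y = pd.Series(y)
    return X, y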
Example #4
def test_regression_cv_2_and_mse(load_diabetes_dataset):
    # test for regression using cv=2 and neg_mean_squared_error as the metric,
    # with a threshold suitable for regression MSE.

    X, y = load_diabetes_dataset
    sel = RecursiveFeatureElimination(
        estimator=DecisionTreeRegressor(random_state=0),
        scoring="neg_mean_squared_error",
        cv=2,
        threshold=10,
    )
    # fit transformer
    sel.fit(X, y)

    # expected output
    Xtransformed = X[[0, 2, 3, 5, 6, 7, 8, 9]].copy()

    # expected features, ordered by importance
    ordered_features = [1, 0, 4, 6, 9, 3, 7, 5, 8, 2]

    # test init params
    assert sel.cv == 2
    assert sel.variables == list(X.columns)
    assert sel.scoring == "neg_mean_squared_error"
    assert sel.threshold == 10
    # fit params
    assert np.round(sel.initial_model_performance_, 0) == -5836.0
    assert sel.features_to_drop_ == [1, 4]
    assert list(sel.performance_drifts_.keys()) == ordered_features
    # test transform output
    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)
Example #5
def test_classification(estimator, cv, threshold, scoring, dropped_features,
                        performances, df_test):
    X, y = df_test

    sel = RecursiveFeatureElimination(estimator=estimator,
                                      cv=cv,
                                      threshold=threshold,
                                      scoring=scoring)

    sel.fit(X, y)

    Xtransformed = X.copy()
    Xtransformed = Xtransformed.drop(labels=dropped_features, axis=1)

    # test fit attrs
    assert sel.features_to_drop_ == dropped_features

    assert len(sel.performance_drifts_.keys()) == len(X.columns)
    assert all([var in sel.performance_drifts_.keys() for var in X.columns])
    rounded_perfs = {
        key: round(sel.performance_drifts_[key], 4)
        for key in sel.performance_drifts_
    }
    assert rounded_perfs == performances

    # test transform output
    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)
Example #6
def test_regression(
    estimator,
    cv,
    threshold,
    scoring,
    dropped_features,
    performances,
    load_diabetes_dataset,
):
    # test for regression; estimator, cv, threshold and scoring come from the parametrization.
    X, y = load_diabetes_dataset

    sel = RecursiveFeatureElimination(estimator=estimator,
                                      cv=cv,
                                      threshold=threshold,
                                      scoring=scoring)

    sel.fit(X, y)

    Xtransformed = X.copy()
    Xtransformed = Xtransformed.drop(labels=dropped_features, axis=1)

    # test fit attrs
    assert sel.features_to_drop_ == dropped_features

    assert len(sel.performance_drifts_.keys()) == len(X.columns)
    assert all([var in sel.performance_drifts_.keys() for var in X.columns])
    rounded_perfs = {
        key: round(sel.performance_drifts_[key], 4)
        for key in sel.performance_drifts_
    }
    assert rounded_perfs == performances

    # test transform output
    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)
Example #7
def test_feature_importances(_estimator, _importance, df_test):
    X, y = df_test

    sel = RecursiveFeatureAddition(_estimator, threshold=-100).fit(X, y)
    _importance.sort(reverse=True)
    assert list(np.round(sel.feature_importances_.values, 4)) == _importance

    sel = RecursiveFeatureElimination(_estimator, threshold=-100).fit(X, y)
    _importance.sort(reverse=False)
    assert list(np.round(sel.feature_importances_.values, 4)) == _importance
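
Example #7 checks the feature_importances_ attribute, which is sorted descending after RecursiveFeatureAddition and ascending after RecursiveFeatureElimination. Outside the test suite, the fitted attributes asserted throughout these examples can be inspected directly; a small illustrative sketch on the diabetes data follows.

import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from feature_engine.selection import RecursiveFeatureElimination

data = load_diabetes()
X = pd.DataFrame(data.data)
y = pd.Series(data.target)

sel = RecursiveFeatureElimination(estimator=LinearRegression(), scoring="r2", cv=3)
sel.fit(X, y)

print(sel.initial_model_performance_)  # cross-validated score with all features
print(sel.feature_importances_)        # importances, sorted ascending for elimination
print(sel.performance_drifts_)         # score change observed when each feature is removed
print(sel.features_to_drop_)           # features whose removal cost less than the threshold
Xt = sel.transform(X)                  # X without the dropped features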
Example #8
import pytest
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.utils.estimator_checks import check_estimator, parametrize_with_checks

from feature_engine.selection import (
    DropConstantFeatures,
    DropCorrelatedFeatures,
    DropDuplicateFeatures,
    DropFeatures,
    DropHighPSIFeatures,
    RecursiveFeatureAddition,
    RecursiveFeatureElimination,
    SelectByShuffling,
    SelectBySingleFeaturePerformance,
    SelectByTargetMeanPerformance,
    SmartCorrelatedSelection,
)
from feature_engine.wrappers import SklearnTransformerWrapper

_logreg = LogisticRegression(C=0.0001, max_iter=2, random_state=1)

_estimators = [
    DropFeatures(features_to_drop=["0"]),
    DropConstantFeatures(missing_values="ignore"),
    DropDuplicateFeatures(),
    DropCorrelatedFeatures(),
    DropHighPSIFeatures(bins=5),
    SmartCorrelatedSelection(),
    SelectByShuffling(estimator=_logreg, scoring="accuracy"),
    SelectByTargetMeanPerformance(bins=3, regression=False),
    SelectBySingleFeaturePerformance(estimator=_logreg, scoring="accuracy"),
    RecursiveFeatureAddition(estimator=_logreg, scoring="accuracy"),
    RecursiveFeatureElimination(estimator=_logreg, scoring="accuracy", threshold=-100),
]

_multivariate_estimators = [
    DropDuplicateFeatures(),
    DropCorrelatedFeatures(),
    SmartCorrelatedSelection(),
    SelectByShuffling(estimator=_logreg, scoring="accuracy"),
    RecursiveFeatureAddition(estimator=_logreg, scoring="accuracy"),
    RecursiveFeatureElimination(estimator=_logreg, scoring="accuracy", threshold=-100),
]

_univariate_estimators = [
    DropFeatures(features_to_drop=["var_1"]),
    DropConstantFeatures(missing_values="ignore"),
    DropHighPSIFeatures(bins=5),
    DropFeatures(features_to_drop=["0"]),
    DropConstantFeatures(missing_values="ignore"),
    DropDuplicateFeatures(),
    DropCorrelatedFeatures(),
    SmartCorrelatedSelection(),
    DropHighPSIFeatures(bins=5),
    SelectByShuffling(LogisticRegression(max_iter=2, random_state=1),
                      scoring="accuracy"),
    SelectBySingleFeaturePerformance(LogisticRegression(max_iter=2,
                                                        random_state=1),
                                     scoring="accuracy"),
    RecursiveFeatureAddition(LogisticRegression(max_iter=2, random_state=1),
                             scoring="accuracy"),
    RecursiveFeatureElimination(
        LogisticRegression(max_iter=2, random_state=1),
        scoring="accuracy",
        threshold=-100,
    ),
    SelectByTargetMeanPerformance(scoring="roc_auc", bins=3, regression=False),
]


@parametrize_with_checks(_estimators)
def test_sklearn_compatible_selectors(estimator, check):
    check(estimator)


# wrappers
@parametrize_with_checks([SklearnTransformerWrapper(SimpleImputer())])
def test_sklearn_compatible_wrapper(estimator, check):
    check(estimator)


# test_forecasting
from feature_engine.selection import (
    RecursiveFeatureAddition,
    RecursiveFeatureElimination,
    SelectByShuffling,
    SelectBySingleFeaturePerformance,
    SelectByTargetMeanPerformance,
    SmartCorrelatedSelection,
)


@pytest.mark.parametrize(
    "Estimator",
    [
        DropFeatures(features_to_drop=["0"]),
        DropConstantFeatures(),
        DropDuplicateFeatures(),
        DropCorrelatedFeatures(),
        SmartCorrelatedSelection(),
        SelectByShuffling(RandomForestClassifier(random_state=1),
                          scoring="accuracy"),
        SelectBySingleFeaturePerformance(
            RandomForestClassifier(random_state=1), scoring="accuracy"),
        RecursiveFeatureAddition(RandomForestClassifier(random_state=1),
                                 scoring="accuracy"),
        RecursiveFeatureElimination(RandomForestClassifier(random_state=1),
                                    scoring="accuracy"),
        SelectByTargetMeanPerformance(scoring="r2_score", bins=3),
    ],
)
def test_all_transformers(Estimator):
    return check_estimator(Estimator)
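
Because the selectors are run through scikit-learn's estimator checks above, they can also be placed inside a scikit-learn Pipeline. A brief illustrative sketch, not part of the test suite:

import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.pipeline import Pipeline
from feature_engine.selection import RecursiveFeatureElimination

data = load_diabetes()
X = pd.DataFrame(data.data)
y = pd.Series(data.target)

pipe = Pipeline([
    # drop features whose removal changes r2 by less than the 0.01 default threshold
    ("rfe", RecursiveFeatureElimination(estimator=LinearRegression(), scoring="r2", cv=3)),
    ("model", Lasso(alpha=0.1)),
])
pipe.fit(X, y)
print(pipe.named_steps["rfe"].features_to_drop_)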
Example #11
def test_raises_threshold_error():
    with pytest.raises(ValueError):
        RecursiveFeatureElimination(threshold=None)
Example #12
def test_raises_cv_error():
    with pytest.raises(ValueError):
        RecursiveFeatureElimination(cv=0)
Example #13
def test_non_fitted_error(df_test):
    # when fit is not called prior to transform
    with pytest.raises(NotFittedError):
        sel = RecursiveFeatureElimination()
        sel.transform(df_test)