def test_default_parameters(df_test):
    X, y = df_test
    sel = SelectBySingleFeaturePerformance(RandomForestClassifier(random_state=1))
    sel.fit(X, y)

    # expected result
    Xtransformed = X.copy()
    Xtransformed.drop(columns=["var_3", "var_10"], inplace=True)

    # test init params
    assert sel.variables == [
        "var_0",
        "var_1",
        "var_2",
        "var_3",
        "var_4",
        "var_5",
        "var_6",
        "var_7",
        "var_8",
        "var_9",
        "var_10",
        "var_11",
    ]
    assert sel.threshold == 0.5
    assert sel.cv == 3
    assert sel.scoring == "roc_auc"

    # test fit attrs
    assert sel.selected_features_ == [
        "var_0",
        "var_1",
        "var_2",
        "var_4",
        "var_5",
        "var_6",
        "var_7",
        "var_8",
        "var_9",
        "var_11",
    ]
    assert sel.feature_performance_ == {
        "var_0": 0.5957642619540211,
        "var_1": 0.5365534287221033,
        "var_2": 0.5001855546283257,
        "var_3": 0.4752954458526748,
        "var_4": 0.9780875304971691,
        "var_5": 0.5065441419357082,
        "var_6": 0.9758243290622809,
        "var_7": 0.994571685008432,
        "var_8": 0.5164434795458892,
        "var_9": 0.9543427678969847,
        "var_10": 0.47404183834906727,
        "var_11": 0.5227164067525513,
    }

    # test transform output
    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)
def test_regression_cv_2_and_mse(load_diabetes_dataset):
    # test for regression using cv=2 and neg_mean_squared_error as the metric,
    # with a threshold suitable for the regression mse.
    X, y = load_diabetes_dataset
    sel = SelectBySingleFeaturePerformance(
        estimator=DecisionTreeRegressor(random_state=0),
        scoring="neg_mean_squared_error",
        cv=2,
        threshold=10,
    )
    # fit transformer
    sel.fit(X, y)

    # expected output
    Xtransformed = X.copy()
    Xtransformed.drop(
        Xtransformed.columns[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]], axis=1, inplace=True
    )

    # test init params
    assert sel.cv == 2
    assert sel.variables == list(X.columns)
    assert sel.scoring == "neg_mean_squared_error"
    assert sel.threshold == 10

    # test fit attrs
    assert sel.selected_features_ == []
    assert sel.feature_performance_ == {
        0: -7657.154138192973,
        1: -5966.662211695372,
        2: -6613.779604700854,
        3: -6502.621725718592,
        4: -9415.586278197177,
        5: -11760.999622926094,
        6: -6592.584431571728,
        7: -5270.563893676307,
        8: -7641.414795123177,
        9: -6287.557824391035,
    }

    # test transform output
    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)
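# Because scoring="neg_mean_squared_error" yields negative scores, every per-feature
# score in the test above sits far below the threshold of 10, so no feature is
# selected. A minimal sketch of the selection rule these assertions rely on
# (an illustration, not feature_engine's actual implementation):
def _select_by_threshold(feature_performance, threshold):
    # keep only the features whose single-feature CV score beats the threshold
    return [f for f, score in feature_performance.items() if score > threshold]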
def test_automatic_variable_selection(df_test):
    X, y = df_test
    # add 2 additional categorical variables; these should not be evaluated by
    # the selector
    X["cat_1"] = "cat1"
    X["cat_2"] = "cat2"

    sel = SelectBySingleFeaturePerformance(
        RandomForestClassifier(random_state=1), threshold=0.5
    )
    sel.fit(X, y)

    # expected result
    Xtransformed = X.copy()
    Xtransformed.drop(columns=["var_3", "var_10"], inplace=True)

    # test init params
    assert sel.variables is None
    assert sel.threshold == 0.5
    assert sel.cv == 3
    assert sel.scoring == "roc_auc"

    # test fit attrs
    assert sel.variables_ == [
        "var_0",
        "var_1",
        "var_2",
        "var_3",
        "var_4",
        "var_5",
        "var_6",
        "var_7",
        "var_8",
        "var_9",
        "var_10",
        "var_11",
    ]
    assert sel.features_to_drop_ == ["var_3", "var_10"]
    assert sel.feature_performance_ == {
        "var_0": 0.5957642619540211,
        "var_1": 0.5365534287221033,
        "var_2": 0.5001855546283257,
        "var_3": 0.4752954458526748,
        "var_4": 0.9780875304971691,
        "var_5": 0.5065441419357082,
        "var_6": 0.9758243290622809,
        "var_7": 0.994571685008432,
        "var_8": 0.5164434795458892,
        "var_9": 0.9543427678969847,
        "var_10": 0.47404183834906727,
        "var_11": 0.5227164067525513,
    }

    # test transform output
    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)
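# With variables=None the selector finds the numerical variables itself, which is
# why cat_1 and cat_2 never appear in feature_performance_. A rough sketch of that
# automatic detection (an assumption about the internals, shown for context only):
def _find_numerical_variables(X):
    # keep only the numeric columns; object/categorical columns are ignored
    return list(X.select_dtypes(include="number").columns)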
def test_regression_cv_3_and_r2(load_diabetes_dataset):
    # test for regression using cv=3 and r2 as the metric.
    X, y = load_diabetes_dataset
    sel = SelectBySingleFeaturePerformance(
        estimator=LinearRegression(), scoring="r2", cv=3, threshold=0.01
    )
    sel.fit(X, y)

    # expected output
    Xtransformed = pd.DataFrame(X[[0, 2, 3, 4, 5, 6, 7, 8, 9]].copy())
    performance_dict = {
        0: 0.029,
        1: -0.004,
        2: 0.337,
        3: 0.192,
        4: 0.037,
        5: 0.018,
        6: 0.152,
        7: 0.177,
        8: 0.315,
        9: 0.139,
    }

    # test init params
    assert sel.cv == 3
    assert sel.variables is None
    assert sel.scoring == "r2"
    assert sel.threshold == 0.01

    # test fit attrs
    assert sel.variables_ == list(X.columns)
    assert sel.features_to_drop_ == [1]
    assert all(
        np.round(sel.feature_performance_[f], 3) == performance_dict[f]
        for f in sel.feature_performance_.keys()
    )

    # test transform output
    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)
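# For reference, a per-feature performance dictionary such as feature_performance_
# can be reproduced by cross-validating the estimator on one column at a time.
# This is an illustrative sketch, not the selector's own code:
def _single_feature_performance(estimator, X, y, scoring, cv):
    from sklearn.model_selection import cross_validate

    # average the cross-validated test scores obtained with each feature alone
    return {
        feature: cross_validate(estimator, X[[feature]], y, scoring=scoring, cv=cv)[
            "test_score"
        ].mean()
        for feature in X.columns
    }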
def test_KFold_generators(df_test):
    X, y = df_test

    # the cv argument accepts an integer, a KFold or StratifiedKFold generator, or None
    for cv in [KFold(n_splits=3), StratifiedKFold(n_splits=3), None]:
        sel = SelectBySingleFeaturePerformance(
            RandomForestClassifier(random_state=1),
            threshold=0.5,
            cv=cv,
        )
        sel.fit(X, y)
        Xtransformed = sel.transform(X)

        # test fit attrs
        assert isinstance(sel.features_to_drop_, list)
        assert all(x in X.columns for x in sel.features_to_drop_)
        assert len(sel.features_to_drop_) < X.shape[1]

        assert not Xtransformed.empty
        assert all(x not in sel.features_to_drop_ for x in Xtransformed.columns)

        assert isinstance(sel.feature_performance_, dict)
        assert all(x in sel.feature_performance_ for x in X.columns)
        assert all(
            isinstance(sel.feature_performance_[var], float)
            for var in sel.feature_performance_
        )
def test_non_fitted_error(df_test):
    # when fit is not called prior to transform
    with pytest.raises(NotFittedError):
        sel = SelectBySingleFeaturePerformance(RandomForestClassifier(random_state=1))
        sel.transform(df_test)