import numpy as np
import pandas as pd
import pytest
from sklearn.exceptions import NotFittedError

from feature_engine.selection import SelectByTargetMeanPerformance


def test_numerical_variables_roc_auc(df_test):
    X, y = df_test

    sel = SelectByTargetMeanPerformance(
        variables=None,
        scoring="roc_auc_score",
        threshold=0.6,
        bins=5,
        strategy="equal_width",
        cv=3,
        random_state=1,
    )
    sel.fit(X, y)

    # expected result
    Xtransformed = X[["var_0", "var_4", "var_6", "var_7", "var_9"]]

    # performance_dict = {
    #     "var_0": 0.628,
    #     "var_1": 0.548,
    #     "var_2": 0.513,
    #     "var_3": 0.474,
    #     "var_4": 0.973,
    #     "var_5": 0.496,
    #     "var_6": 0.97,
    #     "var_7": 0.992,
    #     "var_8": 0.536,
    #     "var_9": 0.931,
    #     "var_10": 0.466,
    #     "var_11": 0.517,
    # }

    # test init params
    assert sel.variables is None
    assert sel.scoring == "roc_auc_score"
    assert sel.threshold == 0.6
    assert sel.bins == 5
    assert sel.strategy == "equal_width"
    assert sel.cv == 3
    assert sel.random_state == 1

    # test fit attrs
    assert sel.variables_ == list(X.columns)
    assert sel.variables_categorical_ == []
    assert sel.variables_numerical_ == list(X.columns)
    assert sel.features_to_drop_ == [
        "var_1",
        "var_2",
        "var_3",
        "var_5",
        "var_8",
        "var_10",
        "var_11",
    ]
    # assert all(
    #     np.round(sel.feature_performance_[f], 3) == performance_dict[f]
    #     for f in sel.feature_performance_.keys()
    # )

    # test transform output
    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)

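# For orientation, a minimal sketch of the mechanism the test above exercises
# (an assumption about how target-mean selection works, not feature_engine's
# actual internals): each numerical variable is discretised into bins, every
# observation is replaced by the mean target within its bin, and the encoded
# variable is scored against the target. The helper name and the single-pass
# (no cross-validation) scoring are illustrative only.
def _target_mean_roc_auc_sketch(X, y, variable, bins=5):
    from sklearn.metrics import roc_auc_score

    # equal-width discretisation, mirroring strategy="equal_width"
    binned = pd.cut(X[variable], bins=bins)
    # replace each observation by the mean of y within its bin
    encoded = y.groupby(binned).transform("mean")
    # a bin-mean encoding that separates the classes scores high
    return roc_auc_score(y, encoded)
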
def test_categorical_variables_roc_auc(df_test_num_cat):
    X, y = df_test_num_cat
    X = X[["var_A", "var_B"]]

    sel = SelectByTargetMeanPerformance(
        variables=None,
        scoring="roc_auc_score",
        threshold=0.78,
        cv=2,
        random_state=1,
    )
    sel.fit(X, y)

    # expected result
    Xtransformed = X["var_A"].to_frame()
    # performance_dict = {"var_A": 0.841, "var_B": 0.776}

    # test init params
    assert sel.variables == list(X.columns)
    assert sel.scoring == "roc_auc_score"
    assert sel.threshold == 0.78
    assert sel.cv == 2
    assert sel.random_state == 1

    # test fit attrs
    assert sel.variables_categorical_ == list(X.columns)
    assert sel.variables_numerical_ == []
    assert sel.features_to_drop_ == ["var_B"]
    # assert all(
    #     np.round(sel.feature_performance_[f], 3) == performance_dict[f]
    #     for f in sel.feature_performance_.keys()
    # )

    # test transform output
    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)

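# The categorical counterpart of the sketch above needs no binning: the
# categories themselves play the role of the bins (again an illustrative
# assumption, not the library's internals):
def _target_mean_roc_auc_categorical_sketch(X, y, variable):
    from sklearn.metrics import roc_auc_score

    # replace each category by the mean of y observed for that category
    encoded = y.groupby(X[variable]).transform("mean")
    return roc_auc_score(y, encoded)
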
def test_raises_error_if_evaluating_single_variable_and_threshold_is_None(df_test):
    X, y = df_test

    sel = SelectByTargetMeanPerformance(variables=["var_1"], threshold=None)

    with pytest.raises(ValueError):
        sel.fit(X, y)

def test_regression():
    X, y = df_regression()

    sel = SelectByTargetMeanPerformance(
        variables=None,
        bins=2,
        scoring="r2",
        regression=True,
        cv=2,
        strategy="equal_width",
        threshold=None,
    )
    sel.fit(X, y)

    # expected result
    Xtransformed = X[["cat_var_A", "cat_var_B", "num_var_A"]]
    performance_dict = {
        "cat_var_A": 1.0,
        "cat_var_B": 0.8533333333333333,
        "num_var_A": 0.8,
        "num_var_B": 0.512,
    }

    assert sel.features_to_drop_ == ["num_var_B"]
    assert sel.feature_performance_ == performance_dict
    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)

    X, y = df_regression()

    sel = SelectByTargetMeanPerformance(
        variables=["cat_var_A", "cat_var_B", "num_var_A", "num_var_B"],
        bins=2,
        scoring="neg_root_mean_squared_error",
        regression=True,
        cv=2,
        strategy="equal_width",
        threshold=-0.2,
    )
    sel.fit(X, y)

    # expected result
    Xtransformed = X["cat_var_A"].to_frame()
    performance_dict = {
        "cat_var_A": 0.0,
        "cat_var_B": -0.42817441928883765,
        "num_var_A": -0.5,
        "num_var_B": -0.7810249675906654,
    }

    assert sel.features_to_drop_ == ["cat_var_B", "num_var_A", "num_var_B"]
    assert sel.feature_performance_ == performance_dict
    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)

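# A note on threshold=None in the first block of test_regression: a plausible
# reading of feature_engine's docs (worth confirming against the installed
# version) is that with no threshold, features scoring below the mean
# performance of all examined features are dropped. The expected values are
# consistent with that reading:
#
#   mean of {1.0, 0.8533..., 0.8, 0.512} ~= 0.791
#   only num_var_B (0.512) falls below it, hence features_to_drop_ == ["num_var_B"]
#
# The same rule explains the threshold=None block in test_classification
# below: the mean of {1.0, 0.8, 1.0, 0.8} is 0.9, so both 0.8 features go.
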
def test_classification():
    X, y = df_classification()

    sel = SelectByTargetMeanPerformance(
        variables=None,
        scoring="accuracy",
        threshold=None,
        bins=2,
        strategy="equal_width",
        cv=2,
    )
    sel.fit(X, y)

    # expected result
    Xtransformed = X[["cat_var_A", "num_var_A"]]
    performance_dict = {
        "cat_var_A": 1.0,
        "cat_var_B": 0.8,
        "num_var_A": 1.0,
        "num_var_B": 0.8,
    }
    features_to_drop = ["cat_var_B", "num_var_B"]

    assert sel.features_to_drop_ == features_to_drop
    assert sel.feature_performance_ == performance_dict
    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)

    sel = SelectByTargetMeanPerformance(
        variables=["cat_var_A", "cat_var_B", "num_var_A", "num_var_B"],
        scoring="roc_auc",
        threshold=0.9,
        bins=2,
        strategy="equal_frequency",
        cv=2,
    )
    sel.fit(X, y)

    # expected result
    Xtransformed = X[["cat_var_A", "cat_var_B", "num_var_A"]]
    performance_dict = {
        "cat_var_A": 1.0,
        "cat_var_B": 0.92,
        "num_var_A": 1.0,
        "num_var_B": 0.8,
    }
    features_to_drop = ["num_var_B"]

    assert sel.features_to_drop_ == features_to_drop
    assert sel.feature_performance_ == performance_dict
    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)

def test_error_wrong_params():
    with pytest.raises(ValueError):
        SelectByTargetMeanPerformance(scoring="mean_squared")
    with pytest.raises(ValueError):
        SelectByTargetMeanPerformance(scoring=1)
    with pytest.raises(ValueError):
        SelectByTargetMeanPerformance(threshold="hola")
    with pytest.raises(ValueError):
        SelectByTargetMeanPerformance(bins="hola")
    with pytest.raises(ValueError):
        SelectByTargetMeanPerformance(strategy="hola")

def test_selector_with_one_variable():
    X, y = df_regression()

    sel = SelectByTargetMeanPerformance(
        variables=["cat_var_A"],
        bins=2,
        scoring="neg_root_mean_squared_error",
        regression=True,
        cv=2,
        strategy="equal_width",
        threshold=-0.2,
    )
    sel.fit(X, y)

    # expected result
    performance_dict = {"cat_var_A": 0.0}

    assert sel.features_to_drop_ == []
    assert sel.feature_performance_ == performance_dict
    pd.testing.assert_frame_equal(sel.transform(X), X)

    X, y = df_regression()

    sel = SelectByTargetMeanPerformance(
        variables=["cat_var_B"],
        bins=2,
        scoring="neg_root_mean_squared_error",
        regression=True,
        cv=2,
        strategy="equal_width",
        threshold=-0.2,
    )
    sel.fit(X, y)

    # expected result
    Xtransformed = X.drop(columns=["cat_var_B"])
    performance_dict = {"cat_var_B": -0.42817441928883765}

    assert sel.features_to_drop_ == ["cat_var_B"]
    assert sel.feature_performance_ == performance_dict
    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)

def test_df_cat_and_num_variables_r2(df_test_num_cat):
    X, y = df_test_num_cat

    sel = SelectByTargetMeanPerformance(
        variables=None,
        scoring="r2_score",
        threshold=0.1,
        bins=3,
        strategy="equal_frequency",
        cv=2,
        random_state=1,
    )
    sel.fit(X, y)

    # expected result
    Xtransformed = X[["var_A", "var_B"]]
    # performance_dict = {
    #     "var_A": 0.392,
    #     "var_B": 0.250,
    #     "var_C": -0.004,
    #     "var_D": -0.052,
    # }

    # test init params
    assert sel.variables is None
    assert sel.scoring == "r2_score"
    assert sel.threshold == 0.1
    assert sel.cv == 2
    assert sel.bins == 3
    assert sel.strategy == "equal_frequency"
    assert sel.random_state == 1

    # test fit attrs
    assert sel.variables_ == list(X.columns)
    assert sel.variables_categorical_ == ["var_A", "var_B"]
    assert sel.variables_numerical_ == ["var_C", "var_D"]
    assert sel.features_to_drop_ == ["var_C", "var_D"]
    # assert all(
    #     np.round(sel.feature_performance_[f], 3) == performance_dict[f]
    #     for f in sel.feature_performance_.keys()
    # )

    # test transform output
    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)

def test_df_cat_and_num_variables_roc_auc(df_test_num_cat):
    X, y = df_test_num_cat

    sel = SelectByTargetMeanPerformance(
        variables=None,
        scoring="roc_auc_score",
        threshold=0.6,
        bins=3,
        strategy="equal_width",
        cv=2,
        random_state=1,
    )
    sel.fit(X, y)

    # expected result
    Xtransformed = X[["var_A", "var_B"]]
    performance_dict = {
        "var_A": 0.841,
        "var_B": 0.776,
        "var_C": 0.481,
        "var_D": 0.496,
    }

    # test init params
    assert sel.variables == list(X.columns)
    assert sel.scoring == "roc_auc_score"
    assert sel.threshold == 0.60
    assert sel.cv == 2
    assert sel.random_state == 1

    # test fit attrs
    assert sel.variables_categorical_ == ["var_A", "var_B"]
    assert sel.variables_numerical_ == ["var_C", "var_D"]
    assert sel.features_to_drop_ == ["var_C", "var_D"]
    assert all(
        np.round(sel.feature_performance_[f], 3) == performance_dict[f]
        for f in sel.feature_performance_.keys()
    )

    # test transform output
    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)

def test_error_wrong_params():
    with pytest.raises(ValueError):
        SelectByTargetMeanPerformance(scoring="mean_squared")
    with pytest.raises(ValueError):
        SelectByTargetMeanPerformance(scoring=1)
    with pytest.raises(ValueError):
        # error if scoring is roc_auc and the threshold is below 0.5
        SelectByTargetMeanPerformance(scoring="roc_auc_score", threshold=0.4)
    with pytest.raises(ValueError):
        SelectByTargetMeanPerformance(threshold=-1)
    with pytest.raises(ValueError):
        SelectByTargetMeanPerformance(threshold="hola")
    with pytest.raises(TypeError):
        SelectByTargetMeanPerformance(bins="hola")
    with pytest.raises(ValueError):
        SelectByTargetMeanPerformance(strategy="hola")
    with pytest.raises(ValueError):
        SelectByTargetMeanPerformance(cv="hola")
    with pytest.raises(ValueError):
        SelectByTargetMeanPerformance(cv=1)

def test_not_fitted_error(df_test):
    with pytest.raises(NotFittedError):
        transformer = SelectByTargetMeanPerformance()
        transformer.transform(df_test)

def test_error_if_fit_input_not_dataframe(df_test):
    with pytest.raises(TypeError):
        SelectByTargetMeanPerformance().fit({"Name": ["Karthik"]})

def test_error_if_input_not_df(df_test):
    X, y = df_test
    with pytest.raises(TypeError):
        SelectByTargetMeanPerformance().fit(X.to_dict(), y)

def test_error_if_y_not_passed(df_test):
    X, y = df_test
    with pytest.raises(TypeError):
        SelectByTargetMeanPerformance().fit(X)

    SelectBySingleFeaturePerformance,
    SelectByTargetMeanPerformance,
    SmartCorrelatedSelection,
)

_logreg = LogisticRegression(C=0.0001, max_iter=2, random_state=1)

_estimators = [
    DropFeatures(features_to_drop=["0"]),
    DropConstantFeatures(missing_values="ignore"),
    DropDuplicateFeatures(),
    DropCorrelatedFeatures(),
    DropHighPSIFeatures(bins=5),
    SmartCorrelatedSelection(),
    SelectByShuffling(estimator=_logreg, scoring="accuracy"),
    SelectByTargetMeanPerformance(bins=3, regression=False),
    SelectBySingleFeaturePerformance(estimator=_logreg, scoring="accuracy"),
    RecursiveFeatureAddition(estimator=_logreg, scoring="accuracy"),
    RecursiveFeatureElimination(estimator=_logreg, scoring="accuracy", threshold=-100),
]

_multivariate_estimators = [
    DropDuplicateFeatures(),
    DropCorrelatedFeatures(),
    SmartCorrelatedSelection(),
    SelectByShuffling(estimator=_logreg, scoring="accuracy"),
    RecursiveFeatureAddition(estimator=_logreg, scoring="accuracy"),
    RecursiveFeatureElimination(estimator=_logreg, scoring="accuracy", threshold=-100),
]

_univariate_estimators = [

    RecursiveFeatureAddition,
    RecursiveFeatureElimination,
    SelectByShuffling,
    SelectBySingleFeaturePerformance,
    SelectByTargetMeanPerformance,
    SmartCorrelatedSelection,
)


@pytest.mark.parametrize(
    "Estimator",
    [
        DropFeatures(features_to_drop=["0"]),
        DropConstantFeatures(),
        DropDuplicateFeatures(),
        DropCorrelatedFeatures(),
        SmartCorrelatedSelection(),
        SelectByShuffling(RandomForestClassifier(random_state=1), scoring="accuracy"),
        SelectBySingleFeaturePerformance(
            RandomForestClassifier(random_state=1), scoring="accuracy"
        ),
        RecursiveFeatureAddition(
            RandomForestClassifier(random_state=1), scoring="accuracy"
        ),
        RecursiveFeatureElimination(
            RandomForestClassifier(random_state=1), scoring="accuracy"
        ),
        SelectByTargetMeanPerformance(scoring="r2_score", bins=3),
    ],
)
def test_all_transformers(Estimator):
    return check_estimator(Estimator)

        DropCorrelatedFeatures(),
        SmartCorrelatedSelection(),
        DropHighPSIFeatures(bins=5),
        SelectByShuffling(
            LogisticRegression(max_iter=2, random_state=1), scoring="accuracy"
        ),
        SelectBySingleFeaturePerformance(
            LogisticRegression(max_iter=2, random_state=1), scoring="accuracy"
        ),
        RecursiveFeatureAddition(
            LogisticRegression(max_iter=2, random_state=1), scoring="accuracy"
        ),
        RecursiveFeatureElimination(
            LogisticRegression(max_iter=2, random_state=1),
            scoring="accuracy",
            threshold=-100,
        ),
        SelectByTargetMeanPerformance(scoring="roc_auc", bins=3, regression=False),
    ]
)
def test_sklearn_compatible_selectors(estimator, check):
    check(estimator)


# wrappers
@parametrize_with_checks([SklearnTransformerWrapper(SimpleImputer())])
def test_sklearn_compatible_wrapper(estimator, check):
    check(estimator)


# test_forecasting
@parametrize_with_checks([LagFeatures(missing_values="ignore")])
def test_sklearn_compatible_forecasters(estimator, check):
    check(estimator)