def test_error_if_select_model_performance_and_y_is_none(df_single):
    """Expect ``ValueError`` when fitting without ``y`` in model-performance mode.

    The selector is built with only the arguments related to model-based
    selection; ``fit`` is then called with the predictors alone.
    """
    # NOTE(review): a second function with this exact name is defined right
    # after this one. If both live in the same module, the later definition
    # rebinds the name and this test is never collected -- confirm and rename.
    X, _ = df_single  # the target from the fixture is deliberately not used

    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    selector = SmartCorrelatedSelection(
        selection_method="model_performance",
        estimator=forest,
        scoring="roc_auc",
    )

    with pytest.raises(ValueError):
        selector.fit(X)
def test_error_if_select_model_performance_and_y_is_none(df_single):
    """Expect ``ValueError`` when fitting without ``y`` in model-performance mode.

    Same scenario as the minimal-argument variant, but every constructor
    argument is spelled out explicitly, including ``cv=3``.
    """
    # NOTE(review): an earlier function in this file carries the same name.
    # If both are in one module, only this later definition is collected by
    # pytest -- confirm and give the two tests distinct names.
    X, _ = df_single  # the target from the fixture is deliberately not used

    settings = {
        "variables": None,
        "method": "pearson",
        "threshold": 0.8,
        "missing_values": "raise",
        "selection_method": "model_performance",
        "estimator": RandomForestClassifier(n_estimators=10, random_state=1),
        "scoring": "roc_auc",
        "cv": 3,
    }
    selector = SmartCorrelatedSelection(**settings)

    with pytest.raises(ValueError):
        selector.fit(X)
def test_KFold_generators(df_test):
    """Check fit/transform of the selector under different ``cv`` settings.

    The same model-performance selection is run three times, with ``cv`` set
    to a ``KFold`` splitter, a ``StratifiedKFold`` splitter and ``None``.
    After each run the fitted ``features_to_drop_`` attribute and the
    transformed frame are validated.
    """
    X, y = df_test

    # Kfold, Stratified, None
    cv_options = (KFold(n_splits=3), StratifiedKFold(n_splits=3), None)

    for cv in cv_options:
        sel = SmartCorrelatedSelection(
            variables=None,
            method="pearson",
            threshold=0.8,
            missing_values="raise",
            selection_method="model_performance",
            estimator=RandomForestClassifier(n_estimators=10, random_state=1),
            scoring="roc_auc",
            cv=cv,
        )
        sel.fit(X, y)
        Xtransformed = sel.transform(X)

        # test fit attrs
        assert isinstance(sel.features_to_drop_, list)
        # Membership is asserted per element. The previous form,
        # all([x for x in ... if cond]), filtered first and then only tested
        # the truthiness of the surviving names, so it could never fail on
        # the condition itself.
        assert all(x in X.columns for x in sel.features_to_drop_)
        assert len(sel.features_to_drop_) < X.shape[1]
        assert not Xtransformed.empty
        # No dropped feature may remain in the transformed frame.
        assert all(
            x not in sel.features_to_drop_ for x in Xtransformed.columns
        )