def test_feature_filter_none():
    """With ``cols=None`` the filter output equals ``iris`` but is a new object."""
    feature_filter = FeatureFilter(cols=None)
    result = feature_filter.fit_transform(iris)  # type: pd.DataFrame

    # Nothing should be dropped: same contents, yet not the very same object.
    assert result is not iris
    assert result.equals(iris)

    # The fitted drop list must be empty.
    assert feature_filter.drop_ == []
def test_feature_filter_some():
    """Filtering out 'a' and 'b' leaves exactly the 'c' and 'd' columns."""
    result = FeatureFilter(cols=['a', 'b']).fit_transform(iris)

    # Both requested columns must be gone.
    for dropped in ('a', 'b'):
        assert dropped not in result.columns

    # Two columns remain, and they match the corresponding slice of iris.
    assert result.shape[1] == 2
    remaining = iris[['c', 'd']]
    assert result.equals(remaining)
def test_complex_grid_search():
    """Fit a randomized search over a multi-step pipeline with ``n_jobs=2``.

    After fitting, the best estimator is passed to ``profile_estimator`` and
    the pipeline is passed to ``assert_persistable``.
    """
    # NOTE(review): another function named ``test_complex_grid_search``
    # appears later in this source; if both live in the same module the later
    # definition replaces this one and this test never runs -- confirm.

    # Assemble the pipeline steps.
    steps = [
        ('dropper', FeatureFilter()),  # won't drop any
        ('collinearity', MultiCorrFilter(threshold=0.85)),
        ('imputer', SelectiveImputer()),  # pass through since all full
        ('scaler', SelectiveMaxAbsScaler()),
        ('boxcox', BoxCoxTransformer(suppress_warnings=True)),
        ('nzv', NearZeroVarianceFilter()),
        ('pca', SelectivePCA(n_components=0.9)),
        ('custom', make_transformer(subtract_k, k=1)),
        ('model', RandomForestClassifier(n_jobs=1)),
    ]
    pipeline = Pipeline(steps)

    # Hyper-parameter distributions to sample from.
    param_distributions = {
        'collinearity__threshold': uniform(loc=.8, scale=.15),
        'collinearity__method': ['pearson', 'kendall', 'spearman'],
        'pca__n_components': uniform(loc=.75, scale=.2),
        'pca__whiten': [True, False],
        'custom__k': [1, 2, 3],
        'custom__func': [subtract_k, add_k],
        'model__n_estimators': randint(5, 10),
        'model__max_depth': randint(2, 5),
        'model__min_samples_leaf': randint(1, 5),
        'model__max_features': uniform(loc=.5, scale=.5),
        'model__max_leaf_nodes': randint(10, 15),
    }

    # Two iterations are enough to check that the search works at all;
    # n_jobs=2 runs in parallel so pickling of the classes is exercised too.
    searcher = RandomizedSearchCV(
        pipeline,
        param_distributions,
        n_iter=2,
        scoring='accuracy',
        cv=cv,
        random_state=42,
        n_jobs=2,
    )
    searcher.fit(X_train, y_train)

    # Show that the best estimator can be profiled.
    best = searcher.best_estimator_
    profile_estimator(best)

    # Assert that the pipeline is persistable.
    assert_persistable(pipeline, "location.pkl", X_train, y_train)
def test_complex_grid_search():
    """Fit a randomized search over a pipeline whose scaler is also searched."""
    # NOTE(review): another function named ``test_complex_grid_search``
    # appears earlier in this source; if both live in the same module this
    # definition replaces the earlier one -- confirm.

    # Assemble the pipeline steps.
    steps = [
        ('dropper', FeatureFilter()),  # won't drop any
        ('collinearity', MultiCorrFilter(threshold=0.85)),
        ('imputer', SelectiveImputer()),  # pass through since all full
        ('scaler', SelectiveScaler()),
        ('boxcox', BoxCoxTransformer()),
        ('nzv', NearZeroVarianceFilter()),
        ('pca', SelectivePCA(n_components=0.9)),
        ('model', RandomForestClassifier(n_jobs=1)),
    ]
    pipeline = Pipeline(steps)

    # Hyper-parameter distributions to sample from.
    param_distributions = {
        'collinearity__threshold': uniform(loc=.8, scale=.15),
        'collinearity__method': ['pearson', 'kendall', 'spearman'],
        'scaler__scaler': [None, RobustScaler()],
        'pca__n_components': uniform(loc=.75, scale=.2),
        'pca__whiten': [True, False],
        'model__n_estimators': randint(5, 10),
        'model__max_depth': randint(2, 5),
        'model__min_samples_leaf': randint(1, 5),
        'model__max_features': uniform(loc=.5, scale=.5),
        'model__max_leaf_nodes': randint(10, 15),
    }

    # Two iterations are enough to check that the search works at all.
    searcher = RandomizedSearchCV(
        pipeline,
        param_distributions,
        n_iter=2,
        scoring='accuracy',
        cv=cv,
        random_state=42,
    )
    searcher.fit(X_train, y_train)
def test_filter_asdf():
    """Run ``assert_transformer_asdf`` on a default FeatureFilter and ``iris``."""
    default_filter = FeatureFilter()
    assert_transformer_asdf(default_filter, iris)