import numpy as np
import pandas as pd
from numpy.testing import assert_array_equal, assert_raises
from scipy.stats import randint, uniform
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

# The project-specific transformers (NearZeroVarianceFilter, FeatureFilter,
# MultiCorrFilter, SelectiveImputer, SelectiveMaxAbsScaler, SelectiveScaler,
# BoxCoxTransformer, SelectivePCA, make_transformer), the test utilities
# (profile_estimator, assert_persistable, assert_transformer_asdf) and the
# X_train/y_train/cv/iris fixtures are assumed to be imported or defined
# elsewhere in this module.


def test_nzv_bad_freq_cut():
    X = pd.DataFrame.from_records(
        data=np.array([[1, 2, 3],
                       [4, 5, 3],
                       [6, 7, 5]]),
        columns=['a', 'b', 'c'])

    # show fit fails with a bad float value
    nzv_float = NearZeroVarianceFilter(freq_cut=1.)
    assert_raises(ValueError, nzv_float.fit, X)

    # show fit fails with a non-float/int
    nzv_str = NearZeroVarianceFilter(freq_cut='1.')
    assert_raises(ValueError, nzv_str.fit, X)

def test_nzv_non_constant():
    X = pd.DataFrame.from_records(
        data=np.array([[1, 2, 3],
                       [4, 5, 3],
                       [6, 7, 5]]),
        columns=['a', 'b', 'c'])

    # show fit_transform passes with an int freq_cut
    nzv = NearZeroVarianceFilter(freq_cut=2)
    trans = nzv.fit_transform(X)

    # show the output is down a column
    assert trans.shape[1] == 2
    assert nzv.drop_ == ['c']

    # show the ratios are as expected
    assert_array_equal(nzv.ratios_, np.array([1., 1., 2.]))

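# For context on the ``ratios_`` assertions: a column's frequency ratio is the
# count of its most common value divided by the count of its second most
# common value (caret's nearZeroVar convention). A minimal sketch of that
# computation, assuming this convention (the filter computes it internally):
def _freq_ratio(col):
    counts = col.value_counts().values  # sorted descending by count
    if len(counts) < 2:
        return np.inf  # constant column: no second most common value
    return counts[0] / counts[1]

# e.g., for the frame above: a -> 1.0, b -> 1.0, c -> 2.0 (two 3s, one 5)
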
def test_nzv_constant_col():
    X = pd.DataFrame.from_records(
        data=np.array([[1, 2, 3],
                       [4, 5, 3],
                       [6, 7, 3],
                       [8, 9, 3]]),
        columns=['a', 'b', 'c'])

    flt = NearZeroVarianceFilter(freq_cut=25)
    trans = flt.fit_transform(X)

    # show the output is one column shorter
    assert trans.shape[1] == 2
    assert flt.drop_ == ['c']

    # show the ratios are as expected (constant column -> inf)
    assert_array_equal(flt.ratios_, np.array([1., 1., np.inf]))

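# test_complex_grid_search below tunes a custom function transformer over the
# subtract_k and add_k helpers, which aren't shown in this excerpt. A minimal
# sketch of what they presumably look like, with signatures assumed from the
# make_transformer(subtract_k, k=1) usage:
def subtract_k(X, k):
    # shift every value in the frame down by a constant k
    return X - k


def add_k(X, k):
    # shift every value in the frame up by a constant k
    return X + k
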
def test_complex_grid_search():
    # build a pipeline
    pipe = Pipeline([
        ('dropper', FeatureFilter()),  # won't drop any
        ('collinearity', MultiCorrFilter(threshold=0.85)),
        ('imputer', SelectiveImputer()),  # pass through since all full
        ('scaler', SelectiveMaxAbsScaler()),
        ('boxcox', BoxCoxTransformer(suppress_warnings=True)),
        ('nzv', NearZeroVarianceFilter()),
        ('pca', SelectivePCA(n_components=0.9)),
        ('custom', make_transformer(subtract_k, k=1)),
        ('model', RandomForestClassifier(n_jobs=1))
    ])

    # define a set of hyper-parameters over which to search
    hp = {
        'collinearity__threshold': uniform(loc=.8, scale=.15),
        'collinearity__method': ['pearson', 'kendall', 'spearman'],
        'pca__n_components': uniform(loc=.75, scale=.2),
        'pca__whiten': [True, False],
        'custom__k': [1, 2, 3],
        'custom__func': [subtract_k, add_k],
        'model__n_estimators': randint(5, 10),
        'model__max_depth': randint(2, 5),
        'model__min_samples_leaf': randint(1, 5),
        'model__max_features': uniform(loc=.5, scale=.5),
        'model__max_leaf_nodes': randint(10, 15)
    }

    # define the random search
    search = RandomizedSearchCV(
        pipe, hp,
        n_iter=2,  # just enough to test that it works
        scoring='accuracy',
        cv=cv,
        random_state=42,
        n_jobs=2)  # run in parallel so we also test pickling of the classes

    # fit the search
    search.fit(X_train, y_train)

    # show we can profile the best estimator
    profile_estimator(search.best_estimator_)

    # assert that it's persistable
    assert_persistable(pipe, "location.pkl", X_train, y_train)

def test_complex_grid_search_selective_scaler():
    # a simpler variant of the search above that also tunes the scaler itself
    pipe = Pipeline([
        ('dropper', FeatureFilter()),  # won't drop any
        ('collinearity', MultiCorrFilter(threshold=0.85)),
        ('imputer', SelectiveImputer()),  # pass through since all full
        ('scaler', SelectiveScaler()),
        ('boxcox', BoxCoxTransformer()),
        ('nzv', NearZeroVarianceFilter()),
        ('pca', SelectivePCA(n_components=0.9)),
        ('model', RandomForestClassifier(n_jobs=1))
    ])

    # define a set of hyper-parameters over which to search
    hp = {
        'collinearity__threshold': uniform(loc=.8, scale=.15),
        'collinearity__method': ['pearson', 'kendall', 'spearman'],
        'scaler__scaler': [None, RobustScaler()],
        'pca__n_components': uniform(loc=.75, scale=.2),
        'pca__whiten': [True, False],
        'model__n_estimators': randint(5, 10),
        'model__max_depth': randint(2, 5),
        'model__min_samples_leaf': randint(1, 5),
        'model__max_features': uniform(loc=.5, scale=.5),
        'model__max_leaf_nodes': randint(10, 15)
    }

    # define the random search
    search = RandomizedSearchCV(
        pipe, hp,
        n_iter=2,  # just enough to test that it works
        scoring='accuracy',
        cv=cv,
        random_state=42)

    # fit the search
    search.fit(X_train, y_train)

def test_nzf_asdf():
    # "asdf" presumably asserts the transformer's output comes back
    # as a DataFrame ("as DF")
    assert_transformer_asdf(NearZeroVarianceFilter(), iris)