def test_large_grid():
    """In this test, we purposely overfit a RandomForest to
    completely random data in order to assert that the test error
    will far exceed the train error.
    """
    if not SK18:
        custom_cv = KFold(n=y_train.shape[0], n_folds=3, shuffle=True, random_state=42)
    else:
        custom_cv = KFold(n_splits=3, shuffle=True, random_state=42)

    # define the pipe
    pipe = Pipeline([
        ('scaler', SelectiveScaler()),
        ('pca', SelectivePCA(weight=True)),
        ('rf', RandomForestClassifier(random_state=42))
    ])

    # define hyper parameters
    hp = {
        'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
        'pca__whiten': [True, False],
        'pca__weight': [True, False],
        'pca__n_components': uniform(0.75, 0.15),
        'rf__n_estimators': randint(5, 10),
        'rf__max_depth': randint(5, 15)
    }

    # define the grid
    grid = RandomizedSearchCV(pipe, hp, n_iter=2, scoring='accuracy',
                              n_jobs=1, cv=custom_cv, random_state=42)

    # this will fail because we haven't fit yet
    assert_fails(grid.score, (ValueError, AttributeError), X_train, y_train)

    # fit the grid
    grid.fit(X_train, y_train)

    # score for coverage -- this might warn...
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        grid.score(X_train, y_train)

    # coverage:
    assert grid._estimator_type == 'classifier'

    # get predictions
    tr_pred, te_pred = grid.predict(X_train), grid.predict(X_test)

    # evaluate score (SHOULD be better than random...)
    accuracy_score(y_train, tr_pred), accuracy_score(y_test, te_pred)

    # grid score reports:
    # assert fails for bad percentile
    assert_fails(report_grid_score_detail, ValueError,
                 random_search=grid, percentile=0.0)
    assert_fails(report_grid_score_detail, ValueError,
                 random_search=grid, percentile=1.0)

    # assert fails for bad y_axis
    assert_fails(report_grid_score_detail, ValueError,
                 random_search=grid, y_axis='bad_axis')

    # assert passes otherwise
    report_grid_score_detail(grid, charts=True, percentile=0.95)  # just ensure percentile works
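
# The premise behind test_large_grid -- a RandomForest fit to pure noise will
# score near-perfectly on its training data and near chance on held-out data --
# can be demonstrated with plain scikit-learn alone. This is a minimal,
# illustrative sketch only: the shapes, seed, and helper name below are
# assumptions, not values drawn from this test suite.
def _demo_overfit_on_noise():
    import numpy as np
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score

    rs = np.random.RandomState(42)
    X_tr, X_te = rs.rand(100, 5), rs.rand(100, 5)  # pure-noise features
    y_tr, y_te = rs.randint(0, 2, 100), rs.randint(0, 2, 100)  # random labels

    rf = RandomForestClassifier(random_state=42).fit(X_tr, y_tr)

    # train accuracy should approach 1.0; test accuracy should hover near 0.5
    return (accuracy_score(y_tr, rf.predict(X_tr)),
            accuracy_score(y_te, rf.predict(X_te)))
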
def test_random_grid():
    # build a pipeline
    pipe = Pipeline([
        ('retainer', FeatureRetainer()),  # will retain all
        ('dropper', FeatureDropper()),  # won't drop any
        ('mapper', FunctionMapper()),  # pass through
        ('encoder', OneHotCategoricalEncoder()),  # no object dtypes, so will pass through
        ('collinearity', MulticollinearityFilterer(threshold=0.85)),
        ('imputer', SelectiveImputer()),  # pass through
        ('scaler', SelectiveScaler()),
        ('boxcox', BoxCoxTransformer()),
        ('nzv', NearZeroVarianceFilterer(threshold=1e-4)),
        ('pca', SelectivePCA(n_components=0.9)),
        ('model', RandomForestClassifier(n_jobs=1))
    ])

    # let's define a set of hyper-parameters over which to search
    hp = {
        'collinearity__threshold': uniform(loc=.8, scale=.15),
        'collinearity__method': ['pearson', 'kendall', 'spearman'],
        'scaler__scaler': [StandardScaler(), RobustScaler()],
        'pca__n_components': uniform(loc=.75, scale=.2),
        'pca__whiten': [True, False],
        'model__n_estimators': randint(5, 10),
        'model__max_depth': randint(2, 5),
        'model__min_samples_leaf': randint(1, 5),
        'model__max_features': uniform(loc=.5, scale=.5),
        'model__max_leaf_nodes': randint(10, 15)
    }

    # define the randomized search
    search = RandomizedSearchCV(pipe, hp,
                                n_iter=2,  # just to test it even works
                                scoring='accuracy',
                                cv=2,
                                random_state=42)

    # fit the search
    search.fit(X_train, y_train)

    # test the report
    report_grid_score_detail(search, charts=False)
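
# Note on the hp dict above: uniform and randint are scipy.stats frozen
# distributions, so uniform(loc=.8, scale=.15) samples from [0.8, 0.95] and
# randint(5, 10) samples integers in [5, 10). A quick, illustrative sanity
# check (the helper name is an assumption, not part of this suite):
def _demo_param_distributions():
    from scipy.stats import randint, uniform

    u = uniform(loc=.8, scale=.15)
    r = randint(5, 10)
    # support of the uniform is [loc, loc + scale]
    assert abs(u.ppf(0) - 0.8) < 1e-9 and abs(u.ppf(1) - 0.95) < 1e-9
    # randint's upper bound is exclusive
    assert set(r.rvs(1000, random_state=42)) <= {5, 6, 7, 8, 9}
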
def test_regular_grid():
    # build a pipeline
    pipe = Pipeline([
        ('retainer', FeatureRetainer()),  # will retain all
        ('dropper', FeatureDropper()),  # won't drop any
        ('mapper', FunctionMapper()),  # pass through
        ('encoder', OneHotCategoricalEncoder()),  # no object dtypes, so will pass through
        ('collinearity', MulticollinearityFilterer(threshold=0.85)),
        ('imputer', SelectiveImputer()),  # pass through since no missing
        ('scaler', SelectiveScaler()),
        ('boxcox', BoxCoxTransformer()),
        ('nzv', NearZeroVarianceFilterer(threshold=1e-4)),
        ('pca', SelectivePCA(n_components=0.9)),
        ('model', RandomForestClassifier(n_jobs=1))
    ])

    # let's define a set of hyper-parameters over which to search
    # (exhaustively, so for the test, just do one of each)
    hp = {
        'collinearity__threshold': [0.90],
        'collinearity__method': ['spearman'],
        'scaler__scaler': [RobustScaler()],
        'pca__n_components': [0.95],
        'pca__whiten': [True],
        'model__n_estimators': [5],
        'model__max_depth': [5],
        'model__min_samples_leaf': [8],
        'model__max_features': [0.75],
        'model__max_leaf_nodes': [20]
    }

    # custom_cv is not otherwise defined in this scope, so build it the
    # same way test_large_grid does
    if not SK18:
        custom_cv = KFold(n=y_train.shape[0], n_folds=3, shuffle=True, random_state=42)
    else:
        custom_cv = KFold(n_splits=3, shuffle=True, random_state=42)

    # define the gridsearch
    search = GridSearchCV(pipe, hp, scoring='accuracy', cv=custom_cv, verbose=1)

    # fit the search
    search.fit(X_train, y_train)

    # search.score(X_train, y_train)  # throws a warning...
    search.predict(X_train)
    search.predict_proba(X_train)
    search.predict_log_proba(X_train)

    # this poses an issue: the models are trained on X as an
    # array, and selecting the best estimator causes the retained
    # names to go away. We need to find a way to force the best_estimator_
    # to retain the names on which to start the training process.
    # search.best_estimator_.predict(X_train)

    # test the report
    report_grid_score_detail(search, charts=False)

    # test with invalid X and ys
    assert_fails(search.fit, Exception, X_train, None)

    # fit the search with a Series as y
    search.fit(X_train, pd.Series(y_train))

    # fit the search with a DataFrame as y
    search.fit(X_train, pd.DataFrame(pd.Series(y_train)))

    # test with invalid X and ys
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        assert_fails(search.fit, Exception, X_train,
                     pd.DataFrame([pd.Series(y_train), pd.Series(y_train)]))

    # test with short y
    assert_fails(search.fit, ValueError, X_train, [0, 1, 2])
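
# One possible workaround for the best_estimator_ naming issue noted in
# test_regular_grid above (a sketch under assumptions, not necessarily how
# this library resolves it): re-fit an unfitted clone of the winning pipeline
# on the original DataFrame, so the name-aware transformers see column names
# during fit. The helper name is hypothetical; `clone` is sklearn's.
def _refit_best_on_frame(search, X_frame, y):
    from sklearn.base import clone

    best = clone(search.best_estimator_)  # unfitted copy with the winning params
    best.fit(X_frame, y)  # fit on the DataFrame so column names are retained
    return best
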