import warnings

from scipy.stats import randint, uniform
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

# NOTE: these module paths follow the pre-0.18 sklearn API that this test
# targets; on modern sklearn, KFold/RandomizedSearchCV live in
# sklearn.model_selection and KFold takes n_splits rather than n/n_folds.
from sklearn.cross_validation import KFold
from sklearn.grid_search import RandomizedSearchCV

# assumed import paths for the skutil Selective* transformers
from skutil.decomposition import SelectivePCA
from skutil.preprocessing import SelectiveScaler


def test_large_grid():
    """In this test, we purposely overfit a RandomForest to completely
    random data in order to assert that the test error will far exceed
    the train error.
    """
    custom_cv = KFold(n=y_train.shape[0], n_folds=3, shuffle=True, random_state=42)

    # define the pipe
    pipe = Pipeline([
        ('scaler', SelectiveScaler()),
        ('pca', SelectivePCA(weight=True)),
        ('rf', RandomForestClassifier(random_state=42))
    ])

    # define the hyperparameter space; the scipy distributions let
    # RandomizedSearchCV sample values rather than exhaust a fixed grid
    hp = {
        'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
        'pca__whiten': [True, False],
        'pca__weight': [True, False],
        'pca__n_components': uniform(0.75, 0.15),  # fraction of variance in [0.75, 0.90]
        'rf__n_estimators': randint(10, 150),
        'rf__max_depth': randint(5, 15)
    }

    # define the grid
    grid = RandomizedSearchCV(pipe, hp, n_iter=5, scoring='accuracy',
                              n_jobs=-1, cv=custom_cv, random_state=42)

    # fit the grid
    grid.fit(X_train, y_train)

    # get predictions
    tr_pred, te_pred = grid.predict(X_train), grid.predict(X_test)

    # evaluate score (train SHOULD be better than random...)
    tr_score = accuracy_score(y_train, tr_pred)
    te_score = accuracy_score(y_test, te_pred)

    # do we want to do this?
    if not tr_score >= te_score:
        warnings.warn('expected training accuracy to be higher '
                      '(train: %.5f, test: %.5f)' % (tr_score, te_score))
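
# --- Hedged fixture sketch --------------------------------------------------
# The test above assumes module-level X_train/X_test/y_train/y_test fixtures
# of "completely random data" (per the docstring). The original fixture is
# not shown in this snippet, so the following is a minimal illustrative
# stand-in, not the library's own setup. skutil's Selective* transformers
# operate on pandas DataFrames, hence the frame here; the shapes, column
# names, and 75/25 split are all assumptions.
import numpy as np
import pandas as pd

rs = np.random.RandomState(42)
X = pd.DataFrame(rs.rand(100, 10), columns=['x%d' % i for i in range(10)])
y = rs.randint(0, 2, 100)  # random binary labels -- nothing learnable

X_train, X_test = X.iloc[:75], X.iloc[75:]
y_train, y_test = y[:75], y[75:]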