Example #1
import warnings

from scipy.stats import randint, uniform
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

# assumed import paths for skutil's selective transformers
from skutil.decomposition import SelectivePCA
from skutil.preprocessing import SelectiveScaler


def test_large_grid():
	"""In this test we purposely overfit a RandomForest to completely random
	data in order to assert that the test error will far exceed the train error.
	"""

	# three shuffled folds over the training set (modern KFold takes n_splits
	# rather than the deprecated n/n_folds arguments)
	custom_cv = KFold(n_splits=3, shuffle=True, random_state=42)

	# define the pipeline: selective scaling, selective PCA, then the forest
	pipe = Pipeline([
			('scaler', SelectiveScaler()),
			('pca',    SelectivePCA(weight=True)),
			('rf',     RandomForestClassifier(random_state=42))
		])

	# define the hyperparameter distributions to sample from
	hp = {
		'scaler__scaler' : [StandardScaler(), RobustScaler(), MinMaxScaler()],
		'pca__whiten' : [True, False],
		'pca__weight' : [True, False],
		'pca__n_components' : uniform(0.75, 0.15),  # sampled uniformly from [0.75, 0.90]
		'rf__n_estimators' : randint(10, 150),      # integers in [10, 150)
		'rf__max_depth' : randint(5, 15)            # integers in [5, 15)
	}

	# define the grid
	grid = RandomizedSearchCV(
		pipe, hp, n_iter=5, scoring='accuracy',
		n_jobs=-1, cv=custom_cv, random_state=42)

	# fit the grid
	grid.fit(X_train, y_train)

	# get predictions
	tr_pred, te_pred = grid.predict(X_train), grid.predict(X_test)

	# evaluate accuracy; training score SHOULD far exceed test score on random labels
	tr_score, te_score = accuracy_score(y_train, tr_pred), accuracy_score(y_test, te_pred)

	# warn rather than assert: the comparison is stochastic, and a hard
	# failure here would make the test flaky
	if tr_score < te_score:
		warnings.warn('expected training accuracy to be higher (train: %.5f, test: %.5f)' % (tr_score, te_score))
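
The test assumes module-level X_train, X_test, y_train and y_test fixtures. A minimal sketch of how they could be built from completely random data, assuming skutil's selective transformers expect a pandas DataFrame; the shapes, column names, and split used here are illustrative, not part of the original test:

import numpy as np
import pandas as pd

# completely random features and labels: there is no signal to learn, so any
# training accuracy above chance is pure memorization by the forest
rs = np.random.RandomState(42)
X = pd.DataFrame(rs.rand(500, 20), columns=['x%d' % i for i in range(20)])
y = rs.randint(0, 2, size=500)

# simple 80/20 holdout split
X_train, X_test = X.iloc[:400], X.iloc[400:]
y_train, y_test = y[:400], y[400:]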