def test_grid_search_failing_classifier(self):
    # ATGridSearchCV with on_error != 'raise': a failing per-fold fit must
    # not abort the search; the affected score is reset to ``error_score``.
    X, y = make_classification(n_samples=20, n_features=10, random_state=0)
    estimator = FailingClassifier()
    # refit=False because we only want to check that errors caused by fits
    # to individual folds are caught (with warnings) rather than raised.
    # With refit enabled the refit itself would raise outside the grid
    # search machinery and break this test.
    for error_score, matches in ((0.0, lambda scores: scores == 0.0),
                                 (float('nan'), np.isnan)):
        search = ATGridSearchCV(
            estimator, [{'parameter': [0, 1, 2]}], scoring='accuracy',
            refit=False, error_score=error_score,
            webserver_url=self.live_server_url)
        # (assert_warns(FitFailedWarning, ...) intentionally not checked here)
        wait(search.fit(X, y))
        # Every fold score of the failing parameter setting must have been
        # replaced by ``error_score``.
        failing_points = [
            point for point in search.grid_scores_
            if point.parameters['parameter'] ==
            FailingClassifier.FAILING_PARAMETER]
        assert all(np.all(matches(point.cv_validation_scores))
                   for point in failing_points)
def test_unsupervised_grid_search(self):
    # Grid search must also work with an unsupervised estimator.
    X, y = make_blobs(random_state=0)
    km = KMeans(random_state=0)
    # With ARI as the scoring metric, the true number of clusters (3)
    # should be selected.
    search = ATGridSearchCV(km, param_grid=dict(n_clusters=[2, 3, 4]),
                            scoring='adjusted_rand_score',
                            webserver_url=self.live_server_url)
    wait(search.fit(X, y))
    assert_equal(search.best_params_["n_clusters"], 3)
    # Without an explicit score, and without y, KMeans' own score is used,
    # which favours more clusters.
    search = ATGridSearchCV(km, param_grid=dict(n_clusters=[2, 3, 4]),
                            webserver_url=self.live_server_url)
    wait(search.fit(X))
    assert_equal(search.best_params_["n_clusters"], 4)
def test_grid_search_sparse(self):
    # The search must produce equivalent models on dense and sparse input.
    X_, y_ = make_classification(n_samples=200, n_features=100,
                                 random_state=0)

    # Fit on the dense matrix first and keep the resulting predictions /
    # chosen C for comparison.
    dense_search = ATGridSearchCV(LinearSVC(), {'C': [0.1, 1.0]},
                                  webserver_url=self.live_server_url)
    wait(dense_search.fit(X_[:180], y_[:180]))
    y_pred = dense_search.best_estimator_.predict(X_[180:])
    C = dense_search.best_estimator_.C

    # Repeat with the same data as a sparse (COO) matrix.
    X_ = sp.csr_matrix(X_)
    sparse_search = ATGridSearchCV(LinearSVC(), {'C': [0.1, 1.0]},
                                   webserver_url=self.live_server_url)
    wait(sparse_search.fit(X_[:180].tocoo(), y_[:180]))
    y_pred2 = sparse_search.best_estimator_.predict(X_[180:])
    C2 = sparse_search.best_estimator_.C

    # Predictions should mostly agree and the selected C must be identical.
    assert_true(np.mean(y_pred == y_pred2) >= .9)
    assert_equal(C, C2)
def test_grid_search_sparse_scoring(self):
    # With an explicit scorer, dense and sparse input must agree, and an
    # equivalent loss scorer (greater_is_better=False) must select the
    # same model.
    X_, y_ = make_classification(n_samples=200, n_features=100,
                                 random_state=0)

    clf = LinearSVC()
    search = ATGridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1",
                            webserver_url=self.live_server_url)
    wait(search.fit(X_[:180], y_[:180]))
    y_pred = search.predict(X_[180:])
    C = search.best_estimator_.C

    # Same search again on a sparse version of the data.
    X_ = sp.csr_matrix(X_)
    clf = LinearSVC()
    search = ATGridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1",
                            webserver_url=self.live_server_url)
    wait(search.fit(X_[:180], y_[:180]))
    y_pred2 = search.predict(X_[180:])
    C2 = search.best_estimator_.C

    assert_array_equal(y_pred, y_pred2)
    assert_equal(C, C2)

    # A loss version of f1 (lower is better) should pick the same C and
    # yield the same predictions.
    def f1_loss(y_true_, y_pred_):
        return -f1_score(y_true_, y_pred_)

    F1Loss = make_scorer(f1_loss, greater_is_better=False)
    search = ATGridSearchCV(clf, {'C': [0.1, 1.0]}, scoring=F1Loss,
                            webserver_url=self.live_server_url)
    wait(search.fit(X_[:180], y_[:180]))
    y_pred3 = search.predict(X_[180:])
    C3 = search.best_estimator_.C
    assert_equal(C, C3)
    assert_array_equal(y_pred, y_pred3)
def test_grid_search_score_method(self):
    # ``score`` must honour the scorer passed at construction time, even
    # when the wrapped estimator has no ``score`` method of its own.
    X, y = make_classification(n_samples=100, n_classes=2, flip_y=.2,
                               random_state=0)
    clf = LinearSVC(random_state=0)
    grid = {'C': [.1]}

    # Small local helper: build, fit and return a search over ``grid``.
    def fitted(estimator, **kwargs):
        search = ATGridSearchCV(estimator, grid,
                                webserver_url=self.live_server_url,
                                **kwargs)
        wait(search.fit(X, y))
        return search

    search_no_scoring = fitted(clf, scoring=None)
    search_accuracy = fitted(clf, scoring='accuracy')
    search_no_score_method_auc = fitted(LinearSVCNoScore(),
                                        scoring='roc_auc')
    search_auc = fitted(clf, scoring='roc_auc')

    # ChangedBehaviourWarning occurred previously (prior to #9005)
    score_no_scoring = assert_no_warnings(search_no_scoring.score, X, y)
    score_accuracy = assert_no_warnings(search_accuracy.score, X, y)
    score_no_score_auc = assert_no_warnings(
        search_no_score_method_auc.score, X, y)
    score_auc = assert_no_warnings(search_auc.score, X, y)

    # Sanity: neither metric is perfect, and the two metrics disagree,
    # while each pair of equivalent configurations agrees.
    assert_true(score_auc < 1.0)
    assert_true(score_accuracy < 1.0)
    assert_not_equal(score_auc, score_accuracy)
    assert_almost_equal(score_accuracy, score_no_scoring)
    assert_almost_equal(score_auc, score_no_score_auc)
def test_grid_search(self):
    # Test that the best estimator contains the right value for foo_param
    # and that the grid scores line up with the parameter grid.
    clf = MockClassifier()
    grid_search = ATGridSearchCV(clf, {'foo_param': [1, 2, 3]}, verbose=3,
                                 webserver_url=self.live_server_url)
    # verbose=3 prints progress; capture it so the test output stays clean.
    # Restore stdout in ``finally`` so a raising ``fit`` cannot leave it
    # redirected for the rest of the test run (the original plain
    # assignment skipped restoration on error).
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        wait(grid_search.fit(X, y))
    finally:
        sys.stdout = old_stdout
    # make sure it selects the smallest parameter in case of ties
    self.assertIn(grid_search.best_estimator_.foo_param, [2, 3])

    # The sorted grid scores must enumerate the grid in parameter order.
    _mock_sort = partial(_sort_grid_scores, param='foo_param')
    for idx, tup in enumerate(
            sorted(grid_search.grid_scores_, key=cmp_to_key(_mock_sort))):
        self.assertEqual(tup[0], {'foo_param': idx + 1},
                         '%d%s' % (idx, tup))

    # Smoke test the score etc:
    grid_search.score(X, y)
    grid_search.predict_proba(X)
    grid_search.decision_function(X)
    grid_search.transform(X)

    # Test exception handling on scoring
    grid_search.scoring = 'sklearn'
    self.assertRaises(ValueError, grid_search.fit, X, y)