def test_grid_search_allows_nans():
    # Test dcv.GridSearchCV with SimpleImputer
    X = np.arange(20, dtype=np.float64).reshape(5, -1)
    X[2, :] = np.nan
    y = [0, 0, 1, 1, 1]

    imputer = SimpleImputer(strategy="mean", missing_values=np.nan)
    p = Pipeline([("imputer", imputer), ("classifier", MockClassifier())])
    dcv.GridSearchCV(p, {"classifier__foo_param": [1, 2, 3]}, cv=2).fit(X, y)
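# The mock estimators used throughout these tests (MockClassifier and
# friends) are defined elsewhere in the test suite. For reference, here is a
# minimal, hypothetical sketch of what the tests assume: an estimator with a
# single foo_param hyperparameter whose score is a deterministic function of
# that parameter, so the "best" candidate is known in advance. This is an
# illustrative stand-in, not the actual fixture.
class MockClassifierSketch:
    """Hypothetical minimal classifier for exercising grid search."""

    def __init__(self, foo_param=0):
        self.foo_param = foo_param

    def fit(self, X, y):
        # No learning happens; just record the classes seen during fit.
        self.classes_ = np.unique(y)
        return self

    def predict(self, X):
        # Trivially predict the first class for every sample.
        return np.full(len(X), self.classes_[0])

    def score(self, X=None, y=None):
        # Deterministic score: only foo_param > 1 "wins", which makes
        # best_params_ predictable for the assertions in these tests.
        return 1.0 if self.foo_param > 1 else 0.0

    def get_params(self, deep=False):
        return {"foo_param": self.foo_param}

    def set_params(self, **params):
        self.foo_param = params["foo_param"]
        return self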
def test_grid_search_precomputed_kernel_error_nonsquare():
    # Test that grid search returns an error with a non-square precomputed
    # training kernel matrix
    K_train = np.zeros((10, 20))
    y_train = np.ones((10,))
    clf = SVC(kernel="precomputed")
    cv = dcv.GridSearchCV(clf, {"C": [0.1, 1.0]})
    with pytest.raises(ValueError):
        cv.fit(K_train, y_train)
def test_grid_search_failing_classifier():
    X, y = make_classification(n_samples=20, n_features=10, random_state=0)
    clf = FailingClassifier()

    # refit=False because we want to test the behaviour of the grid search part
    gs = dcv.GridSearchCV(
        clf,
        [{"parameter": [0, 1, 2]}],
        scoring="accuracy",
        refit=False,
        error_score=0.0,
    )

    with pytest.warns(FitFailedWarning):
        gs.fit(X, y)

    n_candidates = len(gs.cv_results_["params"])

    # Ensure that grid scores were set to zero as required for those fits
    # that are expected to fail.
    def get_cand_scores(i):
        return np.array(
            [
                gs.cv_results_["split%d_test_score" % s][i]
                for s in range(gs.n_splits_)
            ]
        )

    assert all(
        np.all(get_cand_scores(cand_i) == 0.0)
        for cand_i in range(n_candidates)
        if gs.cv_results_["param_parameter"][cand_i]
        == FailingClassifier.FAILING_PARAMETER
    )

    gs = dcv.GridSearchCV(
        clf,
        [{"parameter": [0, 1, 2]}],
        scoring="accuracy",
        refit=False,
        error_score=float("nan"),
    )
    with pytest.warns(FitFailedWarning):
        gs.fit(X, y)
    n_candidates = len(gs.cv_results_["params"])
    assert all(
        np.all(np.isnan(get_cand_scores(cand_i)))
        for cand_i in range(n_candidates)
        if gs.cv_results_["param_parameter"][cand_i]
        == FailingClassifier.FAILING_PARAMETER
    )
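# FailingClassifier comes from the shared test fixtures. The failing-
# classifier tests only rely on the behaviour sketched below (a hypothetical
# stand-in; the FAILING_PARAMETER value is an assumption): fit raises a
# ValueError for one specific parameter value, which is what error_score and
# the FitFailedWarning machinery react to.
class FailingClassifierSketch:
    """Hypothetical classifier whose fit fails for one parameter value."""

    FAILING_PARAMETER = 2  # assumed; one of the grid values [0, 1, 2]

    def __init__(self, parameter=None):
        self.parameter = parameter

    def fit(self, X, y=None):
        if self.parameter == self.FAILING_PARAMETER:
            # The raise-mode test below expects exactly a ValueError.
            raise ValueError("Failing classifier failed as required")
        return self

    def predict(self, X):
        return np.zeros(len(X), dtype=int)

    def get_params(self, deep=False):
        return {"parameter": self.parameter}

    def set_params(self, **params):
        self.parameter = params["parameter"]
        return self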
def test_search_cv_results_none_param():
    X, y = [[1], [2], [3], [4], [5]], [0, 0, 0, 0, 1]
    estimators = (DecisionTreeRegressor(), DecisionTreeClassifier())
    est_parameters = {"random_state": [0, None]}
    cv = KFold(random_state=0, n_splits=2, shuffle=True)

    for est in estimators:
        grid_search = dcv.GridSearchCV(est, est_parameters, cv=cv).fit(X, y)
        assert_array_equal(grid_search.cv_results_["param_random_state"], [0, None])
def test_scheduler_param(scheduler, n_jobs):
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    gs = dcv.GridSearchCV(
        MockClassifier(),
        {"foo_param": [0, 1, 2]},
        cv=3,
        scheduler=scheduler,
        n_jobs=n_jobs,
    )
    gs.fit(X, y)
def test_search_basic(xy_classification):
    X, y = xy_classification
    param_grid = {"class_weight": [None, "balanced"]}
    a = dms.GridSearchCV(SVC(kernel="rbf", gamma=0.1), param_grid)
    a.fit(X, y)

    param_dist = {"C": stats.uniform}
    b = dms.RandomizedSearchCV(SVC(kernel="rbf", gamma=0.1), param_dist)
    b.fit(X, y)
def test_grid_search_sparse():
    # Test that grid search works with both dense and sparse matrices
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    clf = LinearSVC()
    cv = dcv.GridSearchCV(clf, {"C": [0.1, 1.0]})
    cv.fit(X_[:180], y_[:180])
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator_.C

    X_ = sp.csr_matrix(X_)
    clf = LinearSVC()
    cv = dcv.GridSearchCV(clf, {"C": [0.1, 1.0]})
    cv.fit(X_[:180].tocoo(), y_[:180])
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator_.C

    assert np.mean(y_pred == y_pred2) >= 0.9
    assert C == C2
def test_y_as_list():
    # Pass y as list in dcv.GridSearchCV
    X = np.arange(100).reshape(10, 10)
    y = np.array([0] * 5 + [1] * 5)

    clf = CheckingClassifier(check_y=lambda x: isinstance(x, list))
    cv = KFold(n_splits=3)
    grid_search = dcv.GridSearchCV(clf, {"foo_param": [1, 2, 3]}, cv=cv)
    grid_search.fit(X, y.tolist()).score(X, y)
    assert hasattr(grid_search, "cv_results_")
def test_grid_search_with_multioutput_data():
    # Test search with multi-output estimator
    X, y = make_multilabel_classification(return_indicator=True, random_state=0)

    est_parameters = {"max_depth": [1, 2, 3, 4]}
    cv = KFold(random_state=0, n_splits=3, shuffle=True)
    estimators = [
        DecisionTreeRegressor(random_state=0),
        DecisionTreeClassifier(random_state=0),
    ]
    scoring = sklearn.metrics.make_scorer(
        sklearn.metrics.roc_auc_score, average="weighted"
    )

    # Test with grid search cv
    for est in estimators:
        grid_search = dcv.GridSearchCV(est, est_parameters, cv=cv, scoring=scoring)
        grid_search.fit(X, y)
        res_params = grid_search.cv_results_["params"]
        for cand_i in range(len(res_params)):
            est.set_params(**res_params[cand_i])

            for i, (train, test) in enumerate(cv.split(X, y)):
                est.fit(X[train], y[train])
                correct_score = scoring(est, X[test], y[test])
                assert_almost_equal(
                    correct_score,
                    grid_search.cv_results_["split%d_test_score" % i][cand_i],
                )

    # Test with a randomized search
    for est in estimators:
        random_search = dcv.RandomizedSearchCV(
            est, est_parameters, cv=cv, n_iter=3, scoring=scoring
        )
        random_search.fit(X, y)
        res_params = random_search.cv_results_["params"]
        for cand_i in range(len(res_params)):
            est.set_params(**res_params[cand_i])

            for i, (train, test) in enumerate(cv.split(X, y)):
                est.fit(X[train], y[train])
                correct_score = scoring(est, X[test], y[test])
                assert_almost_equal(
                    correct_score,
                    random_search.cv_results_["split%d_test_score" % i][cand_i],
                )
def test_trivial_cv_results_attr():
    # Test search over a "grid" with only one point.
    # Non-regression test: grid_scores_ wouldn't be set by dcv.GridSearchCV.
    clf = MockClassifier()
    grid_search = dcv.GridSearchCV(clf, {"foo_param": [1]})
    grid_search.fit(X, y)
    assert hasattr(grid_search, "cv_results_")

    random_search = dcv.RandomizedSearchCV(clf, {"foo_param": [0]}, n_iter=1)
    random_search.fit(X, y)
    assert hasattr(random_search, "cv_results_")
def test_search_train_scores_set_to_false():
    X = np.arange(6).reshape(6, -1)
    y = [0, 0, 0, 1, 1, 1]

    clf = LinearSVC(random_state=0)
    gs = dcv.GridSearchCV(clf, param_grid={"C": [0.1, 0.2]}, return_train_score=False)
    gs.fit(X, y)

    for key in gs.cv_results_:
        assert not key.endswith("train_score")
def test_refit():
    # Regression test for bug in refitting
    # Simulates re-fitting a broken estimator; this used to break with
    # sparse SVMs.
    X = np.arange(100).reshape(10, 10)
    y = np.array([0] * 5 + [1] * 5)

    clf = dcv.GridSearchCV(
        BrokenClassifier(), [{"parameter": [0, 1]}], scoring="accuracy", refit=True
    )
    clf.fit(X, y)
def test_gridsearch_nd():
    # Pass a 4-d X and a 3-d y to dcv.GridSearchCV
    X_4d = np.arange(10 * 5 * 3 * 2).reshape(10, 5, 3, 2)
    y_3d = np.arange(10 * 7 * 11).reshape(10, 7, 11)
    clf = CheckingClassifier(
        check_X=lambda x: x.shape[1:] == (5, 3, 2),
        check_y=lambda x: x.shape[1:] == (7, 11),
    )
    grid_search = dcv.GridSearchCV(clf, {"foo_param": [1, 2, 3]})
    grid_search.fit(X_4d, y_3d).score(X, y)
    assert hasattr(grid_search, "cv_results_")
def test_scheduler_param_distributed(loop):  # noqa
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    with cluster() as (s, [a, b]):
        with Client(s["address"], loop=loop) as client:
            gs = dcv.GridSearchCV(MockClassifier(), {"foo_param": [0, 1, 2]}, cv=3)
            gs.fit(X, y)

            def f(dask_scheduler):
                return len(dask_scheduler.transition_log)

            assert client.run_on_scheduler(f)  # some work happened on cluster
def test_grid_search_one_grid_point():
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)
    param_dict = {"C": [1.0], "kernel": ["rbf"], "gamma": [0.1]}

    clf = SVC()
    cv = dcv.GridSearchCV(clf, param_dict)
    cv.fit(X_, y_)

    clf = SVC(C=1.0, kernel="rbf", gamma=0.1)
    clf.fit(X_, y_)

    assert_array_equal(clf.dual_coef_, cv.best_estimator_.dual_coef_)
def test_visualize():
    pytest.importorskip("graphviz")
    X, y = make_classification(n_samples=100, n_classes=2, flip_y=0.2, random_state=0)
    clf = SVC(random_state=0, gamma="auto")
    grid = {"C": [0.1, 0.5, 0.9]}
    gs = dcv.GridSearchCV(clf, grid).fit(X, y)

    assert hasattr(gs, "dask_graph_")

    with tmpdir() as d:
        gs.visualize(filename=os.path.join(d, "mydask"))
        assert os.path.exists(os.path.join(d, "mydask.png"))

    # Doesn't work if not fitted
    gs = dcv.GridSearchCV(clf, grid)
    with pytest.raises(NotFittedError):
        gs.visualize()
def test_gridsearch_no_predict():
    # test grid-search with an estimator without predict.
    # slight duplication of a test from KDE
    def custom_scoring(estimator, X):
        return 42 if estimator.bandwidth == 0.1 else 0

    X, _ = make_blobs(
        cluster_std=0.1, random_state=1, centers=[[0, 1], [1, 0], [0, 0]]
    )
    search = dcv.GridSearchCV(
        KernelDensity(),
        param_grid=dict(bandwidth=[0.01, 0.1, 1]),
        scoring=custom_scoring,
    )
    search.fit(X)
    assert search.best_params_["bandwidth"] == 0.1
    assert search.best_score_ == 42
def test_cv_multiplemetrics_requires_refit_metric():
    # With multiple scorers, refit=True must name the metric to refit on
    X, y = make_classification(random_state=0)
    param_grid = {"max_depth": [1, 5]}

    a = dcv.GridSearchCV(
        RandomForestClassifier(n_estimators=10),
        param_grid,
        refit=True,
        scoring={"score1": "accuracy", "score2": "accuracy"},
    )

    with pytest.raises(ValueError):
        a.fit(X, y)
def test_classes__property():
    # Test that classes_ property matches best_estimator_.classes_
    X = np.arange(100).reshape(10, 10)
    y = np.array([0] * 5 + [1] * 5)
    Cs = [0.1, 1, 10]

    grid_search = dcv.GridSearchCV(LinearSVC(random_state=0), {"C": Cs})
    grid_search.fit(X, y)
    assert_array_equal(grid_search.best_estimator_.classes_, grid_search.classes_)

    # Test that regressors do not have a classes_ attribute
    grid_search = dcv.GridSearchCV(Ridge(), {"alpha": [1.0, 2.0]})
    grid_search.fit(X, y)
    assert not hasattr(grid_search, "classes_")

    # Test that the grid searcher has no classes_ attribute before it's fit
    grid_search = dcv.GridSearchCV(LinearSVC(random_state=0), {"C": Cs})
    assert not hasattr(grid_search, "classes_")

    # Test that the grid searcher has no classes_ attribute without a refit
    grid_search = dcv.GridSearchCV(LinearSVC(random_state=0), {"C": Cs}, refit=False)
    grid_search.fit(X, y)
    assert not hasattr(grid_search, "classes_")
def test_grid_search_bad_param_grid():
    param_dict = {"C": 1.0}
    clf = SVC()
    with pytest.raises(ValueError):
        dcv.GridSearchCV(clf, param_dict)

    param_dict = {"C": []}
    clf = SVC()
    with pytest.raises(ValueError):
        dcv.GridSearchCV(clf, param_dict)

    param_dict = {"C": "1,2,3"}
    clf = SVC()
    with pytest.raises(ValueError):
        dcv.GridSearchCV(clf, param_dict)

    param_dict = {"C": np.ones(6).reshape(3, 2)}
    clf = SVC()
    with pytest.raises(ValueError):
        dcv.GridSearchCV(clf, param_dict)
def test_return_train_score_warn():
    # Test that warnings are raised. Will be removed in sklearn 0.21
    X = np.arange(100).reshape(10, 10)
    y = np.array([0] * 5 + [1] * 5)
    X = (X - X.mean(0)) / X.std(0)  # help convergence
    grid = {"C": [0.1, 0.5]}

    for val in [True, False]:
        est = dcv.GridSearchCV(
            LinearSVC(random_state=0, tol=0.5), grid, return_train_score=val
        )
        with pytest.warns(None) as warns:
            results = est.fit(X, y).cv_results_
        assert not warns
        assert type(results) is dict

    est = dcv.GridSearchCV(LinearSVC(random_state=0), grid)
    with pytest.warns(None) as warns:
        results = est.fit(X, y).cv_results_
    assert not warns

    train_keys = {
        "split0_train_score",
        "split1_train_score",
        "split2_train_score",
        "mean_train_score",
        "std_train_score",
    }

    for key in results:
        if key in train_keys:
            with pytest.warns(FutureWarning):
                results[key]
        else:
            with pytest.warns(None) as warns:
                results[key]
            assert not warns
def test_grid_search_failing_classifier_raise():
    X, y = make_classification(n_samples=20, n_features=10, random_state=0)
    clf = FailingClassifier()

    # refit=False because we want to test the behaviour of the grid search part
    gs = dcv.GridSearchCV(
        clf,
        [{"parameter": [0, 1, 2]}],
        scoring="accuracy",
        refit=False,
        error_score="raise",
    )

    # FailingClassifier issues a ValueError so this is what we look for.
    with pytest.raises(ValueError):
        gs.fit(X, y)
def test_cache_cv():
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    X2 = X.view(CountTakes)
    gs = dcv.GridSearchCV(
        MockClassifier(),
        {"foo_param": [0, 1, 2]},
        cv=3,
        cache_cv=False,
        scheduler="sync",
    )
    gs.fit(X2, y)
    assert X2.count == 2 * 3 * 3  # (1 train + 1 test) * n_params * n_splits

    X2 = X.view(CountTakes)
    assert X2.count == 0
    gs.cache_cv = True
    gs.fit(X2, y)
    assert X2.count == 2 * 3  # (1 test + 1 train) * n_splits
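# CountTakes is a shared test helper. The count assertions above assume
# something like the following hypothetical sketch: an ndarray subclass that
# counts how many times rows are extracted from it via take(), letting the
# test observe how often GridSearchCV materializes CV splits with and without
# cache_cv. Which extraction hook is counted is an assumption of this sketch.
class CountTakesSketch(np.ndarray):
    """Hypothetical ndarray subclass counting take() extractions."""

    count = 0

    def take(self, *args, **kwargs):
        # Each train/test extraction bumps the counter once.
        self.count += 1
        return super().take(*args, **kwargs)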
def test_gridsearch_with_arraylike_fit_param(cache_cv):
    # https://github.com/dask/dask-ml/issues/319
    X, y = make_classification(random_state=0)
    param_grid = {"foo_param": [0.0001, 0.1]}

    a = dcv.GridSearchCV(
        MockClassifierWithFitParam(),
        param_grid,
        cv=3,
        refit=False,
        cache_cv=cache_cv,
    )
    b = GridSearchCV(MockClassifierWithFitParam(), param_grid, cv=3, refit=False)

    b.fit(X, y, mock_fit_param=[0, 1])
    a.fit(X, y, mock_fit_param=[0, 1])
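# MockClassifierWithFitParam is likewise defined with the other mock
# estimators. The test above only needs it to require an array-like keyword
# argument in fit; a hypothetical sketch, building on MockClassifierSketch
# above (both names are illustrative, not the actual fixtures):
class MockClassifierWithFitParamSketch(MockClassifierSketch):
    """Hypothetical mock whose fit requires an array-like parameter."""

    def fit(self, X, y, mock_fit_param=None):
        if mock_fit_param is None:
            raise ValueError("mock_fit_param is required")
        return super().fit(X, y)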
def test_pickle():
    # Test that a fit search can be pickled
    clf = MockClassifier()
    grid_search = dcv.GridSearchCV(clf, {"foo_param": [1, 2, 3]}, refit=True)
    grid_search.fit(X, y)
    grid_search_pickled = pickle.loads(pickle.dumps(grid_search))
    assert_array_almost_equal(grid_search.predict(X), grid_search_pickled.predict(X))

    random_search = dcv.RandomizedSearchCV(
        clf, {"foo_param": [1, 2, 3]}, refit=True, n_iter=3
    )
    random_search.fit(X, y)
    random_search_pickled = pickle.loads(pickle.dumps(random_search))
    assert_array_almost_equal(
        random_search.predict(X), random_search_pickled.predict(X)
    )