Example #1
def test_bad_error_score():
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    gs = DaskGridSearchCV(MockClassifier(), {'foo_param': [0, 1, 2]},
                          error_score='badparam')

    with pytest.raises(ValueError):
        gs.fit(X, y)
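Note: Example #1 above and several examples below reuse helpers that are defined at module level in the original test file and are not shown in these snippets, notably MockClassifier and shared X, y fixtures (used in Examples #10, #15 and #18). The sketch below is a hypothetical stand-in, with names and exact behaviour assumed rather than taken from the project's actual code, just enough to make those snippets runnable.

import numpy as np

# Hypothetical module-level fixtures assumed by later examples (not shown in the snippets).
X = np.ones((10, 2))
y = np.concatenate([np.zeros(5), np.ones(5)])


class MockClassifier(object):
    """Dummy estimator whose score depends only on foo_param."""

    def __init__(self, foo_param=0):
        self.foo_param = foo_param

    def fit(self, X, y):
        return self

    def predict(self, X):
        return np.zeros(len(X))

    def score(self, X=None, y=None):
        # Only foo_param > 1 scores well, so the search has something to rank.
        return 1.0 if self.foo_param > 1 else 0.0

    def get_params(self, deep=False):
        return {'foo_param': self.foo_param}

    def set_params(self, **params):
        self.foo_param = params['foo_param']
        return self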
Example #2
def test_grid_search_bad_param_grid():
    param_dict = {"C": 1.0}
    clf = SVC()

    with pytest.raises(ValueError) as exc:
        DaskGridSearchCV(clf, param_dict)
    assert ("Parameter values for parameter (C) need to be a sequence"
            "(but not a string) or np.ndarray.") in str(exc.value)

    param_dict = {"C": []}
    clf = SVC()

    with pytest.raises(ValueError) as exc:
        DaskGridSearchCV(clf, param_dict)
    assert ("Parameter values for parameter (C) need to be a non-empty "
            "sequence.") in str(exc.value)

    param_dict = {"C": "1,2,3"}
    clf = SVC()

    with pytest.raises(ValueError) as exc:
        DaskGridSearchCV(clf, param_dict)
    assert ("Parameter values for parameter (C) need to be a sequence"
            "(but not a string) or np.ndarray.") in str(exc.value)

    param_dict = {"C": np.ones(6).reshape(3, 2)}
    clf = SVC()
    with pytest.raises(ValueError):
        DaskGridSearchCV(clf, param_dict)
Example #3
def test_search_train_scores_set_to_false():
    X = np.arange(6).reshape(6, -1)
    y = [0, 0, 0, 1, 1, 1]
    clf = LinearSVC(random_state=0)

    gs = DaskGridSearchCV(clf, param_grid={'C': [0.1, 0.2]},
                          return_train_score=False)
    gs.fit(X, y)
Example #4
def test_grid_search_error():
    # Test that grid search will capture errors on data with mismatched lengths
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    clf = LinearSVC()
    cv = DaskGridSearchCV(clf, {'C': [0.1, 1.0]})
    with pytest.raises(ValueError):
        cv.fit(X_[:180], y_)
Example #5
def test_grid_search_precomputed_kernel_error_nonsquare():
    # Test that grid search returns an error with a non-square precomputed
    # training kernel matrix
    K_train = np.zeros((10, 20))
    y_train = np.ones((10, ))
    clf = SVC(kernel='precomputed')
    cv = DaskGridSearchCV(clf, {'C': [0.1, 1.0]})
    with pytest.raises(ValueError):
        cv.fit(K_train, y_train)
Example #6
def test_gridsearch_nd():
    # Pass n-dimensional arrays (4-d X, 3-d y) through DaskGridSearchCV
    X_4d = np.arange(10 * 5 * 3 * 2).reshape(10, 5, 3, 2)
    y_3d = np.arange(10 * 7 * 11).reshape(10, 7, 11)
    clf = CheckingClassifier(check_X=lambda x: x.shape[1:] == (5, 3, 2),
                             check_y=lambda x: x.shape[1:] == (7, 11))
    grid_search = DaskGridSearchCV(clf, {'foo_param': [1, 2, 3]})
    grid_search.fit(X_4d, y_3d).score(X, y)
    assert hasattr(grid_search, "cv_results_")
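CheckingClassifier is a scikit-learn test mock that runs user-supplied assertions (check_X, check_y) on the data it receives inside fit. A minimal sketch of the behaviour relied on above and in Example #8, assumed rather than copied from the real implementation:

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin


class CheckingClassifier(BaseEstimator, ClassifierMixin):
    """Mock that asserts check_X(X) / check_y(y) inside fit()."""

    def __init__(self, check_X=None, check_y=None, foo_param=0):
        self.check_X = check_X
        self.check_y = check_y
        self.foo_param = foo_param

    def fit(self, X, y):
        assert len(X) == len(y)
        if self.check_X is not None:
            assert self.check_X(X)
        if self.check_y is not None:
            assert self.check_y(y)
        return self

    def predict(self, X):
        return np.zeros(len(X))

    def score(self, X=None, y=None):
        return 1.0 if self.foo_param > 1 else 0.0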
Example #7
def test_refit():
    # Regression test for bug in refitting
    # Simulates re-fitting a broken estimator; this used to break with
    # sparse SVMs.
    X = np.arange(100).reshape(10, 10)
    y = np.array([0] * 5 + [1] * 5)

    clf = DaskGridSearchCV(BrokenClassifier(), [{'parameter': [0, 1]}],
                           scoring="precision", refit=True)
    clf.fit(X, y)
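BrokenClassifier (another test-only helper) errors out if fit() is called a second time on the same instance, so this test only passes if refitting happens on a fresh clone. A minimal sketch, assuming roughly the behaviour the regression test exercises:

import numpy as np
from sklearn.base import BaseEstimator


class BrokenClassifier(BaseEstimator):
    """Fails if the same instance is fit more than once."""

    def __init__(self, parameter=None):
        self.parameter = parameter

    def fit(self, X, y):
        assert not hasattr(self, 'has_been_fit_')
        self.has_been_fit_ = True
        return self

    def predict(self, X):
        return np.zeros(X.shape[0])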
Example #8
def test_y_as_list():
    # Pass y as list in DaskGridSearchCV
    X = np.arange(100).reshape(10, 10)
    y = np.array([0] * 5 + [1] * 5)

    clf = CheckingClassifier(check_y=lambda x: isinstance(x, list))
    cv = KFold(n_splits=3)
    grid_search = DaskGridSearchCV(clf, {'foo_param': [1, 2, 3]}, cv=cv)
    grid_search.fit(X, y.tolist()).score(X, y)
    assert hasattr(grid_search, "cv_results_")
Example #9
def test_grid_search_groups():
    # Check if ValueError (when groups is None) propagates to DaskGridSearchCV
    # And also check if groups is correctly passed to the cv object
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = rng.randint(0, 3, 15)

    clf = LinearSVC(random_state=0)
    grid = {'C': [1]}

    group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(),
                 GroupShuffleSplit()]
    for cv in group_cvs:
        gs = DaskGridSearchCV(clf, grid, cv=cv)

        with pytest.raises(ValueError) as exc:
            assert gs.fit(X, y)
        assert "The groups parameter should not be None" in str(exc.value)

        gs.fit(X, y, groups=groups)

    non_group_cvs = [StratifiedKFold(), StratifiedShuffleSplit()]
    for cv in non_group_cvs:
        gs = DaskGridSearchCV(clf, grid, cv=cv)
        # Should not raise an error
        gs.fit(X, y)
Example #10
def test_trivial_cv_results_attr():
    # Test search over a "grid" with only one point.
    # Non-regression test: grid_scores_ wouldn't be set by DaskGridSearchCV.
    clf = MockClassifier()
    grid_search = DaskGridSearchCV(clf, {'foo_param': [1]})
    grid_search.fit(X, y)
    assert hasattr(grid_search, "cv_results_")

    random_search = DaskRandomizedSearchCV(clf, {'foo_param': [0]}, n_iter=1)
    random_search.fit(X, y)
    assert hasattr(random_search, "cv_results_")
Example #11
def test_grid_search_failing_classifier_raise():
    X, y = make_classification(n_samples=20, n_features=10, random_state=0)
    clf = FailingClassifier()

    # refit=False because we want to test the behaviour of the grid search part
    gs = DaskGridSearchCV(clf, [{'parameter': [0, 1, 2]}], scoring='accuracy',
                          refit=False, error_score='raise')

    # FailingClassifier issues a ValueError so this is what we look for.
    with pytest.raises(ValueError):
        gs.fit(X, y)
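FailingClassifier raises a ValueError inside fit() for one particular parameter value (FailingClassifier.FAILING_PARAMETER, also used in Example #14), which is what the error_score handling is tested against. A minimal sketch, assumed rather than copied from the test helpers:

import numpy as np
from sklearn.base import BaseEstimator


class FailingClassifier(BaseEstimator):
    """Raises on fit() when parameter equals FAILING_PARAMETER."""

    FAILING_PARAMETER = 2

    def __init__(self, parameter=None):
        self.parameter = parameter

    def fit(self, X, y=None):
        if self.parameter == FailingClassifier.FAILING_PARAMETER:
            raise ValueError("Failing classifier failed as required")
        return self

    def predict(self, X):
        return np.zeros(X.shape[0])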
Example #12
def test_grid_search_one_grid_point():
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)
    param_dict = {"C": [1.0], "kernel": ["rbf"], "gamma": [0.1]}

    clf = SVC()
    cv = DaskGridSearchCV(clf, param_dict)
    cv.fit(X_, y_)

    clf = SVC(C=1.0, kernel="rbf", gamma=0.1)
    clf.fit(X_, y_)

    assert_array_equal(clf.dual_coef_, cv.best_estimator_.dual_coef_)
Example #13
def test_gridsearch_no_predict():
    # test grid-search with an estimator without predict.
    # slight duplication of a test from KDE
    def custom_scoring(estimator, X):
        return 42 if estimator.bandwidth == .1 else 0
    X, _ = make_blobs(cluster_std=.1, random_state=1,
                      centers=[[0, 1], [1, 0], [0, 0]])
    search = DaskGridSearchCV(KernelDensity(),
                              param_grid=dict(bandwidth=[.01, .1, 1]),
                              scoring=custom_scoring)
    search.fit(X)
    assert search.best_params_['bandwidth'] == .1
    assert search.best_score_ == 42
Example #14
def test_grid_search_failing_classifier():
    X, y = make_classification(n_samples=20, n_features=10, random_state=0)
    clf = FailingClassifier()

    # refit=False because we want to test the behaviour of the grid search part
    gs = DaskGridSearchCV(clf, [{'parameter': [0, 1, 2]}], scoring='accuracy',
                          refit=False, error_score=0.0)

    with pytest.warns(FitFailedWarning):
        gs.fit(X, y)

    n_candidates = len(gs.cv_results_['params'])

    # Ensure that grid scores were set to zero as required for those fits
    # that are expected to fail.
    def get_cand_scores(i):
        return np.array(list(gs.cv_results_['split%d_test_score' % s][i]
                             for s in range(gs.n_splits_)))

    assert all((np.all(get_cand_scores(cand_i) == 0.0)
                for cand_i in range(n_candidates)
                if gs.cv_results_['param_parameter'][cand_i] ==
                FailingClassifier.FAILING_PARAMETER))

    gs = DaskGridSearchCV(clf, [{'parameter': [0, 1, 2]}], scoring='accuracy',
                          refit=False, error_score=float('nan'))

    with pytest.warns(FitFailedWarning):
        gs.fit(X, y)

    n_candidates = len(gs.cv_results_['params'])
    assert all(np.all(np.isnan(get_cand_scores(cand_i)))
               for cand_i in range(n_candidates)
               if gs.cv_results_['param_parameter'][cand_i] ==
               FailingClassifier.FAILING_PARAMETER)
Example #15
def test_pickle():
    # Test that a fit search can be pickled
    clf = MockClassifier()
    grid_search = DaskGridSearchCV(clf, {'foo_param': [1, 2, 3]}, refit=True)
    grid_search.fit(X, y)
    grid_search_pickled = pickle.loads(pickle.dumps(grid_search))
    assert_array_almost_equal(grid_search.predict(X),
                              grid_search_pickled.predict(X))

    random_search = DaskRandomizedSearchCV(clf, {'foo_param': [1, 2, 3]},
                                           refit=True, n_iter=3)
    random_search.fit(X, y)
    random_search_pickled = pickle.loads(pickle.dumps(random_search))
    assert_array_almost_equal(random_search.predict(X),
                              random_search_pickled.predict(X))
Example #16
def test_predict_proba_disabled():
    # Test predict_proba when disabled on estimator.
    X = np.arange(20).reshape(5, -1)
    y = [0, 0, 1, 1, 1]
    clf = SVC(probability=False)
    gs = DaskGridSearchCV(clf, {}, cv=2).fit(X, y)
    assert not hasattr(gs, "predict_proba")
Example #17
def test_search_cv_results_rank_tie_breaking():
    X, y = make_blobs(n_samples=50, random_state=42)

    # The two C values are close enough to give similar models
    # which would result in a tie of their mean cv-scores
    param_grid = {'C': [1, 1.001, 0.001]}

    grid_search = DaskGridSearchCV(SVC(), param_grid=param_grid)
    random_search = DaskRandomizedSearchCV(SVC(), n_iter=3,
                                           param_distributions=param_grid)

    for search in (grid_search, random_search):
        search.fit(X, y)
        cv_results = search.cv_results_
        # Check tie breaking strategy -
        # Check that there is a tie in the mean scores between
        # candidates 1 and 2 alone
        assert_almost_equal(cv_results['mean_test_score'][0],
                            cv_results['mean_test_score'][1])
        assert_almost_equal(cv_results['mean_train_score'][0],
                            cv_results['mean_train_score'][1])
        try:
            assert_almost_equal(cv_results['mean_test_score'][1],
                                cv_results['mean_test_score'][2])
        except AssertionError:
            pass
        try:
            assert_almost_equal(cv_results['mean_train_score'][1],
                                cv_results['mean_train_score'][2])
        except AssertionError:
            pass
        # 'min' rank should be assigned to the tied candidates
        assert_almost_equal(search.cv_results_['rank_test_score'], [1, 1, 3])
Example #18
def test_no_refit():
    # Test that GSCV can be used for model selection alone without refitting
    clf = MockClassifier()
    grid_search = DaskGridSearchCV(clf, {'foo_param': [1, 2, 3]}, refit=False)
    grid_search.fit(X, y)
    assert (not hasattr(grid_search, "best_estimator_") and
            hasattr(grid_search, "best_index_") and
            hasattr(grid_search, "best_params_"))

    # Make sure that predict/transform etc. raise a meaningful error message
    for fn_name in ('predict', 'predict_proba', 'predict_log_proba',
                    'transform', 'inverse_transform'):
        with pytest.raises(NotFittedError) as exc:
            getattr(grid_search, fn_name)(X)
        assert (('refit=False. %s is available only after refitting on the '
                 'best parameters' % fn_name) in str(exc.value))
Example #19
def test_grid_search_precomputed_kernel():
    # Test that grid search works when the input features are given in the
    # form of a precomputed kernel matrix
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    # compute the training kernel matrix corresponding to the linear kernel
    K_train = np.dot(X_[:180], X_[:180].T)
    y_train = y_[:180]

    clf = SVC(kernel='precomputed')
    cv = DaskGridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(K_train, y_train)

    assert cv.best_score_ >= 0

    # compute the test kernel matrix
    K_test = np.dot(X_[180:], X_[:180].T)
    y_test = y_[180:]

    y_pred = cv.predict(K_test)

    assert np.mean(y_pred == y_test) >= 0

    # test error is raised when the precomputed kernel is not array-like
    # or sparse
    with pytest.raises(ValueError):
        cv.fit(K_train.tolist(), y_train)
Example #20
def test_grid_search_allows_nans():
    # Test DaskGridSearchCV with Imputer
    X = np.arange(20, dtype=np.float64).reshape(5, -1)
    X[2, :] = np.nan
    y = [0, 0, 1, 1, 1]
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    DaskGridSearchCV(p, {'classifier__foo_param': [1, 2, 3]}, cv=2).fit(X, y)
Example #21
def test_grid_search_with_multioutput_data():
    # Test search with multi-output estimator

    X, y = make_multilabel_classification(return_indicator=True,
                                          random_state=0)

    est_parameters = {"max_depth": [1, 2, 3, 4]}
    cv = KFold(random_state=0)

    estimators = [DecisionTreeRegressor(random_state=0),
                  DecisionTreeClassifier(random_state=0)]

    # Test with grid search cv
    for est in estimators:
        grid_search = DaskGridSearchCV(est, est_parameters, cv=cv)
        grid_search.fit(X, y)
        res_params = grid_search.cv_results_['params']
        for cand_i in range(len(res_params)):
            est.set_params(**res_params[cand_i])

            for i, (train, test) in enumerate(cv.split(X, y)):
                est.fit(X[train], y[train])
                correct_score = est.score(X[test], y[test])
                assert_almost_equal(
                    correct_score,
                    grid_search.cv_results_['split%d_test_score' % i][cand_i])

    # Test with a randomized search
    for est in estimators:
        random_search = DaskRandomizedSearchCV(est, est_parameters,
                                               cv=cv, n_iter=3)
        random_search.fit(X, y)
        res_params = random_search.cv_results_['params']
        for cand_i in range(len(res_params)):
            est.set_params(**res_params[cand_i])

            for i, (train, test) in enumerate(cv.split(X, y)):
                est.fit(X[train], y[train])
                correct_score = est.score(X[test], y[test])
                assert_almost_equal(
                    correct_score,
                    random_search.cv_results_['split%d_test_score'
                                              % i][cand_i])
Example #22
def test_grid_search_sparse():
    # Test that grid search works with both dense and sparse matrices
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    clf = LinearSVC()
    cv = DaskGridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(X_[:180], y_[:180])
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator_.C

    X_ = sp.csr_matrix(X_)
    clf = LinearSVC()
    cv = DaskGridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(X_[:180].tocoo(), y_[:180])
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator_.C

    assert np.mean(y_pred == y_pred2) >= .9
    assert C == C2
Example #23
def test_search_cv_results_none_param():
    X, y = [[1], [2], [3], [4], [5]], [0, 0, 0, 0, 1]
    estimators = (DecisionTreeRegressor(), DecisionTreeClassifier())
    est_parameters = {"random_state": [0, None]}
    cv = KFold(random_state=0)

    for est in estimators:
        grid_search = DaskGridSearchCV(est, est_parameters, cv=cv).fit(X, y)
        assert_array_equal(grid_search.cv_results_['param_random_state'],
                           [0, None])
Example #24
def test_pipeline_feature_union():
    iris = load_iris()
    X, y = iris.data, iris.target

    pca = PCA(random_state=0)
    kbest = SelectKBest()
    empty_union = FeatureUnion([('first', None), ('second', None)])
    empty_pipeline = Pipeline([('first', None), ('second', None)])
    svc = SVC(kernel='linear', random_state=0)

    pipe = Pipeline([('empty_pipeline', empty_pipeline), ('missing', None),
                     ('union',
                      FeatureUnion([('pca', pca), ('missing', None),
                                    ('kbest', kbest),
                                    ('empty_union', empty_union)],
                                   transformer_weights={'pca': 0.5})),
                     ('svc', svc)])

    param_grid = dict(union__pca__n_components=[1, 2, 3],
                      union__kbest__k=[1, 2],
                      svc__C=[0.1, 1, 10])

    gs = GridSearchCV(pipe, param_grid=param_grid)
    gs.fit(X, y)
    dgs = DaskGridSearchCV(pipe, param_grid=param_grid, get=dask.get)
    dgs.fit(X, y)

    # Check best params match
    assert gs.best_params_ == dgs.best_params_

    # Check PCA components match
    sk_pca = gs.best_estimator_.named_steps['union'].transformer_list[0][1]
    dk_pca = dgs.best_estimator_.named_steps['union'].transformer_list[0][1]
    np.testing.assert_allclose(sk_pca.components_, dk_pca.components_)

    # Check SelectKBest scores match
    sk_kbest = gs.best_estimator_.named_steps['union'].transformer_list[2][1]
    dk_kbest = dgs.best_estimator_.named_steps['union'].transformer_list[2][1]
    np.testing.assert_allclose(sk_kbest.scores_, dk_kbest.scores_)

    # Check SVC coefs match
    np.testing.assert_allclose(gs.best_estimator_.named_steps['svc'].coef_,
                               dgs.best_estimator_.named_steps['svc'].coef_)
Example #25
def test_grid_search_cv_results():
    X, y = make_classification(n_samples=50, n_features=4,
                               random_state=42)

    n_splits = 3
    n_grid_points = 6
    params = [dict(kernel=['rbf', ], C=[1, 10], gamma=[0.1, 1]),
              dict(kernel=['poly', ], degree=[1, 2])]
    grid_search = DaskGridSearchCV(SVC(), cv=n_splits, iid=False,
                                   param_grid=params)
    grid_search.fit(X, y)
    grid_search_iid = DaskGridSearchCV(SVC(), cv=n_splits, iid=True,
                                       param_grid=params)
    grid_search_iid.fit(X, y)

    param_keys = ('param_C', 'param_degree', 'param_gamma', 'param_kernel')
    score_keys = ('mean_test_score', 'mean_train_score',
                  'rank_test_score',
                  'split0_test_score', 'split1_test_score',
                  'split2_test_score',
                  'split0_train_score', 'split1_train_score',
                  'split2_train_score',
                  'std_test_score', 'std_train_score')
    n_candidates = n_grid_points

    for search, iid in zip((grid_search, grid_search_iid), (False, True)):
        assert iid == search.iid
        cv_results = search.cv_results_
        # Check if score and timing are reasonable
        assert all(cv_results['rank_test_score'] >= 1)
        assert all(all(cv_results[k] >= 0) for k in score_keys
                   if k != 'rank_test_score')
        assert all(all(cv_results[k] <= 1) for k in score_keys
                   if 'time' not in k and k != 'rank_test_score')
        # Check cv_results structure
        check_cv_results_array_types(cv_results, param_keys, score_keys)
        check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates)
        # Check masking
        cv_results = grid_search.cv_results_
        n_candidates = len(grid_search.cv_results_['params'])
        assert all((cv_results['param_C'].mask[i] and
                    cv_results['param_gamma'].mask[i] and
                    not cv_results['param_degree'].mask[i])
                    for i in range(n_candidates)
                    if cv_results['param_kernel'][i] == 'poly')
        assert all((not cv_results['param_C'].mask[i] and
                    not cv_results['param_gamma'].mask[i] and
                    cv_results['param_degree'].mask[i])
                    for i in range(n_candidates)
                    if cv_results['param_kernel'][i] == 'rbf')
Example #26
def test_unsupervised_grid_search():
    # test grid-search with unsupervised estimator
    X, y = make_blobs(random_state=0)
    km = KMeans(random_state=0)
    grid_search = DaskGridSearchCV(km, param_grid=dict(n_clusters=[2, 3, 4]),
                                   scoring='adjusted_rand_score')
    grid_search.fit(X, y)
    # ARI can find the right number :)
    assert grid_search.best_params_["n_clusters"] == 3

    # Now without a score, and without y
    grid_search = DaskGridSearchCV(km, param_grid=dict(n_clusters=[2, 3, 4]))
    grid_search.fit(X)
    assert grid_search.best_params_["n_clusters"] == 4
Example #27
def test_grid_search_dask_inputs():
    # Numpy versions
    np_X, np_y = make_classification(n_samples=15, n_classes=2, random_state=0)
    np_groups = np.random.RandomState(0).randint(0, 3, 15)
    # Dask array versions
    da_X = da.from_array(np_X, chunks=5)
    da_y = da.from_array(np_y, chunks=5)
    da_groups = da.from_array(np_groups, chunks=5)
    # Delayed versions
    del_X = delayed(np_X)
    del_y = delayed(np_y)
    del_groups = delayed(np_groups)

    cv = GroupKFold()
    clf = SVC(random_state=0)
    grid = {'C': [1]}

    sol = SVC(C=1, random_state=0).fit(np_X, np_y).support_vectors_

    for X, y, groups in product([np_X, da_X, del_X], [np_y, da_y, del_y],
                                [np_groups, da_groups, del_groups]):
        gs = DaskGridSearchCV(clf, grid, cv=cv)

        with pytest.raises(ValueError) as exc:
            gs.fit(X, y)
        assert "The groups parameter should not be None" in str(exc.value)

        gs.fit(X, y, groups=groups)
        np.testing.assert_allclose(sol, gs.best_estimator_.support_vectors_)
Example #28
def test_grid_search_correct_score_results():
    # test that correct scores are used
    n_splits = 3
    clf = LinearSVC(random_state=0)
    X, y = make_blobs(random_state=0, centers=2)
    Cs = [.1, 1, 10]
    for score in ['f1', 'roc_auc']:
        # XXX: It seems there's some global shared state in LinearSVC - fitting
        # multiple `SVC` instances in parallel using threads sometimes results
        # in wrong results. This only happens with threads, not processes/sync.
        # For now, we'll fit using the sync scheduler.
        grid_search = DaskGridSearchCV(clf, {'C': Cs}, scoring=score,
                                       cv=n_splits, get=dask.get)
        cv_results = grid_search.fit(X, y).cv_results_

        # Test scorer names
        result_keys = list(cv_results.keys())
        expected_keys = (("mean_test_score", "rank_test_score") +
                         tuple("split%d_test_score" % cv_i
                               for cv_i in range(n_splits)))
        assert all(in1d(expected_keys, result_keys))

        cv = StratifiedKFold(n_splits=n_splits)
        n_splits = grid_search.n_splits_
        for candidate_i, C in enumerate(Cs):
            clf.set_params(C=C)
            cv_scores = np.array(
                list(grid_search.cv_results_['split%d_test_score'
                                             % s][candidate_i]
                     for s in range(n_splits)))
            for i, (train, test) in enumerate(cv.split(X, y)):
                clf.fit(X[train], y[train])
                if score == "f1":
                    correct_score = f1_score(y[test], clf.predict(X[test]))
                elif score == "roc_auc":
                    dec = clf.decision_function(X[test])
                    correct_score = roc_auc_score(y[test], dec)
                assert_almost_equal(correct_score, cv_scores[i])
Example #29
def main(argc, argv):
    '''
    grid = {
        'max_depth'         : range(1,12,2),
        'learning_rate'     : [0.01, 0.03, 0.05, 0.07, 0.09, 0.1, 0.2],
        'n_estimators'      : [100, 300, 400, 500, 600, 700, 800, 1000],
        'gamma'             : [0.5, 1, 1.5],
        #'min_child_weight'
        #'max_delta_step'
        'subsample'         : [0.2,0.4,0.66,0.8,1],
        'colsample_bytree'  : [0.25,0.5,0.7,0.9,1],
        #'colsample_bylevel': [0.2, 0.5, 0.7, 0.9, 1]
        'reg_alpha'         : np.arange(0.01, 1, 0.05)
        #'reg_lambda'       : np.arange(0.01, 1, 0.05)
        #'scale_pos_weight'
        #'base_score'
    },
    '''

    grid = {
        'max_depth': range(1, 12),
        'min_samples_split': range(2, 10),
        'min_samples_leaf': range(2, 10)
    }

    # model = XGBRegressor()
    model = DecisionTreeRegressor()
    store = pd.HDFStore('foobar.hdf5', 'r')

    train_x = store['train_x']
    train_y = store['train_y']

    print('train_x shape: {}'.format(train_x.shape))
    print('train_y shape: {}'.format(train_y.shape))

    from distributed import Executor, Client
    from dklearn import DaskGridSearchCV

    print('doing grid search')
    scheduler_address = '173.208.222.74:8877'
    # exc = Executor(scheduler_address, set_as_default=True)
    c = Client(scheduler_address, set_as_default=True)
    gs = DaskGridSearchCV(model, grid).fit(train_x, train_y)
    # c = Client(scheduler_address, set_as_default=True)
    #gs = DaskGridSearchCV(model, grid, get = c.get).fit(train_x, train_y)

    pass
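The function above is defined but never invoked in the snippet; a hypothetical entry point to run it as a script would be:

import sys

if __name__ == '__main__':
    main(len(sys.argv), sys.argv)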
Example #30
def test_visualize():
    pytest.importorskip('graphviz')

    X, y = make_classification(n_samples=100,
                               n_classes=2,
                               flip_y=.2,
                               random_state=0)
    clf = SVC(random_state=0)
    grid = {'C': [.1, .5, .9]}
    gs = DaskGridSearchCV(clf, grid).fit(X, y)

    assert hasattr(gs, 'dask_graph_')

    with tmpdir() as d:
        gs.visualize(filename=os.path.join(d, 'mydask'))
        assert os.path.exists(os.path.join(d, 'mydask.png'))

    # Doesn't work if not fitted
    gs = DaskGridSearchCV(clf, grid)
    with pytest.raises(NotFittedError):
        gs.visualize()