Example #1
import numpy as np
from numpy.testing import assert_array_almost_equal
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold, validation_curve
from sklearn.svm import SVC

# `OneTimeSplitter` is a helper from scikit-learn's test suite, not part of
# the public API; see the sketch after this example.


def test_validation_curve_cv_splits_consistency():
    n_samples = 100
    n_splits = 5
    X, y = make_classification(n_samples=n_samples, random_state=0)

    scores1 = validation_curve(SVC(kernel='linear', random_state=0), X, y,
                               param_name='C',
                               param_range=[0.1, 0.1, 0.2, 0.2],
                               cv=OneTimeSplitter(n_splits=n_splits,
                                                  n_samples=n_samples))
    # The OneTimeSplitter is a non-re-entrant cv splitter: its `split` yields
    # folds only once. Unless `split` were (incorrectly) called once per
    # parameter setting, the following should produce identical results for
    # param settings 1 and 2, as both have the same C value.
    # (hstack joins train and test scores into a (4, 10) array; reordering
    # the rows to (0, 2, 1, 3) and splitting them in half pairs up the two
    # C=0.1 rows and the two C=0.2 rows, which must match.)
    assert_array_almost_equal(*np.vsplit(np.hstack(scores1)[(0, 2, 1, 3), :],
                                         2))

    scores2 = validation_curve(SVC(kernel='linear', random_state=0), X, y,
                               param_name='C',
                               param_range=[0.1, 0.1, 0.2, 0.2],
                               cv=KFold(n_splits=n_splits, shuffle=True))

    # For scores2, compare the 1st and 2nd parameter settings' scores.
    # (Since the C value for the first two param settings is 0.1, they must
    # be consistent unless the train/test folds differ between the param
    # settings.)
    assert_array_almost_equal(*np.vsplit(np.hstack(scores2)[(0, 2, 1, 3), :],
                                         2))

    scores3 = validation_curve(SVC(kernel='linear', random_state=0), X, y,
                               param_name='C',
                               param_range=[0.1, 0.1, 0.2, 0.2],
                               cv=KFold(n_splits=n_splits))

    # OneTimeSplitter is basically an unshuffled KFold(n_splits=5); sanity check.
    assert_array_almost_equal(np.array(scores3), np.array(scores1))
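
For context, `OneTimeSplitter` is a private helper defined alongside these tests, not a public scikit-learn class. A minimal sketch consistent with how the tests use it (the defaults here are illustrative):

import numpy as np
from sklearn.model_selection import KFold


class OneTimeSplitter:
    """A non-re-entrant cv splitter: `split` yields its folds only once."""

    def __init__(self, n_splits=4, n_samples=99):
        self.n_splits = n_splits
        self.n_samples = n_samples
        # A single shared iterator over unshuffled KFold splits.
        self.indices = iter(KFold(n_splits=n_splits).split(np.ones(n_samples)))

    def split(self, X=None, y=None, groups=None):
        # The shared iterator is consumed here, so a second call to
        # `split` yields nothing.
        for index in self.indices:
            yield index

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits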
Example #2
import warnings

import numpy as np
from numpy.testing import assert_array_almost_equal, assert_array_equal
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold, learning_curve

# `MockImprovingEstimator` (sketched after this example) and
# `OneTimeSplitter` (sketched after Example #1) are scikit-learn
# test-suite helpers.


def test_learning_curve():
    n_samples = 30
    n_splits = 3
    X, y = make_classification(n_samples=n_samples, n_features=1,
                               n_informative=1, n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    # The largest training fold has n_samples * (n_splits - 1) / n_splits
    # = 20 samples, which the mock estimator uses as its maximum train size.
    estimator = MockImprovingEstimator(n_samples * (n_splits - 1) // n_splits)
    for shuffle_train in [False, True]:
        with warnings.catch_warnings(record=True) as w:
            train_sizes, train_scores, test_scores = learning_curve(
                estimator, X, y, cv=KFold(n_splits=n_splits),
                train_sizes=np.linspace(0.1, 1.0, 10),
                shuffle=shuffle_train)
        if len(w) > 0:
            raise RuntimeError("Unexpected warning: %r" % w[0].message)
        assert train_scores.shape == (10, 3)
        assert test_scores.shape == (10, 3)
        assert_array_equal(train_sizes, np.linspace(2, 20, 10))
        assert_array_almost_equal(train_scores.mean(axis=1),
                                  np.linspace(1.9, 1.0, 10))
        assert_array_almost_equal(test_scores.mean(axis=1),
                                  np.linspace(0.1, 1.0, 10))

        # Test a custom cv splitter that can iterate only once
        with warnings.catch_warnings(record=True) as w:
            train_sizes2, train_scores2, test_scores2 = learning_curve(
                estimator, X, y,
                cv=OneTimeSplitter(n_splits=n_splits, n_samples=n_samples),
                train_sizes=np.linspace(0.1, 1.0, 10),
                shuffle=shuffle_train)
        if len(w) > 0:
            raise RuntimeError("Unexpected warning: %r" % w[0].message)
        assert_array_almost_equal(train_scores2, train_scores)
        assert_array_almost_equal(test_scores2, test_scores)
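
`MockImprovingEstimator` is likewise a test-suite helper rather than a public class. A minimal sketch consistent with the assertions above (train score decays from 1.9 toward 1.0 and test score grows from 0.1 toward 1.0 as the training size approaches the maximum of 20):

from sklearn.base import BaseEstimator


class MockImprovingEstimator(BaseEstimator):
    """Dummy estimator whose test score improves with more training data."""

    def __init__(self, n_max_train_sizes):
        self.n_max_train_sizes = n_max_train_sizes
        self.train_sizes = 0
        self.X_subset = None

    def fit(self, X_subset, y_subset=None):
        # Remember the training subset so `score` can tell train from test.
        self.X_subset = X_subset
        self.train_sizes = X_subset.shape[0]
        return self

    def score(self, X=None, y=None):
        # Training score decays from ~2 to 1; test score grows from ~0 to 1.
        if self._is_training_data(X):
            return 2. - float(self.train_sizes) / self.n_max_train_sizes
        return float(self.train_sizes) / self.n_max_train_sizes

    def _is_training_data(self, X):
        return X is self.X_subset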
Example #3
import numpy as np
from numpy.testing import assert_array_almost_equal
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.svm import LinearSVC

# `OneTimeSplitter` is the test-suite helper sketched after Example #1.


def test_grid_search_cv_splits_consistency():
    # Check that a one-time iterable is accepted as a cv parameter.
    n_samples = 100
    n_splits = 5
    X, y = make_classification(n_samples=n_samples, random_state=0)

    gs = GridSearchCV(LinearSVC(random_state=0),
                      param_grid={'C': [0.1, 0.2, 0.3]},
                      cv=OneTimeSplitter(n_splits=n_splits,
                                         n_samples=n_samples))
    gs.fit(X, y)

    gs2 = GridSearchCV(LinearSVC(random_state=0),
                       param_grid={'C': [0.1, 0.2, 0.3]},
                       cv=KFold(n_splits=n_splits))
    gs2.fit(X, y)

    def _pop_time_keys(cv_results):
        for key in ('mean_fit_time', 'std_fit_time', 'mean_score_time',
                    'std_score_time'):
            cv_results.pop(key)
        return cv_results

    # OneTimeSplitter is a non-re-entrant cv splitter whose `split` can be
    # called only once. If ``cv.split`` were called once per param setting in
    # GridSearchCV.fit, the 2nd and 3rd parameters would not be evaluated, as
    # no train/test indices would be generated by the 2nd and subsequent
    # ``cv.split`` calls. This check makes sure ``cv.split`` is not called
    # once per param setting.
    np.testing.assert_equal(_pop_time_keys(gs.cv_results_),
                            _pop_time_keys(gs2.cv_results_))

    # Check consistency of folds across the parameters
    gs = GridSearchCV(LinearSVC(random_state=0),
                      param_grid={'C': [0.1, 0.1, 0.2, 0.2]},
                      cv=KFold(n_splits=n_splits, shuffle=True),
                      return_train_score=True)
    gs.fit(X, y)

    # As the first two param settings (C=0.1) and the next two param
    # settings (C=0.2) are identical, their test and train scores must also
    # be identical, as long as the same train/test indices are generated for
    # all the cv splits for each param setting.
    for score_type in ('train', 'test'):
        per_param_scores = {}
        for param_i in range(4):
            per_param_scores[param_i] = [
                gs.cv_results_['split%d_%s_score' % (s, score_type)][param_i]
                for s in range(n_splits)]

        assert_array_almost_equal(per_param_scores[0], per_param_scores[1])
        assert_array_almost_equal(per_param_scores[2], per_param_scores[3])
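
The "one-time iterable" behaviour checked above can also be exercised without the test helper: `cv` accepts any iterable of (train, test) index pairs, and `KFold.split` returns a plain generator, which GridSearchCV materializes once so that every parameter setting is scored on the same folds. A minimal sketch:

from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=100, random_state=0)

# `KFold.split` returns a generator -- a one-time iterable of
# (train_indices, test_indices) pairs.
folds = KFold(n_splits=5).split(X, y)

gs = GridSearchCV(LinearSVC(random_state=0),
                  param_grid={'C': [0.1, 0.2, 0.3]},
                  cv=folds)
gs.fit(X, y)  # all three C values are evaluated on the same 5 folds
print(gs.best_params_)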