Example #1
import numpy as np

from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC
from sklearn.model_selection import (GridSearchCV, LabelKFold,
                                     LabelShuffleSplit, LeaveOneLabelOut,
                                     LeavePLabelOut, StratifiedKFold,
                                     StratifiedShuffleSplit)
from sklearn.utils.testing import assert_raise_message

# NOTE: these snippets target the scikit-learn 0.18 development API;
# the Label* splitters (and the labels parameter) were renamed to
# Group* / groups in the 0.18 release.

def test_grid_search_labels():
    # Check that the ValueError raised when labels is None propagates to
    # GridSearchCV, and that labels is correctly passed on to the cv object
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    labels = rng.randint(0, 3, 15)

    clf = LinearSVC(random_state=0)
    grid = {'C': [1]}

    label_cvs = [
        LeaveOneLabelOut(),
        LeavePLabelOut(2),
        LabelKFold(),
        LabelShuffleSplit()
    ]
    for cv in label_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        assert_raise_message(ValueError,
                             "The labels parameter should not be None", gs.fit,
                             X, y)
        gs.fit(X, y, labels)

    non_label_cvs = [StratifiedKFold(), StratifiedShuffleSplit()]
    for cv in non_label_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        # Should not raise an error
        gs.fit(X, y)
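
For context, here is a minimal standalone sketch (not part of the test above) of what a label-aware splitter does with labels, using LeaveOneLabelOut from the same 0.18-dev API: each iteration holds out every sample belonging to exactly one label, which is why split() cannot proceed when labels is None.

import numpy as np
from sklearn.model_selection import LeaveOneLabelOut

X = np.arange(12).reshape(6, 2)
y = np.array([0, 1, 0, 1, 0, 1])
labels = np.array([1, 1, 2, 2, 3, 3])

lol = LeaveOneLabelOut()
for train, test in lol.split(X, y, labels):
    # All samples sharing a label land on the same side of the split
    print("train:", train, "test:", test)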

Example #2

from sklearn.datasets import make_classification
from sklearn.svm import SVC
from sklearn.model_selection import (cross_val_predict, cross_val_score,
                                     LabelKFold, LabelShuffleSplit,
                                     LeaveOneLabelOut, LeavePLabelOut)
from sklearn.utils.testing import assert_raise_message


def test_cross_val_score_predict_labels():
    # Check that the ValueError raised when labels is None propagates to
    # cross_val_score and cross_val_predict, and that labels is correctly
    # passed on to the cv object
    X, y = make_classification(n_samples=20, n_classes=2, random_state=0)

    clf = SVC(kernel="linear")

    label_cvs = [
        LeaveOneLabelOut(),
        LeavePLabelOut(2),
        LabelKFold(),
        LabelShuffleSplit()
    ]
    for cv in label_cvs:
        assert_raise_message(ValueError,
                             "The labels parameter should not be None",
                             cross_val_score,
                             estimator=clf,
                             X=X,
                             y=y,
                             cv=cv)
        assert_raise_message(ValueError,
                             "The labels parameter should not be None",
                             cross_val_predict,
                             estimator=clf,
                             X=X,
                             y=y,
                             cv=cv)
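
This test only exercises the error path; the positive path, the analogue of gs.fit(X, y, labels) in Example #1, would look roughly like the sketch below, assuming the same 0.18-dev signature in which cross_val_score accepts a labels argument and forwards it to cv.split:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, LeaveOneLabelOut

X, y = make_classification(n_samples=20, n_classes=2, random_state=0)
labels = np.random.RandomState(0).randint(0, 3, 20)

# With labels supplied, no ValueError is raised and one score is
# returned per held-out label
scores = cross_val_score(SVC(kernel="linear"), X, y, labels=labels,
                         cv=LeaveOneLabelOut())
print(scores)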
Example #3
import numpy as np

from sklearn.model_selection import LabelShuffleSplit
from sklearn.utils.testing import (assert_array_equal, assert_equal,
                                   assert_false, assert_true)

def test_label_shuffle_split():
    labels = [
        np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
        np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
        np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]),
        np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4])
    ]

    for l in labels:
        X = y = np.ones(len(l))
        n_iter = 6
        test_size = 1. / 3
        slo = LabelShuffleSplit(n_iter, test_size=test_size, random_state=0)

        # Make sure the repr works
        repr(slo)

        # Test that the length is correct
        assert_equal(slo.get_n_splits(X, y, labels=l), n_iter)

        l_unique = np.unique(l)

        for train, test in slo.split(X, y, labels=l):
            # First test: no train label is in the test set and vice versa
            l_train_unique = np.unique(l[train])
            l_test_unique = np.unique(l[test])
            assert_false(np.any(np.in1d(l[train], l_test_unique)))
            assert_false(np.any(np.in1d(l[test], l_train_unique)))

            # Second test: train and test add up to all the data
            assert_equal(l[train].size + l[test].size, l.size)

            # Third test: train and test are disjoint
            assert_array_equal(np.intersect1d(train, test), [])

            # Fourth test: the numbers of unique train and test labels
            # are correct, to within +/- 1 for rounding error
            assert_true(
                abs(len(l_test_unique) -
                    round(test_size * len(l_unique))) <= 1)
            assert_true(
                abs(
                    len(l_train_unique) -
                    round((1.0 - test_size) * len(l_unique))) <= 1)
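
To make the fourth assertion concrete: test_size applies to the number of unique labels, not the number of samples, so with test_size=1/3 and four unique labels each test set holds about a third of the labels, give or take one for rounding, which is exactly the slack the assertion allows. A sketch, again assuming the 0.18-dev LabelShuffleSplit (renamed GroupShuffleSplit in the 0.18 release):

import numpy as np
from sklearn.model_selection import LabelShuffleSplit

labels = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])
X = y = np.ones(len(labels))

slo = LabelShuffleSplit(n_iter=3, test_size=1. / 3, random_state=0)
for train, test in slo.split(X, y, labels=labels):
    # Roughly test_size * 4 unique labels per test set, +/- 1 for rounding
    print("test labels:", np.unique(labels[test]))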