Пример #1
0
def test_early_stopping():
    """Fitting ends after a single iteration when no new label can be added."""
    features = [[1], [0], [1], [0.5]]
    targets = [1, 0, -1, -1]
    self_trainer = SelfTrainingClassifier(SVC(gamma='scale', probability=True))

    # The sample at 0.5 cannot be predicted with high confidence, so the
    # labeling loop is expected to give up right away.
    self_trainer.fit(features, targets)

    assert self_trainer.termination_condition_ == 'no_change'
    assert self_trainer.n_iter_ == 1
Пример #2
0
def test_verbose(capsys, verbose):
    """The word 'iteration' is printed to stdout only when ``verbose`` is set."""
    estimator = SelfTrainingClassifier(KNeighborsClassifier(), verbose=verbose)
    estimator.fit(X_train, y_train_missing_labels)

    stdout_text = capsys.readouterr().out
    mentions_iteration = 'iteration' in stdout_text

    # Output must mention iterations exactly when verbosity was requested
    assert mentions_iteration is bool(verbose)
Пример #3
0
def test_none_iter():
    """With ``max_iter=None`` fitting runs until every sample is labeled."""
    # No iteration cap is given, yet all samples should be labeled after a
    # 'reasonable' number of iterations.
    self_trainer = SelfTrainingClassifier(
        KNeighborsClassifier(), threshold=.55, max_iter=None
    )
    self_trainer.fit(X_train, y_train_missing_labels)

    assert self_trainer.termination_condition_ == "all_labeled"
    assert self_trainer.n_iter_ < 10
def test_labeled_iter(max_iter):
    """Check ``labeled_iter_`` against the labeled input and the iteration cap."""
    self_trainer = SelfTrainingClassifier(KNeighborsClassifier(),
                                          max_iter=max_iter)
    self_trainer.fit(X_train, y_train_missing_labels)
    iteration_of_label = self_trainer.labeled_iter_

    # Entries equal to 0 must be exactly the samples that were passed in
    # already labeled.
    initially_labeled = int(np.sum(iteration_of_label == 0))
    assert initially_labeled == n_labeled_samples

    # The latest labeling iteration cannot exceed the number of iterations
    # run, which in turn cannot exceed the configured maximum.
    assert np.max(iteration_of_label) <= self_trainer.n_iter_
    assert self_trainer.n_iter_ <= max_iter
def test_invalid_params(max_iter, threshold):
    """Invalid ``max_iter`` and ``threshold`` values make ``fit`` raise."""
    # Invalid iteration count
    bad_iter_clf = SelfTrainingClassifier(
        SVC(gamma="scale", probability=True), max_iter=max_iter
    )
    with pytest.raises(ValueError, match="max_iter must be >= 0 or None"):
        bad_iter_clf.fit(X_train, y_train)

    # Invalid threshold
    bad_threshold_clf = SelfTrainingClassifier(
        SVC(gamma="scale", probability=True), threshold=threshold
    )
    with pytest.raises(ValueError, match="threshold must be in"):
        bad_threshold_clf.fit(X_train, y_train)
def test_no_unlabeled():
    """On fully labeled data self-training predicts like the plain classifier."""
    reference = KNeighborsClassifier()
    reference.fit(X_train, y_train)

    wrapped = SelfTrainingClassifier(reference)
    with pytest.warns(UserWarning, match="y contains no unlabeled samples"):
        wrapped.fit(X_train, y_train)

    expected = reference.predict(X_test)
    actual = wrapped.predict(X_test)
    assert_array_equal(expected, actual)

    # There were no unlabeled samples, so every sample must be recorded as
    # labeled in iteration 0.
    assert not np.any(wrapped.labeled_iter_ != 0)
    assert wrapped.termination_condition_ == "all_labeled"
Пример #7
0
def test_sanity_classification():
    """Self-training must differ from, and outscore, the supervised baseline."""
    # Baseline: the plain SVC fitted on the samples from index
    # ``n_labeled_samples`` onward.
    supervised = SVC(gamma="scale", probability=True)
    supervised.fit(X_train[n_labeled_samples:], y_train[n_labeled_samples:])

    self_trained = SelfTrainingClassifier(supervised)
    self_trained.fit(X_train, y_train_missing_labels)

    supervised_pred = supervised.predict(X_test)
    self_trained_pred = self_trained.predict(X_test)
    assert not np.array_equal(supervised_pred, self_trained_pred)

    supervised_accuracy = accuracy_score(supervised_pred, y_test)
    self_trained_accuracy = accuracy_score(self_trained_pred, y_test)
    assert self_trained_accuracy > supervised_accuracy
Пример #8
0
def test_zero_iterations(base_estimator, y):
    """With ``max_iter=0`` self-training equals plain supervised fitting.

    The self-training model is compared with ``base_estimator`` fitted on the
    first ``n_labeled_samples`` samples only. Per the original test notes this
    also covers string label arrays.
    """
    self_trained = SelfTrainingClassifier(base_estimator, max_iter=0)
    self_trained.fit(X_train, y)

    labeled_X = X_train[:n_labeled_samples]
    labeled_y = y[:n_labeled_samples]
    supervised = base_estimator.fit(labeled_X, labeled_y)

    assert self_trained.termination_condition_ == "max_iter"
    assert_array_equal(self_trained.predict(X_test),
                       supervised.predict(X_test))
def test_k_best_selects_best():
    """One ``k_best`` round must label the 10 most confidently predicted rows."""
    n_picked = 10
    reference_svc = SVC(gamma="scale", probability=True, random_state=0)
    self_trainer = SelfTrainingClassifier(
        reference_svc, criterion="k_best", max_iter=1, k_best=n_picked
    )
    initially_labeled = y_train_missing_labels != -1
    initially_unlabeled = ~initially_labeled
    self_trainer.fit(X_train, y_train_missing_labels)

    # Rows that started without a label and have one after fitting
    newly_labeled = initially_unlabeled & (self_trainer.transduction_ != -1)
    rows_added = X_train[newly_labeled].tolist()

    # Redo the selection by hand: fit on the labeled rows and rank the
    # unlabeled rows by their highest class probability.
    reference_svc.fit(X_train[initially_labeled],
                      y_train_missing_labels[initially_labeled])
    probabilities = reference_svc.predict_proba(X_train[initially_unlabeled])
    confidence = probabilities.max(axis=1)
    top_rows = X_train[initially_unlabeled][np.argsort(confidence)[-n_picked:]]

    assert all(row in rows_added for row in top_rows.tolist())
Пример #10
0
    def plot_varying_threshold(self, base_classifier, X_train, y_train):
        """
        Plot the effect of varying the threshold for self-training.

        For every threshold a ``SelfTrainingClassifier`` wrapping
        ``base_classifier`` is fitted on the training data. Two curves are
        drawn side by side: the number of labeled samples available at the
        end of fitting, and the last iteration in which a sample was labeled.

        Parameters
        ----------
        base_classifier : Supervised classifier implementing both fit and
            predict_proba
        X_train : Scaled feature matrix of the training set
        y_train : Class label of the training set

        Returns
        -------
        None
            The figure is displayed with ``plt.show()``; nothing is returned.
        """
        x_values = np.arange(0.4, 1.05, 0.05)
        x_values = np.append(x_values, 0.99999)
        no_labeled = np.zeros(x_values.shape[0])
        no_iterations = np.zeros(x_values.shape[0])

        for i, threshold in enumerate(x_values):

            # Fit model with chosen base classifier
            self_training_clf = SelfTrainingClassifier(base_classifier,
                                                       threshold=threshold)
            self_training_clf.fit(X_train, y_train)
            labeled_iter = self_training_clf.labeled_iter_

            # The number of labeled samples that the classifier has available
            # by the end of fit. Samples that never received a label are
            # marked with -1 in labeled_iter_, so count everything else
            # directly; this stays correct when no -1 entry remains.
            no_labeled[i] = np.count_nonzero(labeled_iter != -1)

            # The last iteration the classifier labeled a sample in
            no_iterations[i] = np.max(labeled_iter)

        # Plot figures
        plt.rcParams.update({'font.size': 15})
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 4))

        ax1.plot(x_values, no_labeled, color='b')
        ax1.set_xlabel('Threshold')
        ax1.set_ylabel('Number of labeled samples')
        ax2.plot(x_values, no_iterations, color='b')
        ax2.set_ylabel('Number of iterations')
        ax2.set_xlabel('Threshold')
        plt.show()
def test_k_best():
    """Check the iteration bookkeeping of the ``k_best`` criterion.

    Only the first training sample keeps its label. Every iteration is then
    expected to label ``k_best`` more samples, with the last iteration
    labeling whatever is left.
    """
    k_best = 10
    st = SelfTrainingClassifier(KNeighborsClassifier(n_neighbors=1),
                                criterion='k_best',
                                k_best=k_best,
                                max_iter=None)
    y_train_only_one_label = np.copy(y_train)
    y_train_only_one_label[1:] = -1
    n_unlabeled = y_train.shape[0] - 1

    n_expected_iter = ceil(n_unlabeled / k_best)
    # The final iteration labels the remainder. When the unlabeled count is
    # an exact multiple of k_best the remainder is 0, but the final iteration
    # then labels a full batch rather than nothing.
    n_expected_last = n_unlabeled % k_best or k_best

    st.fit(X_train, y_train_only_one_label)
    assert st.n_iter_ == n_expected_iter

    # Check labeled_iter_
    assert np.sum(st.labeled_iter_ == 0) == 1
    for i in range(1, n_expected_iter):
        assert np.sum(st.labeled_iter_ == i) == k_best
    assert np.sum(st.labeled_iter_ == n_expected_iter) == n_expected_last
    assert st.termination_condition_ == 'all_labeled'
def test_classification(base_estimator, selection_crit):
    """Check fitting with numeric and string labels for a given setup.

    The same configuration is fitted once on numeric labels and once on
    string labels; predictions (after mapping) and probabilities must agree.
    The fitted attributes are then checked for internal consistency.
    """
    max_iter = 10
    shared_params = dict(max_iter=max_iter,
                         threshold=0.75,
                         criterion=selection_crit)

    numeric_model = SelfTrainingClassifier(base_estimator, **shared_params)
    numeric_model.fit(X_train, y_train_missing_labels)
    numeric_pred = numeric_model.predict(X_test)
    numeric_proba = numeric_model.predict_proba(X_test)

    string_model = SelfTrainingClassifier(base_estimator, **shared_params)
    string_model.fit(X_train, y_train_missing_strings)
    string_pred = string_model.predict(X_test)
    string_proba = string_model.predict_proba(X_test)

    # Numeric predictions translated through `mapping` must equal the
    # string predictions; probabilities must match as they are.
    assert_array_equal(np.vectorize(mapping.get)(numeric_pred), string_pred)
    assert_array_equal(numeric_proba, string_proba)
    assert (numeric_model.termination_condition_
            == string_model.termination_condition_)

    # Samples that came with a label are exactly those with labeled_iter_ 0,
    # and their labels must be unchanged after training.
    was_labeled = y_train_missing_labels != -1
    assert_array_equal(numeric_model.labeled_iter_ == 0, was_labeled)
    assert_array_equal(y_train_missing_labels[was_labeled],
                       numeric_model.transduction_[was_labeled])

    for model in (numeric_model, string_model):
        # The latest labeling iteration is bounded by the iterations run,
        # which are bounded by max_iter.
        assert np.max(model.labeled_iter_) <= model.n_iter_
        assert model.n_iter_ <= max_iter

    for model in (numeric_model, string_model):
        # check shapes
        assert model.labeled_iter_.shape == model.transduction_.shape
def test_verbose_k_best(capsys):
    """Check the per-iteration messages printed by a verbose ``k_best`` fit.

    Only the first training sample keeps its label. Every iteration is
    expected to report ``k_best`` new labels, except the last one, which
    reports whatever is left.
    """
    k_best = 10
    st = SelfTrainingClassifier(KNeighborsClassifier(n_neighbors=1),
                                criterion='k_best',
                                k_best=k_best,
                                verbose=True,
                                max_iter=None)

    y_train_only_one_label = np.copy(y_train)
    y_train_only_one_label[1:] = -1
    n_unlabeled = y_train.shape[0] - 1

    n_expected_iter = ceil(n_unlabeled / k_best)
    # The final iteration labels the remainder. When the unlabeled count is
    # an exact multiple of k_best the remainder is 0, but the final iteration
    # then adds a full batch rather than nothing.
    n_expected_last = n_unlabeled % k_best or k_best

    st.fit(X_train, y_train_only_one_label)

    captured = capsys.readouterr()

    msg = 'End of iteration {}, added {} new labels.'
    for i in range(1, n_expected_iter):
        assert msg.format(i, k_best) in captured.out

    assert msg.format(n_expected_iter, n_expected_last) in captured.out
def test_base_estimator_meta_estimator():
    """A meta-estimator lacking ``predict_proba`` before fit is still accepted.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19119
    """
    sub_estimators = [
        ("svc_1", SVC(probability=True)),
        ("svc_2", SVC(probability=True)),
    ]
    stacked = StackingClassifier(estimators=sub_estimators,
                                 final_estimator=SVC(probability=True),
                                 cv=2)

    # Precondition: the unfitted stacking estimator does not expose
    # `predict_proba` yet.
    assert not hasattr(stacked, "predict_proba")

    self_trainer = SelfTrainingClassifier(base_estimator=stacked)
    self_trainer.fit(X_train, y_train_missing_labels)
    # Must run without raising once the wrapper has been fitted
    self_trainer.predict_proba(X_test)
def test_invalid_params_selection_crit():
    """An unknown ``criterion`` value makes ``fit`` raise a ValueError."""
    bad_criterion_clf = SelfTrainingClassifier(
        KNeighborsClassifier(), criterion="foo"
    )
    expected_error = pytest.raises(ValueError, match="criterion must be either")

    with expected_error:
        bad_criterion_clf.fit(X_train, y_train)
Пример #16
0
def test_warns_k_best():
    """An oversized ``k_best`` warns, and fitting still labels every sample."""
    oversized_clf = SelfTrainingClassifier(
        KNeighborsClassifier(),
        criterion="k_best",
        k_best=1000,
    )
    expected_warning = pytest.warns(UserWarning, match="k_best is larger than")

    with expected_warning:
        oversized_clf.fit(X_train, y_train_missing_labels)

    assert oversized_clf.termination_condition_ == "all_labeled"
def test_none_classifier():
    """Passing ``None`` as the base estimator makes ``fit`` raise."""
    missing_estimator_clf = SelfTrainingClassifier(None)
    expected_error = pytest.raises(ValueError,
                                   match="base_estimator cannot be None")

    with expected_error:
        missing_estimator_clf.fit(X_train, y_train_missing_labels)
# will behave as a
# semi-supervised classifier, allowing it to learn from unlabeled data.
# Read more in the :ref:`User guide <self_training>`.

import numpy as np
from sklearn import datasets
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.svm import SVC

# Fixed seed so the set of hidden labels is the same on every run
rng = np.random.RandomState(42)
iris = datasets.load_iris()
# Boolean mask over the samples; each one is selected with probability 0.3
random_unlabeled_points = rng.rand(iris.target.shape[0]) < 0.3
# Overwrite the selected targets in place with -1 (the "unlabeled" marker)
iris.target[random_unlabeled_points] = -1
# probability=True so the SVC can provide the class probabilities that
# self-training relies on
svc = SVC(probability=True, gamma="auto")
self_training_model = SelfTrainingClassifier(svc)
# Fit on the full feature matrix together with the partially hidden targets
self_training_model.fit(iris.data, iris.target)

##############################################################################
# New SequentialFeatureSelector transformer
# -----------------------------------------
# A new iterative transformer to select features is available:
# :class:`~sklearn.feature_selection.SequentialFeatureSelector`.
# Sequential Feature Selection can add features one at a time (forward
# selection) or remove features from the list of the available features
# (backward selection), based on a cross-validated score maximization.
# See the :ref:`User Guide <sequential_feature_selection>`.

from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
Пример #19
0
# The splitter does not depend on the threshold, so build it once.
# We need manual cross validation so that we don't treat -1 as a separate
# class when computing accuracy
skfolds = StratifiedKFold(n_splits=n_splits)

for i, threshold in enumerate(x_values):
    self_training_clf = SelfTrainingClassifier(base_classifier,
                                               threshold=threshold)

    for fold, (train_index, test_index) in enumerate(skfolds.split(X, y)):
        X_train = X[train_index]
        y_train = y[train_index]
        X_test = X[test_index]
        y_test = y[test_index]
        # Ground-truth labels of the test fold, used for scoring
        y_test_true = y_true[test_index]

        self_training_clf.fit(X_train, y_train)
        labeled_iter = self_training_clf.labeled_iter_

        # The amount of labeled samples at the end of fitting. Samples that
        # never received a label are marked with -1 in labeled_iter_, so
        # count everything else directly. This stays correct when no -1
        # entry remains and does not depend on a sample total computed
        # outside this fold.
        amount_labeled[i, fold] = np.count_nonzero(labeled_iter != -1)
        # The last iteration the classifier labeled a sample in
        amount_iterations[i, fold] = np.max(labeled_iter)

        y_pred = self_training_clf.predict(X_test)
        scores[i, fold] = accuracy_score(y_test_true, y_pred)

ax1 = plt.subplot(211)
ax1.errorbar(x_values,
             scores.mean(axis=1),
             yerr=scores.std(axis=1),
             capsize=2,