def test_prefitted_throws_error():
    # Test that passing a pre-fitted classifier and calling predict throws an
    # error
    knn = KNeighborsClassifier()
    knn.fit(X_train, y_train)
    st = SelfTrainingClassifier(knn)
    with pytest.raises(NotFittedError,
                       match="This SelfTrainingClassifier"
                       " instance is not fitted yet"):
        st.predict(X_train)
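For contrast, a minimal sketch (not part of the test suite) of the call order that does work: the SelfTrainingClassifier itself must be fitted on partially labeled data before predict is available; fitting only the wrapped estimator is not enough.

# Minimal sketch, reusing the X_train / y_train fixtures the tests assume;
# -1 marks unlabeled samples, following scikit-learn's convention.
import numpy as np

y_partial = np.copy(y_train)
y_partial[50:] = -1          # pretend most samples are unlabeled
st = SelfTrainingClassifier(KNeighborsClassifier())
st.fit(X_train, y_partial)   # fit the wrapper itself, not just the base estimator
st.predict(X_train)          # now succeeds; no NotFittedError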
Example no. 2
def test_sanity_classification():
    base_estimator = SVC(gamma="scale", probability=True)
    base_estimator.fit(X_train[n_labeled_samples:], y_train[n_labeled_samples:])

    st = SelfTrainingClassifier(base_estimator)
    st.fit(X_train, y_train_missing_labels)

    pred1, pred2 = base_estimator.predict(X_test), st.predict(X_test)
    assert not np.array_equal(pred1, pred2)
    score_supervised = accuracy_score(base_estimator.predict(X_test), y_test)
    score_self_training = accuracy_score(st.predict(X_test), y_test)

    assert score_self_training > score_supervised
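These tests rely on module-level fixtures that this excerpt omits. A hedged sketch of what they plausibly look like (the exact dataset, split, and string mapping are assumptions; the -1-means-unlabeled convention is scikit-learn's):

# Sketch of the assumed fixtures; dataset choice and split are guesses.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

n_labeled_samples = 50
y_train_missing_labels = y_train.copy()
y_train_missing_labels[n_labeled_samples:] = -1  # -1 == unlabeled

# String-labeled variant used by test_classification below
mapping = {0: "A", 1: "B", 2: "C", -1: "-1"}
y_train_missing_strings = np.vectorize(mapping.get)(
    y_train_missing_labels).astype(object)
y_train_missing_strings[y_train_missing_labels == -1] = -1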
Example no. 3
    def self_training_clf(self, base_classifier, X_train, y_train,
                          threshold=0.75, max_iter=10, verbose=False):
        """
        Train self-training classifier from scikit-learn >= 0.24.1

        Parameters
        ___________
        base_classifier: Supervised classifier implementing both fit and predict_proba
        X_train: Scaled feature matrix of the training set
        y_train: Class label of the training set
        threshold (float):  The decision threshold for use with criterion='threshold'. Should be in [0, 1)
        max_iter (int):  Maximum number of iterations allowed. Should be greater than or equal to 0
        verbose (bool): Enable verbose output

        Returns
        _____________
        Predicted labels and probability
        """
        # Self training model
        model = SelfTrainingClassifier(base_classifier, threshold=threshold,
                                       max_iter=max_iter, verbose=verbose)

        # Fit the training set
        model.fit(X_train, y_train)

        # Predict labels for the full training set (including the
        # originally unlabeled points)
        predicted_labels = model.predict(X_train)

        # Predict class probabilities
        predicted_proba = model.predict_proba(X_train)
        return predicted_labels, predicted_proba
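A hypothetical usage sketch of the helper above; the runner object and surrounding names are illustrative stand-ins, not from the original code, and any classifier exposing fit and predict_proba would do.

# Hypothetical usage; "runner" stands in for an instance of the (unshown)
# class that defines self_training_clf.
from sklearn.linear_model import LogisticRegression

base = LogisticRegression(max_iter=1000)  # implements fit and predict_proba
labels, proba = runner.self_training_clf(
    base, X_train, y_train_missing_labels,
    threshold=0.9, max_iter=20, verbose=True,
)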
@pytest.mark.parametrize("base_estimator",
                         [KNeighborsClassifier(),
                          SVC(gamma="scale", probability=True)])
@pytest.mark.parametrize("selection_crit", ["threshold", "k_best"])
def test_classification(base_estimator, selection_crit):
    # Check classification for various parameter settings.
    # Also assert that predictions for strings and numerical labels are equal.
    # Also test for multioutput classification
    threshold = 0.75
    max_iter = 10
    st = SelfTrainingClassifier(base_estimator,
                                max_iter=max_iter,
                                threshold=threshold,
                                criterion=selection_crit)
    st.fit(X_train, y_train_missing_labels)
    pred = st.predict(X_test)
    proba = st.predict_proba(X_test)

    st_string = SelfTrainingClassifier(base_estimator,
                                       max_iter=max_iter,
                                       criterion=selection_crit,
                                       threshold=threshold)
    st_string.fit(X_train, y_train_missing_strings)
    pred_string = st_string.predict(X_test)
    proba_string = st_string.predict_proba(X_test)

    assert_array_equal(np.vectorize(mapping.get)(pred), pred_string)
    assert_array_equal(proba, proba_string)

    assert st.termination_condition_ == st_string.termination_condition_
    # Check consistency between labeled_iter, n_iter and max_iter
    labeled = y_train_missing_labels != -1
    # assert that labeled samples have labeled_iter = 0
    assert_array_equal(st.labeled_iter_ == 0, labeled)
    # assert that labeled samples do not change label during training
    assert_array_equal(y_train_missing_labels[labeled],
                       st.transduction_[labeled])

    # assert that the iteration in which the last sample was labeled does
    # not exceed the total number of iterations, which is bounded by max_iter
    assert np.max(st.labeled_iter_) <= st.n_iter_ <= max_iter
    assert np.max(st_string.labeled_iter_) <= st_string.n_iter_ <= max_iter

    # check shapes
    assert st.labeled_iter_.shape == st.transduction_.shape
    assert st_string.labeled_iter_.shape == st_string.transduction_.shape
def test_no_unlabeled():
    # Test that training on a fully labeled dataset produces the same results
    # as training the classifier by itself.
    knn = KNeighborsClassifier()
    knn.fit(X_train, y_train)
    st = SelfTrainingClassifier(knn)
    with pytest.warns(UserWarning, match="y contains no unlabeled samples"):
        st.fit(X_train, y_train)
    assert_array_equal(knn.predict(X_test), st.predict(X_test))
    # Assert that all samples were labeled in iteration 0 (since there were no
    # unlabeled samples).
    assert np.all(st.labeled_iter_ == 0)
    assert st.termination_condition_ == "all_labeled"
Example no. 6
@pytest.mark.parametrize("base_estimator",
                         [KNeighborsClassifier(),
                          SVC(gamma="scale", probability=True)])
@pytest.mark.parametrize("y", [y_train_missing_labels, y_train_missing_strings])
def test_zero_iterations(base_estimator, y):
    # Check classification for zero iterations.
    # Fitting a SelfTrainingClassifier with zero iterations should give the
    # same results as fitting a supervised classifier.
    # This also asserts that string arrays work as expected.

    clf1 = SelfTrainingClassifier(base_estimator, max_iter=0)

    clf1.fit(X_train, y)

    clf2 = base_estimator.fit(X_train[:n_labeled_samples], y[:n_labeled_samples])

    assert_array_equal(clf1.predict(X_test), clf2.predict(X_test))
    assert clf1.termination_condition_ == "max_iter"
Example no. 7
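The loop below is excerpted from a larger script that varies the self-training threshold and cross-validates each setting. A hedged reconstruction of the setup it assumes (dataset, array shapes, and the candidate thresholds are illustrative guesses; only the structure is implied by the excerpt):

# Hedged reconstruction of the missing setup; only the variable names and
# nesting are implied by the excerpt below.
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.svm import SVC

X, y_true = load_breast_cancer(return_X_y=True)
y = y_true.copy()
y[50:] = -1                      # mask most labels; -1 == unlabeled
total_samples = y.shape[0]

n_splits = 3
skfolds = StratifiedKFold(n_splits=n_splits)

x_values = np.arange(0.4, 1.05, 0.05)   # candidate thresholds
scores = np.empty((x_values.shape[0], n_splits))
amount_labeled = np.empty((x_values.shape[0], n_splits))
amount_iterations = np.empty((x_values.shape[0], n_splits))

for i, threshold in enumerate(x_values):
    self_training_clf = SelfTrainingClassifier(
        SVC(probability=True, gamma=0.001), threshold=threshold)
    # the cross-validation loop below runs inside this outer loop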
    for fold, (train_index, test_index) in enumerate(skfolds.split(X, y)):
        X_train = X[train_index]
        y_train = y[train_index]
        X_test = X[test_index]
        y_test = y[test_index]
        y_test_true = y_true[test_index]

        self_training_clf.fit(X_train, y_train)

        # The number of samples that are labeled at the end of fitting
        amount_labeled[i, fold] = (total_samples - np.unique(
            self_training_clf.labeled_iter_, return_counts=True)[1][0])
        # The last iteration the classifier labeled a sample in
        amount_iterations[i, fold] = np.max(self_training_clf.labeled_iter_)

        y_pred = self_training_clf.predict(X_test)
        scores[i, fold] = accuracy_score(y_test_true, y_pred)

ax1 = plt.subplot(211)
ax1.errorbar(x_values,
             scores.mean(axis=1),
             yerr=scores.std(axis=1),
             capsize=2,
             color="b")
ax1.set_ylabel("Accuracy", color="b")
ax1.tick_params("y", colors="b")

ax2 = ax1.twinx()
ax2.errorbar(
    x_values,
    amount_labeled.mean(axis=1),