def test_prefitted_throws_error():
    """Predicting with an unfitted SelfTrainingClassifier must raise.

    The wrapped KNeighborsClassifier is fitted beforehand, but ``fit`` is
    never called on the SelfTrainingClassifier itself, so ``predict`` is
    expected to raise ``NotFittedError`` with the standard message.
    """
    prefit_neighbors = KNeighborsClassifier()
    prefit_neighbors.fit(X_train, y_train)

    wrapper = SelfTrainingClassifier(prefit_neighbors)

    expected_message = "This SelfTrainingClassifier instance is not fitted yet"
    with pytest.raises(NotFittedError, match=expected_message):
        wrapper.predict(X_train)
def test_sanity_classification():
    """Sanity check: self-training must beat the supervised baseline.

    A baseline SVC is fitted on a slice of the training data, and a
    SelfTrainingClassifier wrapping the same estimator is fitted on the full
    training set with missing labels. The two models must disagree on at
    least one test prediction, and the self-training model must reach a
    strictly higher accuracy on the test set.
    """
    base_estimator = SVC(gamma="scale", probability=True)
    # NOTE(review): the baseline is fitted on the slice [n_labeled_samples:],
    # whereas test_zero_iterations uses [:n_labeled_samples] as the
    # supervised equivalent -- confirm this slice is intended.
    base_estimator.fit(X_train[n_labeled_samples:], y_train[n_labeled_samples:])

    st = SelfTrainingClassifier(base_estimator)
    st.fit(X_train, y_train_missing_labels)

    # Predict once per model and reuse the results for both checks below.
    pred_supervised = base_estimator.predict(X_test)
    pred_self_training = st.predict(X_test)
    assert not np.array_equal(pred_supervised, pred_self_training)

    # accuracy_score is documented as (y_true, y_pred); keep that order.
    score_supervised = accuracy_score(y_test, pred_supervised)
    score_self_training = accuracy_score(y_test, pred_self_training)

    assert score_self_training > score_supervised
def self_training_clf(self, base_classifier, X_train, y_train, threshold=None,
                      max_iter=None, verbose=None):
    """
    Train a self-training classifier (scikit-learn >= 0.24.1).

    Parameters
    ----------
    base_classifier : estimator
        Supervised classifier implementing both ``fit`` and
        ``predict_proba``.
    X_train : array-like
        Scaled feature matrix of the training set.
    y_train : array-like
        Class labels of the training set.
    threshold : float, optional
        Decision threshold for use with ``criterion='threshold'``; should be
        in [0, 1). When ``None`` the argument is not forwarded, so the
        estimator's own default applies.
    max_iter : int, optional
        Maximum number of iterations allowed; should be greater than or
        equal to 0. Forwarded unchanged, including ``None``.
    verbose : bool, optional
        Enable verbose output. ``None`` is treated as ``False``.

    Returns
    -------
    tuple
        ``(predicted_labels, predicted_proba)`` for every row of
        ``X_train``.
    """
    estimator_kwargs = {
        "max_iter": max_iter,
        # None is not a valid verbosity flag; fall back to "quiet".
        "verbose": False if verbose is None else verbose,
    }
    # Forwarding threshold=None would replace the estimator's numeric default
    # with a value it cannot compare against, so only pass it when provided.
    if threshold is not None:
        estimator_kwargs["threshold"] = threshold

    # Self training model
    model = SelfTrainingClassifier(base_classifier, **estimator_kwargs)

    # Fit the training set
    model.fit(X_train, y_train)

    # Predict labels and class probabilities for the whole training set
    predicted_labels = model.predict(X_train)
    predicted_proba = model.predict_proba(X_train)

    return predicted_labels, predicted_proba
def test_classification(base_estimator, selection_crit):
    """Check classification for the given estimator and selection criterion.

    Two SelfTrainingClassifier instances are fitted with identical settings,
    one on numeric labels and one on string labels. Their predictions
    (after mapping numeric labels through ``mapping``), probabilities and
    termination conditions must agree, and the fitted bookkeeping attributes
    (``labeled_iter_``, ``n_iter_``, ``transduction_``) must be consistent
    with each other and with ``max_iter``.
    """
    threshold = 0.75
    max_iter = 10
    shared_params = {
        "max_iter": max_iter,
        "threshold": threshold,
        "criterion": selection_crit,
    }

    # Model trained on numeric labels.
    numeric_model = SelfTrainingClassifier(base_estimator, **shared_params)
    numeric_model.fit(X_train, y_train_missing_labels)
    numeric_pred = numeric_model.predict(X_test)
    numeric_proba = numeric_model.predict_proba(X_test)

    # Model trained on string labels with the same settings.
    string_model = SelfTrainingClassifier(base_estimator, **shared_params)
    string_model.fit(X_train, y_train_missing_strings)
    string_pred = string_model.predict(X_test)
    string_proba = string_model.predict_proba(X_test)

    # Both label encodings must lead to the same outcome.
    to_string_label = np.vectorize(mapping.get)
    assert_array_equal(to_string_label(numeric_pred), string_pred)
    assert_array_equal(numeric_proba, string_proba)
    assert (
        numeric_model.termination_condition_
        == string_model.termination_condition_
    )

    # Samples that came with a label are exactly those marked as labeled in
    # iteration 0, and their labels are left unchanged by training.
    had_label = y_train_missing_labels != -1
    assert_array_equal(numeric_model.labeled_iter_ == 0, had_label)
    assert_array_equal(
        y_train_missing_labels[had_label],
        numeric_model.transduction_[had_label],
    )

    fitted_models = (numeric_model, string_model)

    # The last labeling iteration cannot exceed the number of iterations
    # run, which in turn cannot exceed max_iter.
    for fitted in fitted_models:
        assert np.max(fitted.labeled_iter_) <= fitted.n_iter_ <= max_iter

    # Per-sample bookkeeping arrays must have matching shapes.
    for fitted in fitted_models:
        assert fitted.labeled_iter_.shape == fitted.transduction_.shape
def test_no_unlabeled():
    """Fitting on fully labeled data must match the plain classifier.

    The SelfTrainingClassifier is expected to warn that ``y`` contains no
    unlabeled samples, predict exactly like the wrapped KNeighborsClassifier,
    mark every sample as labeled in iteration 0, and report the
    ``"all_labeled"`` termination condition.
    """
    plain_neighbors = KNeighborsClassifier()
    plain_neighbors.fit(X_train, y_train)

    self_trainer = SelfTrainingClassifier(plain_neighbors)
    with pytest.warns(UserWarning, match="y contains no unlabeled samples"):
        self_trainer.fit(X_train, y_train)

    plain_pred = plain_neighbors.predict(X_test)
    self_trained_pred = self_trainer.predict(X_test)
    assert_array_equal(plain_pred, self_trained_pred)

    # With no unlabeled samples, everything counts as labeled in iteration 0.
    labeled_in_first_round = self_trainer.labeled_iter_ == 0
    assert np.all(labeled_in_first_round)
    assert self_trainer.termination_condition_ == "all_labeled"
def test_zero_iterations(base_estimator, y):
    """Zero self-training iterations must equal plain supervised fitting.

    A SelfTrainingClassifier with ``max_iter=0`` is compared against the
    base estimator fitted only on the first ``n_labeled_samples`` samples.
    Both must predict the same labels on the test set, and the self-training
    model must report ``"max_iter"`` as its termination condition. The ``y``
    argument lets the same check run against different label arrays.
    """
    self_trained = SelfTrainingClassifier(base_estimator, max_iter=0)
    self_trained.fit(X_train, y)

    # Supervised reference: fit on the first n_labeled_samples rows only.
    labeled_features = X_train[:n_labeled_samples]
    labeled_targets = y[:n_labeled_samples]
    supervised = base_estimator.fit(labeled_features, labeled_targets)

    assert_array_equal(self_trained.predict(X_test), supervised.predict(X_test))
    assert self_trained.termination_condition_ == "max_iter"
for fold, (train_index, test_index) in enumerate(skfolds.split(X, y)): X_train = X[train_index] y_train = y[train_index] X_test = X[test_index] y_test = y[test_index] y_test_true = y_true[test_index] self_training_clf.fit(X_train, y_train) # The amount of labeled samples that at the end of fitting amount_labeled[i, fold] = (total_samples - np.unique( self_training_clf.labeled_iter_, return_counts=True)[1][0]) # The last iteration the classifier labeled a sample in amount_iterations[i, fold] = np.max(self_training_clf.labeled_iter_) y_pred = self_training_clf.predict(X_test) scores[i, fold] = accuracy_score(y_test_true, y_pred) ax1 = plt.subplot(211) ax1.errorbar(x_values, scores.mean(axis=1), yerr=scores.std(axis=1), capsize=2, color="b") ax1.set_ylabel("Accuracy", color="b") ax1.tick_params("y", colors="b") ax2 = ax1.twinx() ax2.errorbar( x_values, amount_labeled.mean(axis=1),