def test_missing_predict_proba():
    """Fitting must fail when the base estimator has no ``predict_proba``.

    An ``SVC`` built with ``probability=False`` is wrapped, and ``fit`` is
    expected to raise an ``AttributeError`` whose message matches
    "predict_proba is not available".
    """
    svc_without_proba = SVC(probability=False, gamma="scale")
    wrapper = SelfTrainingClassifier(svc_without_proba)

    expected_message = "predict_proba is not available"
    with pytest.raises(AttributeError, match=expected_message):
        wrapper.fit(X_train, y_train_missing_labels)
def test_missing_predict_proba():
    """Fitting must fail when the base estimator has no ``predict_proba``.

    An ``SVC`` built with ``probability=False`` is wrapped, and ``fit`` is
    expected to raise a ``ValueError`` whose message names the offending
    base estimator.
    """
    svc_without_proba = SVC(probability=False, gamma='scale')
    wrapper = SelfTrainingClassifier(svc_without_proba)

    expected_pattern = r"base_estimator \(SVC\) should"
    with pytest.raises(ValueError, match=expected_pattern):
        wrapper.fit(X_train, y_train_missing_labels)
def test_base_estimator_meta_estimator():
    """Check self-training on top of a stacking meta-estimator.

    Two stacking classifiers are built: one whose SVCs all use
    ``probability=True`` and one whose SVCs all use ``probability=False``.
    The first must expose ``predict_proba`` and be usable as a base
    estimator; the second must not expose it and must make ``fit`` raise an
    ``AttributeError``.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19119
    """

    def build_stacking(probability):
        # Same topology in both scenarios; only the `probability` flag of
        # the underlying SVCs differs.
        return StackingClassifier(
            estimators=[
                ("svc_1", SVC(probability=probability)),
                ("svc_2", SVC(probability=probability)),
            ],
            final_estimator=SVC(probability=probability),
            cv=2,
        )

    # Scenario 1: probabilities are available, so fit and predict_proba
    # are both expected to succeed.
    stacking_with_proba = build_stacking(True)
    assert hasattr(stacking_with_proba, "predict_proba")
    clf = SelfTrainingClassifier(base_estimator=stacking_with_proba)
    clf.fit(X_train, y_train_missing_labels)
    clf.predict_proba(X_test)

    # Scenario 2: probabilities are unavailable, so fitting has to fail.
    stacking_without_proba = build_stacking(False)
    assert not hasattr(stacking_without_proba, "predict_proba")
    clf = SelfTrainingClassifier(base_estimator=stacking_without_proba)
    with pytest.raises(AttributeError):
        clf.fit(X_train, y_train_missing_labels)
def test_warns_k_best():
    """An oversized ``k_best`` must emit a ``UserWarning`` during ``fit``.

    With ``k_best=1000`` the fit is expected to warn with a message matching
    "k_best is larger than" and to finish with the "all_labeled"
    termination condition.
    """
    oversized_k_best = 1000
    st = SelfTrainingClassifier(
        KNeighborsClassifier(),
        criterion="k_best",
        k_best=oversized_k_best,
    )

    with pytest.warns(UserWarning, match="k_best is larger than"):
        st.fit(X_train, y_train_missing_labels)

    assert st.termination_condition_ == "all_labeled"
def test_none_iter():
    """With ``max_iter=None`` everything gets labeled in a few iterations.

    The fit must stop in fewer than 10 iterations and report the
    "all_labeled" termination condition.
    """
    reasonable_iteration_cap = 10
    st = SelfTrainingClassifier(
        KNeighborsClassifier(),
        threshold=0.55,
        max_iter=None,
    )

    st.fit(X_train, y_train_missing_labels)

    assert st.n_iter_ < reasonable_iteration_cap
    assert st.termination_condition_ == "all_labeled"
def test_strings_dtype():
    """Fitting on the string targets built here must raise ``ValueError``.

    Integer blob labels are mapped to the names "one", "two" and "three"
    before fitting; the error message is expected to mention "dtype".
    """
    X, y = make_blobs(n_samples=30, random_state=0, cluster_std=0.1)
    class_names = ["one", "two", "three"]
    # Translate every integer label into its string name.
    y_strings = np.take(class_names, y)

    clf = SelfTrainingClassifier(KNeighborsClassifier())
    with pytest.raises(ValueError, match="dtype"):
        clf.fit(X, y_strings)
def test_early_stopping():
    """Training stops after one iteration when nothing more can be labeled.

    The sample ``[0.5]`` sits between the two labeled points and cannot be
    predicted with high confidence, so the fit is expected to end with
    ``n_iter_ == 1`` and the "no_change" termination condition.
    """
    X_train_easy = [[1], [0], [1], [0.5]]
    y_train_easy = [1, 0, -1, -1]

    st = SelfTrainingClassifier(SVC(gamma="scale", probability=True))
    st.fit(X_train_easy, y_train_easy)

    assert st.n_iter_ == 1
    assert st.termination_condition_ == "no_change"
def test_prefitted_throws_error():
    """Wrapping an already-fitted estimator does not make the wrapper fitted.

    Calling ``predict`` on the wrapper before its own ``fit`` must raise
    ``NotFittedError``, even though the inner KNN has been fitted.
    """
    fitted_knn = KNeighborsClassifier()
    fitted_knn.fit(X_train, y_train)
    st = SelfTrainingClassifier(fitted_knn)

    expected_message = "This SelfTrainingClassifier instance is not fitted yet"
    with pytest.raises(NotFittedError, match=expected_message):
        st.predict(X_train)
def test_verbose(capsys, verbose):
    """The word "iteration" is printed to stdout iff ``verbose`` is truthy.

    ``capsys`` is the pytest capture fixture; ``verbose`` is forwarded
    unchanged to the classifier.
    """
    clf = SelfTrainingClassifier(KNeighborsClassifier(), verbose=verbose)
    clf.fit(X_train, y_train_missing_labels)

    printed = capsys.readouterr().out
    mentions_iteration = "iteration" in printed
    # Progress output must appear exactly when verbosity is requested.
    assert mentions_iteration == bool(verbose)
def test_labeled_iter(max_iter):
    """Check ``labeled_iter_`` bookkeeping for a given ``max_iter``.

    The number of samples marked with iteration 0 must equal the number of
    labeled samples passed in, and no sample may carry an iteration index
    beyond ``n_iter_``, which itself may not exceed ``max_iter``.
    """
    st = SelfTrainingClassifier(KNeighborsClassifier(), max_iter=max_iter)
    st.fit(X_train, y_train_missing_labels)

    labeled_at_start = st.labeled_iter_ == 0
    assert np.sum(labeled_at_start) == n_labeled_samples

    latest_labeling = np.max(st.labeled_iter_)
    assert latest_labeling <= st.n_iter_
    assert st.n_iter_ <= max_iter
def test_no_unlabeled():
    """A fully labeled dataset gives the same predictions as the plain model.

    Fitting must warn that ``y`` contains no unlabeled samples, every sample
    must be recorded as labeled in iteration 0, and the termination
    condition must be "all_labeled".
    """
    plain_knn = KNeighborsClassifier()
    plain_knn.fit(X_train, y_train)

    st = SelfTrainingClassifier(plain_knn)
    with pytest.warns(UserWarning, match="y contains no unlabeled samples"):
        st.fit(X_train, y_train)

    assert_array_equal(plain_knn.predict(X_test), st.predict(X_test))
    # Nothing was unlabeled, so no sample may carry a non-zero iteration.
    assert not np.any(st.labeled_iter_ != 0)
    assert st.termination_condition_ == "all_labeled"
def test_zero_iterations(base_estimator, y):
    """Zero self-training iterations must match plain supervised fitting.

    The wrapper is fitted with ``max_iter=0`` on the full training set,
    while ``base_estimator`` itself is fitted on the first
    ``n_labeled_samples`` rows only; both must predict identically on
    ``X_test``, and the wrapper must report the "max_iter" termination
    condition.
    """
    self_trained = SelfTrainingClassifier(base_estimator, max_iter=0)
    self_trained.fit(X_train, y)

    X_labeled = X_train[:n_labeled_samples]
    y_labeled = y[:n_labeled_samples]
    supervised = base_estimator.fit(X_labeled, y_labeled)

    assert_array_equal(self_trained.predict(X_test), supervised.predict(X_test))
    assert self_trained.termination_condition_ == "max_iter"
def test_k_best_selects_best():
    """The labels added in one ``k_best`` iteration are the 10 most confident.

    A single self-training iteration with ``k_best=10`` is run, then the
    same SVC is fitted on the labeled rows only. The 10 unlabeled rows it
    is most confident about must all be among the rows the wrapper labeled.
    """
    n_selected = 10
    svc = SVC(gamma="scale", probability=True, random_state=0)
    st = SelfTrainingClassifier(
        svc, criterion="k_best", max_iter=1, k_best=n_selected
    )

    labeled_mask = y_train_missing_labels != -1
    unlabeled_mask = ~labeled_mask

    st.fit(X_train, y_train_missing_labels)
    newly_labeled_mask = unlabeled_mask & (st.transduction_ != -1)

    # Reference ranking: confidence of the plain SVC on the unlabeled rows.
    svc.fit(X_train[labeled_mask], y_train_missing_labels[labeled_mask])
    confidence = np.max(svc.predict_proba(X_train[unlabeled_mask]), axis=1)
    top_positions = np.argsort(confidence)[-n_selected:]

    expected_rows = X_train[unlabeled_mask][top_positions].tolist()
    rows_added_by_st = X_train[newly_labeled_mask].tolist()
    for expected_row in expected_rows:
        assert expected_row in rows_added_by_st
def plot_varying_threshold(self, base_classifier, X_train, y_train):
    """
    Plot the effect of varying threshold for self-training

    For every threshold in a fixed grid (0.4 to about 1.0 in steps of 0.05,
    plus 0.99999) a SelfTrainingClassifier is fitted and two quantities are
    recorded: how many samples carry a label at the end of fit, and the last
    iteration in which a sample was labeled.

    Parameters
    ___________
    base_classifier:
        Supervised classifier implementing both fit and predict_proba
    X_train:
        Scaled feature matrix of the training set
    y_train:
        Class label of the training set

    Returns
    _____________
    None. The figure (two side-by-side line plots) is displayed with
    plt.show() and is not returned.
    """
    x_values = np.arange(0.4, 1.05, 0.05)
    x_values = np.append(x_values, 0.99999)
    no_labeled = np.zeros(x_values.shape[0])
    no_iterations = np.zeros(x_values.shape[0])

    for i, threshold in enumerate(x_values):
        # Fit model with chosen base classifier
        self_training_clf = SelfTrainingClassifier(base_classifier, threshold=threshold)
        self_training_clf.fit(X_train, y_train)

        labeled_iter = self_training_clf.labeled_iter_
        # Samples left unlabeled are marked -1 in labeled_iter_. Counting
        # the entries that differ from -1 stays correct even when every
        # sample ends up labeled and no -1 is present at all.
        no_labeled[i] = np.count_nonzero(labeled_iter != -1)
        # The last iteration the classifier labeled a sample in
        no_iterations[i] = np.max(labeled_iter)

    # Plot figures
    # NOTE: this changes the global matplotlib font size for the process.
    plt.rcParams.update({'font.size': 15})
    _, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 4))
    ax1.plot(x_values, no_labeled, color='b')
    ax1.set_xlabel('Threshold')
    ax1.set_ylabel('Number of labeled samples')
    ax2.plot(x_values, no_iterations, color='b')
    ax2.set_ylabel('Number of iterations')
    ax2.set_xlabel('Threshold')
    plt.show()
def test_invalid_params(max_iter, threshold):
    """Invalid ``max_iter`` and ``threshold`` values must be rejected by fit.

    Each parameter is checked on its own freshly built classifier so that
    only one invalid value is involved per ``fit`` call.
    """

    def new_svc():
        return SVC(gamma="scale", probability=True)

    # Invalid number of iterations.
    bad_max_iter = SelfTrainingClassifier(new_svc(), max_iter=max_iter)
    with pytest.raises(ValueError, match="max_iter must be >= 0 or None"):
        bad_max_iter.fit(X_train, y_train)

    # Invalid threshold.
    bad_threshold = SelfTrainingClassifier(new_svc(), threshold=threshold)
    with pytest.raises(ValueError, match="threshold must be in"):
        bad_threshold.fit(X_train, y_train)
def test_k_best():
    """Check the per-iteration labeling schedule of the ``k_best`` criterion.

    Only the first training sample keeps its label. With ``k_best=10`` and
    ``max_iter=None`` the test expects one sample labeled at iteration 0,
    exactly 10 samples labeled in every iteration but the last, the
    remaining samples labeled in the last iteration, and the "all_labeled"
    termination condition.
    """
    k_best = 10
    st = SelfTrainingClassifier(
        KNeighborsClassifier(n_neighbors=1),
        criterion='k_best',
        k_best=k_best,
        max_iter=None,
    )

    y_train_only_one_label = np.copy(y_train)
    y_train_only_one_label[1:] = -1

    n_samples = y_train.shape[0]
    n_to_label = n_samples - 1
    n_expected_iter = ceil(n_to_label / k_best)
    # Size of the final batch. A plain `n_to_label % k_best` would give 0
    # when n_to_label is an exact multiple of k_best, although the last
    # iteration then labels a full batch of k_best samples.
    n_last_iter = n_to_label - k_best * (n_expected_iter - 1)

    st.fit(X_train, y_train_only_one_label)
    assert st.n_iter_ == n_expected_iter

    # Check labeled_iter_
    assert np.sum(st.labeled_iter_ == 0) == 1
    for i in range(1, n_expected_iter):
        assert np.sum(st.labeled_iter_ == i) == k_best
    assert np.sum(st.labeled_iter_ == n_expected_iter) == n_last_iter
    assert st.termination_condition_ == 'all_labeled'
def test_verbose_k_best(capsys):
    """Check the verbose progress messages of the ``k_best`` criterion.

    Only the first training sample keeps its label. With ``k_best=10``,
    ``verbose=True`` and ``max_iter=None`` the captured stdout must contain
    one "End of iteration ..." line per iteration, reporting 10 new labels
    for every iteration but the last and the remaining count for the last.

    ``capsys`` is the pytest capture fixture.
    """
    k_best = 10
    st = SelfTrainingClassifier(
        KNeighborsClassifier(n_neighbors=1),
        criterion='k_best',
        k_best=k_best,
        verbose=True,
        max_iter=None,
    )

    y_train_only_one_label = np.copy(y_train)
    y_train_only_one_label[1:] = -1

    n_samples = y_train.shape[0]
    n_to_label = n_samples - 1
    n_expected_iter = ceil(n_to_label / k_best)
    # Size of the final batch. A plain `n_to_label % k_best` would give 0
    # when n_to_label is an exact multiple of k_best, although the last
    # iteration then adds a full batch of k_best labels.
    n_last_iter = n_to_label - k_best * (n_expected_iter - 1)

    st.fit(X_train, y_train_only_one_label)

    captured = capsys.readouterr()
    msg = 'End of iteration {}, added {} new labels.'
    for i in range(1, n_expected_iter):
        assert msg.format(i, k_best) in captured.out
    assert msg.format(n_expected_iter, n_last_iter) in captured.out
def self_training_clf(self, base_classifier, X_train, y_train, threshold=None, max_iter=None, verbose=None):
    """
    Train self-training classifier from scikit-learn >= 0.24.1

    Parameters
    ___________
    base_classifier:
        Supervised classifier implementing both fit and predict_proba
    X_train:
        Scaled feature matrix of the training set
    y_train:
        Class label of the training set
    threshold (float):
        The decision threshold for use with criterion='threshold'.
        Should be in [0, 1). When None, the argument is not forwarded and
        SelfTrainingClassifier uses its own default.
    max_iter (int):
        Maximum number of iterations allowed. Should be greater than or
        equal to 0. Always forwarded as given, including None.
    verbose (bool):
        Enable verbose output. When None, the argument is not forwarded and
        SelfTrainingClassifier uses its own default.

    Returns
    _____________
    Predicted labels and probability, both computed on X_train
    """
    # Forward threshold/verbose only when the caller supplied them. Passing
    # None through verbatim (the defaults of this method) is not a valid
    # threshold and made the default call fail during fit.
    # max_iter is forwarded unchanged so that an explicit None keeps its
    # meaning for callers that rely on it.
    params = {"max_iter": max_iter}
    if threshold is not None:
        params["threshold"] = threshold
    if verbose is not None:
        params["verbose"] = verbose

    # Self training model
    model = SelfTrainingClassifier(base_classifier, **params)

    # Fit the training set
    model.fit(X_train, y_train)

    # Predict labels and class probabilities for every row of X_train
    # (labeled and unlabeled alike)
    predicted_labels = model.predict(X_train)
    predicted_proba = model.predict_proba(X_train)

    return predicted_labels, predicted_proba
def test_sanity_classification():
    """Self-training must predict differently from, and beat, the baseline.

    The baseline SVC is fitted directly on ``X_train[n_labeled_samples:]``
    with its true labels; the self-training model wraps the same estimator
    and is fitted on the full training set with missing labels. Predictions
    on ``X_test`` must differ, and the self-training accuracy must be
    strictly higher.
    """
    supervised = SVC(gamma="scale", probability=True)
    supervised.fit(X_train[n_labeled_samples:], y_train[n_labeled_samples:])

    self_trained = SelfTrainingClassifier(supervised)
    self_trained.fit(X_train, y_train_missing_labels)

    pred_supervised = supervised.predict(X_test)
    pred_self_trained = self_trained.predict(X_test)
    assert not np.array_equal(pred_supervised, pred_self_trained)

    score_supervised = accuracy_score(pred_supervised, y_test)
    score_self_training = accuracy_score(pred_self_trained, y_test)
    assert score_self_training > score_supervised
def self_training(x_train_all, y_train_all, cv_semisupervised, base_model,
                  name="SelfTrainingClassifier", k_best=100, max_iter=None,
                  only_model=False, **kwargs):
    """
    Self training - a semisupervised model.

    Fits a verbose SelfTrainingClassifier around ``base_model`` on the full
    data set and either returns the fitted model or the result of
    ``calculate_metrics_cv`` for it.

    Parameters:
        x_train_all (pd.DataFrame): features of both the labelled and the
            unlabelled data.
        y_train_all (pd.Series): labels of the labelled and unlabelled data.
            Unlabelled data must have label -1.
        cv_semisupervised (list): list of (train, test) index tuples, one
            per fold; passed through to ``calculate_metrics_cv``.
        base_model (model): model that has a fit function.
        name (str): name/description for the model; passed through to
            ``calculate_metrics_cv``.
        k_best (int): forwarded to SelfTrainingClassifier.
            NOTE(review): ``criterion`` is left at the library default here,
            so this value may have no effect - confirm.
        max_iter (int or None): forwarded to SelfTrainingClassifier.
        only_model (bool): if True, return only the fitted model.
        **kwargs: accepted but not used by this function.

    Returns:
        The fitted model when ``only_model`` is True, otherwise whatever
        ``calculate_metrics_cv`` returns (per the original docstring: a dict
        with the cross-validation results).
    """
    # TODO cv: use the same cv split but randomly assign the other
    # unlabelled data pieces to the other cv folds
    classifier = SelfTrainingClassifier(
        base_model,
        verbose=True,
        max_iter=max_iter,
        k_best=k_best,
    )
    st_model = classifier.fit(x_train_all, y_train_all)

    if not only_model:
        return calculate_metrics_cv(
            model=st_model,
            X=x_train_all,
            y_true=y_train_all,
            cv=cv_semisupervised,
            name=name,
        )
    return st_model
skip_methods=["transform", "inverse_transform"]), DelegatorData( "BaggingClassifier", BaggingClassifier, skip_methods=[ "transform", "inverse_transform", "score", "predict_proba", "predict_log_proba", "predict", ], ), DelegatorData( "SelfTrainingClassifier", lambda est: SelfTrainingClassifier(est), skip_methods=["transform", "inverse_transform", "predict_proba"], ), ] def test_metaestimator_delegation(): # Ensures specified metaestimators have methods iff subestimator does def hides(method): @property def wrapper(obj): if obj.hidden_method == method.__name__: raise AttributeError("%r is hidden" % obj.hidden_method) return functools.partial(method, obj) return wrapper
# classifier that implements :term:`predict_proba`. The sub-classifier # will behave as a # semi-supervised classifier, allowing it to learn from unlabeled data. # Read more in the :ref:`User guide <self_training>`. import numpy as np from sklearn import datasets from sklearn.semi_supervised import SelfTrainingClassifier from sklearn.svm import SVC rng = np.random.RandomState(42) iris = datasets.load_iris() random_unlabeled_points = rng.rand(iris.target.shape[0]) < 0.3 iris.target[random_unlabeled_points] = -1 svc = SVC(probability=True, gamma="auto") self_training_model = SelfTrainingClassifier(svc) self_training_model.fit(iris.data, iris.target) ############################################################################## # New SequentialFeatureSelector transformer # ----------------------------------------- # A new iterative transformer to select features is available: # :class:`~sklearn.feature_selection.SequentialFeatureSelector`. # Sequential Feature Selection can add features one at a time (forward # selection) or remove features from the list of the available features # (backward selection), based on a cross-validated score maximization. # See the :ref:`User Guide <sequential_feature_selection>`. from sklearn.feature_selection import SequentialFeatureSelector from sklearn.neighbors import KNeighborsClassifier from sklearn.datasets import load_iris
rng = np.random.RandomState(0) y_rand = rng.rand(y.shape[0]) y_30 = np.copy(y) y_30[y_rand < 0.3] = -1 # set random samples to be unlabeled y_50 = np.copy(y) y_50[y_rand < 0.5] = -1 # we create an instance of SVM and fit out data. We do not scale our # data since we want to plot the support vectors ls30 = (LabelSpreading().fit(X, y_30), y_30, "Label Spreading 30% data") ls50 = (LabelSpreading().fit(X, y_50), y_50, "Label Spreading 50% data") ls100 = (LabelSpreading().fit(X, y), y, "Label Spreading 100% data") # the base classifier for self-training is identical to the SVC base_classifier = SVC(kernel="rbf", gamma=0.5, probability=True) st30 = ( SelfTrainingClassifier(base_classifier).fit(X, y_30), y_30, "Self-training 30% data", ) st50 = ( SelfTrainingClassifier(base_classifier).fit(X, y_50), y_50, "Self-training 50% data", ) rbf_svc = (SVC(kernel="rbf", gamma=0.5).fit(X, y), y, "SVC with rbf kernel") # create a mesh to plot in x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
SelfTrainingClassifier ) from sklearn.svm import SVC from main import plot_decision_boundary, get_data if __name__ == '__main__': X_train, X_test, y_train, y_test = get_data() X = np.concatenate([X_train, X_test], axis=0) y = np.concatenate([y_train, -1 * np.ones_like(y_test)], axis=0) models = ( LabelPropagation(max_iter=10000), LabelSpreading(), SelfTrainingClassifier(base_estimator=SVC(probability=True, gamma="auto")) ) color_maps = ('Blues', 'Greens', 'Reds') for model, cmap in zip(models, color_maps): model.fit(X, y) y_pred = model.predict(X[y == -1]) sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="d", cmap=cmap) print('-'*50, f'\nModel name: {model.__str__()}\n' f'Accuracy_score: {accuracy_score(y_test, y_pred)}') plt.show() plot_decision_boundary(X, y, y_test, y_pred, model)
X, y = datasets.load_breast_cancer(return_X_y=True) X, y = shuffle(X, y, random_state=42) y_true = y.copy() y[50:] = -1 total_samples = y.shape[0] base_classifier = SVC(probability=True, gamma=0.001, random_state=42) x_values = np.arange(0.4, 1.05, 0.05) x_values = np.append(x_values, 0.99999) scores = np.empty((x_values.shape[0], n_splits)) amount_labeled = np.empty((x_values.shape[0], n_splits)) amount_iterations = np.empty((x_values.shape[0], n_splits)) for (i, threshold) in enumerate(x_values): self_training_clf = SelfTrainingClassifier(base_classifier, threshold=threshold) # We need manual cross validation so that we don't treat -1 as a separate # class when computing accuracy skfolds = StratifiedKFold(n_splits=n_splits) for fold, (train_index, test_index) in enumerate(skfolds.split(X, y)): X_train = X[train_index] y_train = y[train_index] X_test = X[test_index] y_test = y[test_index] y_test_true = y_true[test_index] self_training_clf.fit(X_train, y_train) # The amount of labeled samples that at the end of fitting amount_labeled[i, fold] = (total_samples - np.unique(
rng = np.random.RandomState(0) y_rand = rng.rand(y.shape[0]) y_30 = np.copy(y) y_30[y_rand < 0.3] = -1 # set random samples to be unlabeled y_50 = np.copy(y) y_50[y_rand < 0.5] = -1 # we create an instance of SVM and fit out data. We do not scale our # data since we want to plot the support vectors ls30 = (LabelSpreading().fit(X, y_30), y_30, 'Label Spreading 30% data') ls50 = (LabelSpreading().fit(X, y_50), y_50, 'Label Spreading 50% data') ls100 = (LabelSpreading().fit(X, y), y, 'Label Spreading 100% data') # the base classifier for self-training is identical to the SVC base_classifier = SVC(kernel='rbf', gamma=.5, probability=True) st30 = (SelfTrainingClassifier(base_classifier).fit(X, y_30), y_30, 'Self-training 30% data') st50 = (SelfTrainingClassifier(base_classifier).fit(X, y_50), y_50, 'Self-training 50% data') rbf_svc = (SVC(kernel='rbf', gamma=.5).fit(X, y), y, 'SVC with rbf kernel') # create a mesh to plot in x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) color_map = {-1: (1, 1, 1), 0: (0, 0, .9), 1: (1, 0, 0), 2: (.8, .6, 0)} classifiers = (ls30, st30, ls50, st50, ls100, rbf_svc) for i, (clf, y_train, title) in enumerate(classifiers):
def test_classification(base_estimator, selection_crit):
    """Check classification for a given base estimator and criterion.

    Two classifiers with identical settings are fitted, one on numeric
    labels and one on string labels. Their predictions (after mapping),
    probabilities and termination conditions must agree. The bookkeeping
    attributes ``labeled_iter_``, ``n_iter_`` and ``transduction_`` are
    checked for internal consistency as well.
    """
    threshold = 0.75
    max_iter = 10
    settings = dict(
        max_iter=max_iter,
        threshold=threshold,
        criterion=selection_crit,
    )

    # Numeric labels.
    st = SelfTrainingClassifier(base_estimator, **settings)
    st.fit(X_train, y_train_missing_labels)
    pred = st.predict(X_test)
    proba = st.predict_proba(X_test)

    # String labels.
    st_string = SelfTrainingClassifier(base_estimator, **settings)
    st_string.fit(X_train, y_train_missing_strings)
    pred_string = st_string.predict(X_test)
    proba_string = st_string.predict_proba(X_test)

    # Both label encodings must lead to the same model behaviour.
    pred_as_strings = np.vectorize(mapping.get)(pred)
    assert_array_equal(pred_as_strings, pred_string)
    assert_array_equal(proba, proba_string)
    assert st.termination_condition_ == st_string.termination_condition_

    # Samples that came with a label are exactly those marked iteration 0 ...
    initially_labeled = y_train_missing_labels != -1
    assert_array_equal(st.labeled_iter_ == 0, initially_labeled)
    # ... and their labels are never changed during training.
    assert_array_equal(
        y_train_missing_labels[initially_labeled],
        st.transduction_[initially_labeled],
    )

    # No sample is labeled later than the last iteration, and the number of
    # iterations never exceeds max_iter. Attribute shapes must match too.
    for model in (st, st_string):
        assert np.max(model.labeled_iter_) <= model.n_iter_ <= max_iter
    for model in (st, st_string):
        assert model.labeled_iter_.shape == model.transduction_.shape
# Parameters sdg_params = dict(alpha=1e-5, penalty='l2', loss='log') vectorizer_params = dict(ngram_range=(1, 2), min_df=5, max_df=0.8) # Supervised Pipeline pipeline = Pipeline([ ('vect', CountVectorizer(**vectorizer_params)), ('tfidf', TfidfTransformer()), ('clf', SGDClassifier(**sdg_params)), ]) # SelfTraining Pipeline st_pipeline = Pipeline([ ('vect', CountVectorizer(**vectorizer_params)), ('tfidf', TfidfTransformer()), ('clf', SelfTrainingClassifier(SGDClassifier(**sdg_params), verbose=True)), ]) # LabelSpreading Pipeline ls_pipeline = Pipeline([ ('vect', CountVectorizer(**vectorizer_params)), ('tfidf', TfidfTransformer()), # LabelSpreading does not support dense matrices ('todense', FunctionTransformer(lambda x: x.todense())), ('clf', LabelSpreading()), ]) def eval_and_print_metrics(clf, X_train, y_train, X_test, y_test): print("Number of training samples:", len(X_train)) print("Unlabeled samples in training set:", sum(1 for x in y_train if x == -1))
def test_invalid_params_selection_crit():
    """An unknown ``criterion`` value must make ``fit`` raise ``ValueError``."""
    unknown_criterion = "foo"
    st = SelfTrainingClassifier(KNeighborsClassifier(), criterion=unknown_criterion)

    expected_message = "criterion must be either"
    with pytest.raises(ValueError, match=expected_message):
        st.fit(X_train, y_train)
def test_none_classifier():
    """Passing ``None`` as base estimator must make ``fit`` raise ``ValueError``."""
    st = SelfTrainingClassifier(None)

    expected_message = "base_estimator cannot be None"
    with pytest.raises(ValueError, match=expected_message):
        st.fit(X_train, y_train_missing_labels)