def test_base(): # Check BaseEnsemble methods. ensemble = BaggingClassifier(base_estimator=Perceptron(random_state=None), n_estimators=3) iris = load_iris() ensemble.fit(iris.data, iris.target) ensemble.estimators_ = [] # empty the list and create estimators manually ensemble._make_estimator() random_state = np.random.RandomState(3) ensemble._make_estimator(random_state=random_state) ensemble._make_estimator(random_state=random_state) ensemble._make_estimator(append=False) assert 3 == len(ensemble) assert 3 == len(ensemble.estimators_) assert isinstance(ensemble[0], Perceptron) assert ensemble[0].random_state is None assert isinstance(ensemble[1].random_state, int) assert isinstance(ensemble[2].random_state, int) assert ensemble[1].random_state != ensemble[2].random_state np_int_ensemble = BaggingClassifier(base_estimator=Perceptron(), n_estimators=np.int32(3)) np_int_ensemble.fit(iris.data, iris.target)
def test_perceptron_correctness(): y_bin = y.copy() y_bin[y != 1] = -1 clf1 = MyPerceptron(n_iter=2) clf1.fit(X, y_bin) clf2 = Perceptron(max_iter=2, shuffle=False, tol=None) clf2.fit(X, y_bin) assert_array_almost_equal(clf1.w, clf2.coef_.ravel())
def test_base_not_int_n_estimators(): # Check that instantiating a BaseEnsemble with a string as n_estimators # raises a ValueError demanding n_estimators to be supplied as an integer. string_ensemble = BaggingClassifier(base_estimator=Perceptron(), n_estimators='3') iris = load_iris() assert_raise_message(ValueError, "n_estimators must be an integer", string_ensemble.fit, iris.data, iris.target) float_ensemble = BaggingClassifier(base_estimator=Perceptron(), n_estimators=3.0) assert_raise_message(ValueError, "n_estimators must be an integer", float_ensemble.fit, iris.data, iris.target)
def test_set_random_states(): # Linear Discriminant Analysis doesn't have random state: smoke test _set_random_states(LinearDiscriminantAnalysis(), random_state=17) clf1 = Perceptron(random_state=None) assert clf1.random_state is None # check random_state is None still sets _set_random_states(clf1, None) assert isinstance(clf1.random_state, int) # check random_state fixes results in consistent initialisation _set_random_states(clf1, 3) assert isinstance(clf1.random_state, int) clf2 = Perceptron(random_state=None) _set_random_states(clf2, 3) assert clf1.random_state == clf2.random_state # nested random_state def make_steps(): return [('sel', SelectFromModel(Perceptron(random_state=None))), ('clf', Perceptron(random_state=None))] est1 = Pipeline(make_steps()) _set_random_states(est1, 3) assert isinstance(est1.steps[0][1].estimator.random_state, int) assert isinstance(est1.steps[1][1].random_state, int) assert (est1.get_params()['sel__estimator__random_state'] != est1.get_params()['clf__random_state']) # ensure multiple random_state parameters are invariant to get_params() # iteration order class AlphaParamPipeline(Pipeline): def get_params(self, *args, **kwargs): params = Pipeline.get_params(self, *args, **kwargs).items() return OrderedDict(sorted(params)) class RevParamPipeline(Pipeline): def get_params(self, *args, **kwargs): params = Pipeline.get_params(self, *args, **kwargs).items() return OrderedDict(sorted(params, reverse=True)) for cls in [AlphaParamPipeline, RevParamPipeline]: est2 = cls(make_steps()) _set_random_states(est2, 3) assert (est1.get_params()['sel__estimator__random_state'] == est2.get_params()['sel__estimator__random_state']) assert (est1.get_params()['clf__random_state'] == est2.get_params() ['clf__random_state'])
def test_base_zero_n_estimators(): # Check that instantiating a BaseEnsemble with n_estimators<=0 raises # a ValueError. ensemble = BaggingClassifier(base_estimator=Perceptron(), n_estimators=0) iris = load_iris() assert_raise_message(ValueError, "n_estimators must be greater than zero, got 0.", ensemble.fit, iris.data, iris.target)
def test_gridsearch_pipeline(): # Test if we can do a grid-search to find parameters to separate # circles with a perceptron model. X, y = make_circles(n_samples=400, factor=.3, noise=.05, random_state=0) kpca = KernelPCA(kernel="rbf", n_components=2) pipeline = Pipeline([("kernel_pca", kpca), ("Perceptron", Perceptron(max_iter=5))]) param_grid = dict(kernel_pca__gamma=2.**np.arange(-2, 2)) grid_search = GridSearchCV(pipeline, cv=3, param_grid=param_grid) grid_search.fit(X, y) assert grid_search.best_score_ == 1
def test_ovo_ties2(): # test that ties can not only be won by the first two labels X = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]]) y_ref = np.array([2, 0, 1, 2]) # cycle through labels so that each label wins once for i in range(3): y = (y_ref + i) % 3 multi_clf = OneVsOneClassifier( Perceptron(shuffle=False, max_iter=4, tol=None)) ovo_prediction = multi_clf.fit(X, y).predict(X) assert ovo_prediction[0] == i % 3
def test_nested_circles(): # Test the linear separability of the first 2D KPCA transform X, y = make_circles(n_samples=400, factor=.3, noise=.05, random_state=0) # 2D nested circles are not linearly separable train_score = Perceptron(max_iter=5).fit(X, y).score(X, y) assert train_score < 0.8 # Project the circles data into the first 2 components of a RBF Kernel # PCA model. # Note that the gamma value is data dependent. If this test breaks # and the gamma value has to be updated, the Kernel PCA example will # have to be updated too. kpca = KernelPCA(kernel="rbf", n_components=2, fit_inverse_transform=True, gamma=2.) X_kpca = kpca.fit_transform(X) # The data is perfectly linearly separable in that space train_score = Perceptron(max_iter=5).fit(X_kpca, y).score(X_kpca, y) assert train_score == 1.0
def test_ovo_ties(): # Test that ties are broken using the decision function, # not defaulting to the smallest label X = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]]) y = np.array([2, 0, 1, 2]) multi_clf = OneVsOneClassifier( Perceptron(shuffle=False, max_iter=4, tol=None)) ovo_prediction = multi_clf.fit(X, y).predict(X) ovo_decision = multi_clf.decision_function(X) # Classifiers are in order 0-1, 0-2, 1-2 # Use decision_function to compute the votes and the normalized # sum_of_confidences, which is used to disambiguate when there is a tie in # votes. votes = np.round(ovo_decision) normalized_confidences = ovo_decision - votes # For the first point, there is one vote per class assert_array_equal(votes[0, :], 1) # For the rest, there is no tie and the prediction is the argmax assert_array_equal(np.argmax(votes[1:], axis=1), ovo_prediction[1:]) # For the tie, the prediction is the class with the highest score assert ovo_prediction[0] == normalized_confidences[0].argmax()
def make_steps(): return [('sel', SelectFromModel(Perceptron(random_state=None))), ('clf', Perceptron(random_state=None))]
import numpy as np import matplotlib.pyplot as plt from mrex import datasets from mrex.model_selection import train_test_split from mrex.linear_model import SGDClassifier, Perceptron from mrex.linear_model import PassiveAggressiveClassifier from mrex.linear_model import LogisticRegression heldout = [0.95, 0.90, 0.75, 0.50, 0.01] rounds = 20 X, y = datasets.load_digits(return_X_y=True) classifiers = [("SGD", SGDClassifier(max_iter=100)), ("ASGD", SGDClassifier(average=True)), ("Perceptron", Perceptron()), ("Passive-Aggressive I", PassiveAggressiveClassifier(loss='hinge', C=1.0, tol=1e-4)), ("Passive-Aggressive II", PassiveAggressiveClassifier(loss='squared_hinge', C=1.0, tol=1e-4)), ("SAG", LogisticRegression(solver='sag', tol=1e-1, C=1.e4 / X.shape[0]))] xx = 1. - np.array(heldout) for name, clf in classifiers: print("training %s" % name) rng = np.random.RandomState(42)
# Iterator over parsed Reuters SGML files. data_stream = stream_reuters_documents() # We learn a binary classification between the "acq" class and all the others. # "acq" was chosen as it is more or less evenly distributed in the Reuters # files. For other datasets, one should take care of creating a test set with # a realistic portion of positive instances. all_classes = np.array([0, 1]) positive_class = 'acq' # Here are some classifiers that support the `partial_fit` method partial_fit_classifiers = { 'SGD': SGDClassifier(max_iter=5), 'Perceptron': Perceptron(), 'NB Multinomial': MultinomialNB(alpha=0.01), 'Passive-Aggressive': PassiveAggressiveClassifier(), } def get_minibatch(doc_iter, size, pos_class=positive_class): """Extract a minibatch of examples, return a tuple X_text, y. Note: size is before excluding invalid docs with no topics assigned. """ data = [('{title}\n\n{body}'.format(**doc), pos_class in doc['topics']) for doc in itertools.islice(doc_iter, size) if doc['topics']] if not len(data):
# Split the dataset in training and test set: docs_train, docs_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.5) # TASK: Build a vectorizer that splits strings into sequence of 1 to 3 # characters instead of word tokens vectorizer = TfidfVectorizer(ngram_range=(1, 3), analyzer='char', use_idf=False) # TASK: Build a vectorizer / classifier pipeline using the previous analyzer # the pipeline instance should stored in a variable named clf clf = Pipeline([ ('vec', vectorizer), ('clf', Perceptron()), ]) # TASK: Fit the pipeline on the training set clf.fit(docs_train, y_train) # TASK: Predict the outcome on the testing set in a variable named y_predicted y_predicted = clf.predict(docs_test) # Print the classification report print( metrics.classification_report(y_test, y_predicted, target_names=dataset.target_names)) # Plot the confusion matrix
def test_undefined_methods(): clf = Perceptron(max_iter=100) for meth in ("predict_proba", "predict_log_proba"): assert_raises(AttributeError, lambda x: getattr(clf, x), meth)
def test_perceptron_accuracy(): for data in (X, X_csr): clf = Perceptron(max_iter=100, tol=None, shuffle=False) clf.fit(data, y) score = clf.score(data, y) assert score > 0.7