def test_tie_situation(): """Check voting classifier selects smaller class label in tie situation.""" clf1 = LogisticRegression(random_state=123, solver='liblinear') clf2 = RandomForestClassifier(random_state=123) eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2)], voting='hard') assert clf1.fit(X, y).predict(X)[73] == 2 assert clf2.fit(X, y).predict(X)[73] == 1 assert eclf.fit(X, y).predict(X)[73] == 1
def test_predictproba_hardvoting(): eclf = VotingClassifier(estimators=[('lr1', LogisticRegression()), ('lr2', LogisticRegression())], voting='hard') msg = "predict_proba is not available when voting='hard'" with pytest.raises(AttributeError, match=msg): eclf.predict_proba assert not hasattr(eclf, "predict_proba") eclf.fit(X, y) assert not hasattr(eclf, "predict_proba")
def test_sample_weight_kwargs(): """Check that VotingClassifier passes sample_weight as kwargs""" class MockClassifier(BaseEstimator, ClassifierMixin): """Mock Classifier to check that sample_weight is received as kwargs""" def fit(self, X, y, *args, **sample_weight): assert 'sample_weight' in sample_weight clf = MockClassifier() eclf = VotingClassifier(estimators=[('mock', clf)], voting='soft') # Should not raise an error. eclf.fit(X, y, sample_weight=np.ones((len(y),)))
def test_multilabel(): """Check if error is raised for multilabel classification.""" X, y = make_multilabel_classification(n_classes=2, n_labels=1, allow_unlabeled=False, random_state=123) clf = OneVsRestClassifier(SVC(kernel='linear')) eclf = VotingClassifier(estimators=[('ovr', clf)], voting='hard') try: eclf.fit(X, y) except NotImplementedError: return
def test_estimator_weights_format(): # Test estimator weights inputs as list and array clf1 = LogisticRegression(random_state=123) clf2 = RandomForestClassifier(random_state=123) eclf1 = VotingClassifier(estimators=[ ('lr', clf1), ('rf', clf2)], weights=[1, 2], voting='soft') eclf2 = VotingClassifier(estimators=[ ('lr', clf1), ('rf', clf2)], weights=np.array((1, 2)), voting='soft') eclf1.fit(X, y) eclf2.fit(X, y) assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
def test_majority_label_iris(): """Check classification by majority label on dataset iris.""" clf1 = LogisticRegression(solver='liblinear', random_state=123) clf2 = RandomForestClassifier(n_estimators=10, random_state=123) clf3 = GaussianNB() eclf = VotingClassifier(estimators=[ ('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard') scores = cross_val_score(eclf, X, y, scoring='accuracy') assert_almost_equal(scores.mean(), 0.95, decimal=2)
def test_weights_iris(): """Check classification by average probabilities on dataset iris.""" clf1 = LogisticRegression(random_state=123) clf2 = RandomForestClassifier(random_state=123) clf3 = GaussianNB() eclf = VotingClassifier(estimators=[ ('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft', weights=[1, 2, 10]) scores = cross_val_score(eclf, X, y, scoring='accuracy') assert_almost_equal(scores.mean(), 0.93, decimal=2)
def test_estimator_init(): eclf = VotingClassifier(estimators=[]) msg = ('Invalid `estimators` attribute, `estimators` should be' ' a list of (string, estimator) tuples') assert_raise_message(AttributeError, msg, eclf.fit, X, y) clf = LogisticRegression(random_state=1) eclf = VotingClassifier(estimators=[('lr', clf)], voting='error') msg = ('Voting must be \'soft\' or \'hard\'; got (voting=\'error\')') assert_raise_message(ValueError, msg, eclf.fit, X, y) eclf = VotingClassifier(estimators=[('lr', clf)], weights=[1, 2]) msg = ('Number of `estimators` and weights must be equal' '; got 2 weights, 1 estimators') assert_raise_message(ValueError, msg, eclf.fit, X, y) eclf = VotingClassifier(estimators=[('lr', clf), ('lr', clf)], weights=[1, 2]) msg = "Names provided are not unique: ['lr', 'lr']" assert_raise_message(ValueError, msg, eclf.fit, X, y) eclf = VotingClassifier(estimators=[('lr__', clf)]) msg = "Estimator names must not contain __: got ['lr__']" assert_raise_message(ValueError, msg, eclf.fit, X, y) eclf = VotingClassifier(estimators=[('estimators', clf)]) msg = "Estimator names conflict with constructor arguments: ['estimators']" assert_raise_message(ValueError, msg, eclf.fit, X, y)
def test_predict_on_toy_problem(): """Manually check predicted class labels for toy dataset.""" clf1 = LogisticRegression(random_state=123) clf2 = RandomForestClassifier(random_state=123) clf3 = GaussianNB() X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2], [2.1, 1.4], [3.1, 2.3]]) y = np.array([1, 1, 1, 2, 2, 2]) assert all(clf1.fit(X, y).predict(X)) == all([1, 1, 1, 2, 2, 2]) assert all(clf2.fit(X, y).predict(X)) == all([1, 1, 1, 2, 2, 2]) assert all(clf3.fit(X, y).predict(X)) == all([1, 1, 1, 2, 2, 2]) eclf = VotingClassifier(estimators=[ ('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard', weights=[1, 1, 1]) assert all(eclf.fit(X, y).predict(X)) == all([1, 1, 1, 2, 2, 2]) eclf = VotingClassifier(estimators=[ ('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft', weights=[1, 1, 1]) assert all(eclf.fit(X, y).predict(X)) == all([1, 1, 1, 2, 2, 2])
def test_gridsearch(): """Check GridSearch support.""" clf1 = LogisticRegression(random_state=1) clf2 = RandomForestClassifier(random_state=1) clf3 = GaussianNB() eclf = VotingClassifier(estimators=[ ('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft') params = {'lr__C': [1.0, 100.0], 'voting': ['soft', 'hard'], 'weights': [[0.5, 0.5, 0.5], [1.0, 0.5, 0.5]]} grid = GridSearchCV(estimator=eclf, param_grid=params) grid.fit(iris.data, iris.target)
def test_notfitted(): eclf = VotingClassifier(estimators=[('lr1', LogisticRegression()), ('lr2', LogisticRegression())], voting='soft') ereg = VotingRegressor([('dr', DummyRegressor())]) msg = ("This %s instance is not fitted yet. Call \'fit\'" " with appropriate arguments before using this method.") assert_raise_message(NotFittedError, msg % 'VotingClassifier', eclf.predict, X) assert_raise_message(NotFittedError, msg % 'VotingClassifier', eclf.predict_proba, X) assert_raise_message(NotFittedError, msg % 'VotingClassifier', eclf.transform, X) assert_raise_message(NotFittedError, msg % 'VotingRegressor', ereg.predict, X_r) assert_raise_message(NotFittedError, msg % 'VotingRegressor', ereg.transform, X_r)
def test_parallel_fit(): """Check parallel backend of VotingClassifier on toy dataset.""" clf1 = LogisticRegression(random_state=123) clf2 = RandomForestClassifier(random_state=123) clf3 = GaussianNB() X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]]) y = np.array([1, 1, 2, 2]) eclf1 = VotingClassifier(estimators=[ ('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft', n_jobs=1).fit(X, y) eclf2 = VotingClassifier(estimators=[ ('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft', n_jobs=2).fit(X, y) assert_array_equal(eclf1.predict(X), eclf2.predict(X)) assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
def test_predict_proba_on_toy_problem(): """Calculate predicted probabilities on toy dataset.""" clf1 = LogisticRegression(random_state=123) clf2 = RandomForestClassifier(random_state=123) clf3 = GaussianNB() X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]]) y = np.array([1, 1, 2, 2]) clf1_res = np.array([[0.59790391, 0.40209609], [0.57622162, 0.42377838], [0.50728456, 0.49271544], [0.40241774, 0.59758226]]) clf2_res = np.array([[0.8, 0.2], [0.8, 0.2], [0.2, 0.8], [0.3, 0.7]]) clf3_res = np.array([[0.9985082, 0.0014918], [0.99845843, 0.00154157], [0., 1.], [0., 1.]]) t00 = (2*clf1_res[0][0] + clf2_res[0][0] + clf3_res[0][0]) / 4 t11 = (2*clf1_res[1][1] + clf2_res[1][1] + clf3_res[1][1]) / 4 t21 = (2*clf1_res[2][1] + clf2_res[2][1] + clf3_res[2][1]) / 4 t31 = (2*clf1_res[3][1] + clf2_res[3][1] + clf3_res[3][1]) / 4 eclf = VotingClassifier(estimators=[ ('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft', weights=[2, 1, 1]) eclf_res = eclf.fit(X, y).predict_proba(X) assert_almost_equal(t00, eclf_res[0][0], decimal=1) assert_almost_equal(t11, eclf_res[1][1], decimal=1) assert_almost_equal(t21, eclf_res[2][1], decimal=1) assert_almost_equal(t31, eclf_res[3][1], decimal=1) with pytest.raises( AttributeError, match="predict_proba is not available when voting='hard'"): eclf = VotingClassifier(estimators=[ ('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard') eclf.fit(X, y).predict_proba(X)
def test_sample_weight(): """Tests sample_weight parameter of VotingClassifier""" clf1 = LogisticRegression(random_state=123) clf2 = RandomForestClassifier(random_state=123) clf3 = SVC(probability=True, random_state=123) eclf1 = VotingClassifier(estimators=[ ('lr', clf1), ('rf', clf2), ('svc', clf3)], voting='soft').fit(X, y, sample_weight=np.ones((len(y),))) eclf2 = VotingClassifier(estimators=[ ('lr', clf1), ('rf', clf2), ('svc', clf3)], voting='soft').fit(X, y) assert_array_equal(eclf1.predict(X), eclf2.predict(X)) assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X)) sample_weight = np.random.RandomState(123).uniform(size=(len(y),)) eclf3 = VotingClassifier(estimators=[('lr', clf1)], voting='soft') eclf3.fit(X, y, sample_weight) clf1.fit(X, y, sample_weight) assert_array_equal(eclf3.predict(X), clf1.predict(X)) assert_array_almost_equal(eclf3.predict_proba(X), clf1.predict_proba(X)) # check that an error is raised and indicative if sample_weight is not # supported. clf4 = KNeighborsClassifier() eclf3 = VotingClassifier(estimators=[ ('lr', clf1), ('svc', clf3), ('knn', clf4)], voting='soft') msg = ('Underlying estimator KNeighborsClassifier does not support ' 'sample weights.') with pytest.raises(ValueError, match=msg): eclf3.fit(X, y, sample_weight) # check that _parallel_fit_estimator will raise the right error # it should raise the original error if this is not linked to sample_weight class ClassifierErrorFit(BaseEstimator, ClassifierMixin): def fit(self, X, y, sample_weight): raise TypeError('Error unrelated to sample_weight.') clf = ClassifierErrorFit() with pytest.raises(TypeError, match='Error unrelated to sample_weight'): clf.fit(X, y, sample_weight=sample_weight)
def test_transform(): """Check transform method of VotingClassifier on toy dataset.""" clf1 = LogisticRegression(random_state=123) clf2 = RandomForestClassifier(random_state=123) clf3 = GaussianNB() X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]]) y = np.array([1, 1, 2, 2]) eclf1 = VotingClassifier(estimators=[ ('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft').fit(X, y) eclf2 = VotingClassifier(estimators=[ ('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft', flatten_transform=True).fit(X, y) eclf3 = VotingClassifier(estimators=[ ('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft', flatten_transform=False).fit(X, y) assert_array_equal(eclf1.transform(X).shape, (4, 6)) assert_array_equal(eclf2.transform(X).shape, (4, 6)) assert_array_equal(eclf3.transform(X).shape, (3, 4, 2)) assert_array_almost_equal(eclf1.transform(X), eclf2.transform(X)) assert_array_almost_equal( eclf3.transform(X).swapaxes(0, 1).reshape((4, 6)), eclf2.transform(X) )
def test_set_estimator_none(drop): """VotingClassifier set_params should be able to set estimators as None or drop""" # Test predict clf1 = LogisticRegression(random_state=123) clf2 = RandomForestClassifier(n_estimators=10, random_state=123) clf3 = GaussianNB() eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('nb', clf3)], voting='hard', weights=[1, 0, 0.5]).fit(X, y) eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('nb', clf3)], voting='hard', weights=[1, 1, 0.5]) eclf2.set_params(rf=drop).fit(X, y) assert_array_equal(eclf1.predict(X), eclf2.predict(X)) assert dict(eclf2.estimators)["rf"] is drop assert len(eclf2.estimators_) == 2 assert all(isinstance(est, (LogisticRegression, GaussianNB)) for est in eclf2.estimators_) assert eclf2.get_params()["rf"] is drop eclf1.set_params(voting='soft').fit(X, y) eclf2.set_params(voting='soft').fit(X, y) assert_array_equal(eclf1.predict(X), eclf2.predict(X)) assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X)) msg = 'All estimators are None or "drop". At least one is required!' assert_raise_message( ValueError, msg, eclf2.set_params(lr=drop, rf=drop, nb=drop).fit, X, y) # Test soft voting transform X1 = np.array([[1], [2]]) y1 = np.array([1, 2]) eclf1 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)], voting='soft', weights=[0, 0.5], flatten_transform=False).fit(X1, y1) eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)], voting='soft', weights=[1, 0.5], flatten_transform=False) eclf2.set_params(rf=drop).fit(X1, y1) assert_array_almost_equal(eclf1.transform(X1), np.array([[[0.7, 0.3], [0.3, 0.7]], [[1., 0.], [0., 1.]]])) assert_array_almost_equal(eclf2.transform(X1), np.array([[[1., 0.], [0., 1.]]])) eclf1.set_params(voting='hard') eclf2.set_params(voting='hard') assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]])) assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
assert_array_equal(eclf1.transform(X).shape, (4, 6)) assert_array_equal(eclf2.transform(X).shape, (4, 6)) assert_array_equal(eclf3.transform(X).shape, (3, 4, 2)) assert_array_almost_equal(eclf1.transform(X), eclf2.transform(X)) assert_array_almost_equal( eclf3.transform(X).swapaxes(0, 1).reshape((4, 6)), eclf2.transform(X) ) @pytest.mark.parametrize( "X, y, voter", [(X, y, VotingClassifier( [('lr', LogisticRegression()), ('rf', RandomForestClassifier(n_estimators=5))])), (X_r, y_r, VotingRegressor( [('lr', LinearRegression()), ('rf', RandomForestRegressor(n_estimators=5))]))] ) @pytest.mark.parametrize("drop", [None, 'drop']) def test_none_estimator_with_weights(X, y, voter, drop): # check that an estimator can be set to None and passing some weight # regression test for # https://github.com/scikit-learn/scikit-learn/issues/13777 voter.fit(X, y, sample_weight=np.ones(y.shape)) voter.set_params(lr=drop) voter.fit(X, y, sample_weight=np.ones(y.shape)) y_pred = voter.predict(X) assert y_pred.shape == y.shape
def test_set_params(): """set_params should be able to set estimators""" clf1 = LogisticRegression(random_state=123, C=1.0) clf2 = RandomForestClassifier(random_state=123, max_depth=None) clf3 = GaussianNB() eclf1 = VotingClassifier([('lr', clf1), ('rf', clf2)], voting='soft', weights=[1, 2]) assert 'lr' in eclf1.named_estimators assert eclf1.named_estimators.lr is eclf1.estimators[0][1] assert eclf1.named_estimators.lr is eclf1.named_estimators['lr'] eclf1.fit(X, y) assert 'lr' in eclf1.named_estimators_ assert eclf1.named_estimators_.lr is eclf1.estimators_[0] assert eclf1.named_estimators_.lr is eclf1.named_estimators_['lr'] eclf2 = VotingClassifier([('lr', clf1), ('nb', clf3)], voting='soft', weights=[1, 2]) eclf2.set_params(nb=clf2).fit(X, y) assert not hasattr(eclf2, 'nb') assert_array_equal(eclf1.predict(X), eclf2.predict(X)) assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X)) assert eclf2.estimators[0][1].get_params() == clf1.get_params() assert eclf2.estimators[1][1].get_params() == clf2.get_params() eclf1.set_params(lr__C=10.0) eclf2.set_params(nb__max_depth=5) assert eclf1.estimators[0][1].get_params()['C'] == 10.0 assert eclf2.estimators[1][1].get_params()['max_depth'] == 5 assert (eclf1.get_params()["lr__C"] == eclf1.get_params()["lr"].get_params()['C'])
from mrex.tree import DecisionTreeClassifier from mrex.neighbors import KNeighborsClassifier from mrex.svm import SVC from mrex.ensemble import VotingClassifier # Loading some example data iris = datasets.load_iris() X = iris.data[:, [0, 2]] y = iris.target # Training classifiers clf1 = DecisionTreeClassifier(max_depth=4) clf2 = KNeighborsClassifier(n_neighbors=7) clf3 = SVC(gamma=.1, kernel='rbf', probability=True) eclf = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2), ('svc', clf3)], voting='soft', weights=[2, 1, 2]) clf1.fit(X, y) clf2.fit(X, y) clf3.fit(X, y) eclf.fit(X, y) # Plotting decision regions x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1)) f, axarr = plt.subplots(2, 2, sharex='col', sharey='row', figsize=(10, 8))
import numpy as np import matplotlib.pyplot as plt from mrex.linear_model import LogisticRegression from mrex.naive_bayes import GaussianNB from mrex.ensemble import RandomForestClassifier from mrex.ensemble import VotingClassifier clf1 = LogisticRegression(max_iter=1000, random_state=123) clf2 = RandomForestClassifier(n_estimators=100, random_state=123) clf3 = GaussianNB() X = np.array([[-1.0, -1.0], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]]) y = np.array([1, 1, 2, 2]) eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft', weights=[1, 1, 5]) # predict class probabilities for all classifiers probas = [c.fit(X, y).predict_proba(X) for c in (clf1, clf2, clf3, eclf)] # get class probabilities for the first sample in the dataset class1_1 = [pr[0, 0] for pr in probas] class2_1 = [pr[0, 1] for pr in probas] # plotting N = 4 # number of groups ind = np.arange(N) # group positions width = 0.35 # bar width