def test_transform():
    """Check transform method of VotingClassifier on toy dataset."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])
    eclf1 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
        voting='soft').fit(X, y)
    eclf2 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
        voting='soft',
        flatten_transform=True).fit(X, y)
    eclf3 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
        voting='soft',
        flatten_transform=False).fit(X, y)

    warn_msg = ("'flatten_transform' default value will be "
                "changed to True in 0.21. "
                "To silence this warning you may"
                " explicitly set flatten_transform=False.")
    res = assert_warns_message(DeprecationWarning, warn_msg,
                               eclf1.transform, X)
    assert_array_equal(res.shape, (3, 4, 2))
    assert_array_equal(eclf2.transform(X).shape, (4, 6))
    assert_array_equal(eclf3.transform(X).shape, (3, 4, 2))
    assert_array_almost_equal(res.swapaxes(0, 1).reshape((4, 6)),
                              eclf2.transform(X))
    assert_array_almost_equal(
        eclf3.transform(X).swapaxes(0, 1).reshape((4, 6)),
        eclf2.transform(X))
@pytest.mark.parametrize("drop", [None, 'drop'])
def test_set_estimator_none(drop):
    """VotingClassifier set_params should be able to set estimators
    as None or drop"""
    # Test predict
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(n_estimators=10, random_state=123)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard', weights=[1, 0, 0.5]).fit(X, y)

    eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard', weights=[1, 1, 0.5])
    with pytest.warns(None) as record:
        eclf2.set_params(rf=drop).fit(X, y)
    assert record if drop is None else not record
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))

    assert dict(eclf2.estimators)["rf"] is drop
    assert len(eclf2.estimators_) == 2
    assert all(isinstance(est, (LogisticRegression, GaussianNB))
               for est in eclf2.estimators_)
    assert eclf2.get_params()["rf"] is drop

    eclf1.set_params(voting='soft').fit(X, y)
    with pytest.warns(None) as record:
        eclf2.set_params(voting='soft').fit(X, y)
    assert record if drop is None else not record
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    msg = 'All estimators are dropped. At least one is required'
    with pytest.warns(None) as record:
        with pytest.raises(ValueError, match=msg):
            eclf2.set_params(lr=drop, rf=drop, nb=drop).fit(X, y)
    assert record if drop is None else not record

    # Test soft voting transform
    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    eclf1 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft', weights=[0, 0.5],
                             flatten_transform=False).fit(X1, y1)

    eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft', weights=[1, 0.5],
                             flatten_transform=False)
    with pytest.warns(None) as record:
        eclf2.set_params(rf=drop).fit(X1, y1)
    assert record if drop is None else not record
    assert_array_almost_equal(eclf1.transform(X1),
                              np.array([[[0.7, 0.3], [0.3, 0.7]],
                                        [[1., 0.], [0., 1.]]]))
    assert_array_almost_equal(eclf2.transform(X1),
                              np.array([[[1., 0.], [0., 1.]]]))
    eclf1.set_params(voting='hard')
    eclf2.set_params(voting='hard')
    assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
    assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
import copy

import numpy as np
from sklearn.base import ClassifierMixin
from sklearn.ensemble import (ExtraTreesClassifier, RandomForestClassifier,
                              VotingClassifier)
from sklearn.model_selection import KFold


class deepForestLayer(ClassifierMixin):
    """A really hacky and WIP implementation of the layers required for a
    Cascade Forest."""

    def __init__(self, n_nodes, nClasses=1, output=False):
        self.output = output
        self.nClasses = nClasses
        self.n_nodes = n_nodes
        nrfs = int(n_nodes / 2)
        nefs = n_nodes - nrfs
        self.estimators = []
        self.final_voters = []
        for i in range(nefs):
            self.estimators.append(
                ('ET' + str(i),
                 ExtraTreesClassifier(n_estimators=1000, min_samples_leaf=10,
                                      n_jobs=-1)))
        for i in range(nrfs):
            self.estimators.append(
                ('RF' + str(i),
                 RandomForestClassifier(n_estimators=1000,
                                        min_samples_leaf=10, n_jobs=-1)))
        # flatten_transform=False keeps the 3D transform output
        # (n_classifiers, n_samples, n_classes) that fit_Kfold relies on;
        # recent scikit-learn versions default to the flattened 2D output.
        self.voter = VotingClassifier(estimators=self.estimators,
                                      voting='soft', flatten_transform=False)

    def fit_Kfold(self, X_train, X_test, y_train, y_test):
        """Grow and validate the layer to determine the number of layers
        required. y_test is accepted for interface symmetry but unused."""
        fold = KFold(n_splits=3)  # 3-fold CV, matching the arrays below
        train_preds = np.empty((3, X_train.shape[0],
                                self.nClasses * self.n_nodes))
        train_preds[:] = np.nan
        est_preds = np.empty((3, X_test.shape[0],
                              self.nClasses * self.n_nodes))
        est_preds[:] = np.nan
        i = 0
        for train_idx, test_idx in fold.split(X_train):
            # Fit each of the estimators to our data.
            self.voter.fit(X_train[train_idx], y_train[train_idx])
            # voter.transform has shape
            # (n_estimators, n_samples, n_classes); reshape it to
            # (n_samples, n_classes * n_estimators).
            insample = self.voter.transform(
                X_train[train_idx]).swapaxes(0, 1).reshape(
                    (X_train[train_idx].shape[0], -1))
            outsample = self.voter.transform(X_test).swapaxes(0, 1).reshape(
                (X_test.shape[0], -1))
            # insample holds the training predictions, outsample the
            # validation predictions.
            train_preds[i, train_idx] = insample.copy()
            est_preds[i] = outsample.copy()
            i += 1
        # With 3-fold KFold, train_preds has two valid entries and one NaN
        # entry per data point (est_preds is filled on every fold); average
        # that axis to get one probability prediction per data point.
        return np.nanmean(train_preds, axis=0), np.nanmean(est_preds, axis=0)

    def fit(self, X, y):
        """Do a full fit once the number of layers has been decided."""
        fold = KFold(n_splits=3)
        # Create 3 models, each fitted on a fold of the data. This is the
        # only way I can think of to get the required output at the
        # prediction stage.
        for train_idx, test_idx in fold.split(X):
            clf = VotingClassifier(estimators=copy.deepcopy(self.estimators),
                                   voting='soft')
            clf.fit(X[train_idx], y[train_idx])
            self.final_voters.append(clf)

    def predict(self, X):
        """Average the class probabilities of the models fitted per fold.
        Note this returns probabilities, not class labels."""
        preds = np.zeros((len(self.final_voters), X.shape[0], self.nClasses))
        for i in range(len(self.final_voters)):
            preds[i] = self.final_voters[i].predict_proba(X)
        return np.mean(preds, axis=0)
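# A minimal usage sketch for deepForestLayer (not part of the original
# snippet), assuming a toy binary problem. The concatenation at the end
# mirrors how cascade forests feed each layer's class-probability vectors,
# alongside the raw features, into the next layer.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=300, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)

layer = deepForestLayer(n_nodes=4, nClasses=2)
train_probas, test_probas = layer.fit_Kfold(X_tr, X_te, y_tr, y_te)

# Augmented inputs for a hypothetical next layer.
X_tr_next = np.hstack([X_tr, train_probas])
X_te_next = np.hstack([X_te, test_probas])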
def test_set_estimator_none():
    """VotingClassifier set_params should be able to set estimators
    as None"""
    # Test predict
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard', weights=[1, 0, 0.5]).fit(X, y)

    eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard', weights=[1, 1, 0.5])
    eclf2.set_params(rf=None).fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))

    assert_true(dict(eclf2.estimators)["rf"] is None)
    assert_true(len(eclf2.estimators_) == 2)
    assert_true(all([not isinstance(est, RandomForestClassifier)
                     for est in eclf2.estimators_]))
    assert_true(eclf2.get_params()["rf"] is None)

    eclf1.set_params(voting='soft').fit(X, y)
    eclf2.set_params(voting='soft').fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    msg = ('All estimators are None. At least one is required'
           ' to be a classifier!')
    assert_raise_message(
        ValueError, msg,
        eclf2.set_params(lr=None, rf=None, nb=None).fit, X, y)

    # Test soft voting transform
    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    eclf1 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft', weights=[0, 0.5],
                             flatten_transform=False).fit(X1, y1)

    eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft', weights=[1, 0.5],
                             flatten_transform=False)
    eclf2.set_params(rf=None).fit(X1, y1)
    assert_array_almost_equal(eclf1.transform(X1),
                              np.array([[[0.7, 0.3], [0.3, 0.7]],
                                        [[1., 0.], [0., 1.]]]))
    assert_array_almost_equal(eclf2.transform(X1),
                              np.array([[[1., 0.], [0., 1.]]]))
    eclf1.set_params(voting='hard')
    eclf2.set_params(voting='hard')
    assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
    assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
# SKLModel is assumed to alias scikit-learn's estimator, e.g.
# from sklearn.ensemble import VotingClassifier as SKLModel
class VotingClassifierImpl:
    def __init__(self, estimators=None, voting='hard', weights=None,
                 n_jobs=None, flatten_transform=True):
        self._hyperparams = {
            'estimators': estimators,
            'voting': voting,
            'weights': weights,
            'n_jobs': n_jobs,
            'flatten_transform': flatten_transform}
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        return self._wrapped_model.decision_function(X)
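# A short usage sketch for the wrapper above (my addition), assuming SKLModel
# aliases sklearn's VotingClassifier; the wrapper simply delegates to it.
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

impl = VotingClassifierImpl(
    estimators=[('lr', LogisticRegression()), ('nb', GaussianNB())],
    voting='soft')
impl.fit([[0, 0], [1, 1], [2, 2], [3, 3]], [0, 0, 1, 1])
print(impl.predict([[1.5, 1.5]]))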
def test_notfitted():
    eclf = VotingClassifier(estimators=[('lr1', LogisticRegression()),
                                        ('lr2', LogisticRegression())],
                            voting='soft')
    ereg = VotingRegressor([('dr', DummyRegressor())])
    msg = ("This %s instance is not fitted yet. Call \'fit\'"
           " with appropriate arguments before using this estimator.")
    with pytest.raises(NotFittedError, match=msg % 'VotingClassifier'):
        eclf.predict(X)
    with pytest.raises(NotFittedError, match=msg % 'VotingClassifier'):
        eclf.predict_proba(X)
    with pytest.raises(NotFittedError, match=msg % 'VotingClassifier'):
        eclf.transform(X)
    with pytest.raises(NotFittedError, match=msg % 'VotingRegressor'):
        ereg.predict(X_r)
    with pytest.raises(NotFittedError, match=msg % 'VotingRegressor'):
        ereg.transform(X_r)
def test_transform():
    """Check transform method of VotingClassifier on toy dataset."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])
    eclf1 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
        voting='soft').fit(X, y)
    eclf2 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
        voting='soft',
        flatten_transform=True).fit(X, y)
    eclf3 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
        voting='soft',
        flatten_transform=False).fit(X, y)

    assert_array_equal(eclf1.transform(X).shape, (4, 6))
    assert_array_equal(eclf2.transform(X).shape, (4, 6))
    assert_array_equal(eclf3.transform(X).shape, (3, 4, 2))
    assert_array_almost_equal(eclf1.transform(X), eclf2.transform(X))
    assert_array_almost_equal(
        eclf3.transform(X).swapaxes(0, 1).reshape((4, 6)),
        eclf2.transform(X))
from sklearn import model_selection
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import confusion_matrix, precision_recall_curve


def voting(X_train, y_train, estimators, X_test, y_test):
    # Trains with all models.
    seed = 7
    # NOTE: this fold object is built but never used below; the ensemble is
    # fitted on the full training set.
    kfold = model_selection.KFold(n_splits=10, shuffle=True,
                                  random_state=seed)
    ensemble = VotingClassifier(estimators).fit(X_train, y_train)
    results = ensemble.score(X_test, y_test)
    # With the default voting='hard', transform returns per-estimator label
    # predictions of shape (n_samples, n_estimators). precision_recall_curve
    # needs a single 1D score per sample, so the first estimator's column is
    # used here.
    y_df = ensemble.transform(X_test)
    y_pred = ensemble.predict(X_test)
    precisions, recall, t = precision_recall_curve(y_test, y_df[:, 0],
                                                   pos_label=1)
    print(precisions[:10], recall[:10], t[:10])
    precision = precisions[0]
    confmat = confusion_matrix(y_test, y_pred)
    return results, precision, confmat
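# A hypothetical call of voting() above on synthetic data; the estimator
# names and dataset sizes are illustrative assumptions, not from the source.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

X_all, y_all = make_classification(n_samples=400, random_state=7)
X_tr, X_te, y_tr, y_te = train_test_split(X_all, y_all, random_state=7)
ests = [('lr', LogisticRegression()), ('nb', GaussianNB())]
results, precision, confmat = voting(X_tr, y_tr, ests, X_te, y_te)
print(results, precision, confmat, sep='\n')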
import numpy
from sklearn import preprocessing
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


def committee_classify(features_train, labels_train, features_test,
                       classifier="d_tree", n_classifiers=5):
    """
    Using an ensemble committee, classifies and returns calculated entropies
    for given test features.

    :param numpy.array features_train: array with features to train the classifiers
    :param list of int labels_train: list of labels to train the classifiers
    :param numpy.array features_test: array with features to be classified
    :param str classifier: the classifier type to be used in the committee.
        Valid values: "d_tree", "nb", "svm"
    :param int n_classifiers: number of classifiers to be created in the committee
    :return: a tuple with predictions and calculated entropies
    :rtype: (list of int, numpy.array)
    """
    def d_tree(randomness):
        return DecisionTreeClassifier(criterion="entropy", splitter="random",
                                      random_state=randomness)

    def nb(randomness):
        return MultinomialNB(alpha=randomness * 0.5)

    def svm(randomness):
        kernels = [("linear", 0), ("poly", 2), ("poly", 3), ("rbf", 0),
                   ("sigmoid", 0)]
        return SVC(kernel=kernels[randomness % len(kernels)][0],
                   degree=kernels[randomness % len(kernels)][1])

    classifier_calls = {"d_tree": d_tree, "nb": nb, "svm": svm}

    # Necessary to scale features for svm
    if classifier == "svm":
        scaler = preprocessing.StandardScaler().fit(features_train)
        features_train = scaler.transform(features_train)
        features_test = scaler.transform(features_test)

    # Initialize classifiers for the ensemble
    estimators = []
    for i in range(n_classifiers):
        estimators.append((classifier + str(i),
                           classifier_calls[classifier](i)))

    # Fit and predict
    eclf = VotingClassifier(estimators=estimators, voting="hard", n_jobs=-1)
    eclf.fit(features_train, labels_train)
    prediction = eclf.predict(features_test)

    # Calculate entropies. The individual votes array needs a transpose so
    # that classifiers lie on axis 0.
    individual_votes = numpy.transpose(eclf.transform(features_test))
    entropies = calculate_entropies(individual_votes)

    return prediction, entropies
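# calculate_entropies is not defined in the snippet above. A minimal sketch
# of what such a helper could look like, computing the Shannon entropy of the
# committee's vote distribution for each sample (an assumption, not the
# original implementation):
import numpy
from scipy.stats import entropy


def calculate_entropies(individual_votes):
    """individual_votes: array of shape (n_classifiers, n_samples) holding
    each committee member's predicted label. Returns one entropy per sample;
    0 means a unanimous committee, larger values mean disagreement."""
    n_classifiers, n_samples = individual_votes.shape
    entropies = numpy.empty(n_samples)
    for i in range(n_samples):
        _, counts = numpy.unique(individual_votes[:, i], return_counts=True)
        entropies[i] = entropy(counts / n_classifiers)
    return entropies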
# The (kwargs, expected_names) pairs come from a @pytest.mark.parametrize
# decorator that is not part of this excerpt.
def test_get_features_names_out_classifier(kwargs, expected_names):
    """Check get_feature_names_out for classifier for different settings."""
    X = [[1, 2], [3, 4], [5, 6], [1, 1.2]]
    y = [0, 1, 2, 0]
    voting = VotingClassifier(
        estimators=[
            ("lr", LogisticRegression(random_state=0)),
            ("tree", DecisionTreeClassifier(random_state=0)),
        ],
        **kwargs,
    )
    voting.fit(X, y)
    X_trans = voting.transform(X)
    names_out = voting.get_feature_names_out()

    assert X_trans.shape[1] == len(expected_names)
    assert_array_equal(names_out, expected_names)
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

clf1 = LogisticRegression(multi_class='multinomial', random_state=1)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
y = np.array([1, 1, 1, 2, 2, 2])

eclf1 = VotingClassifier(estimators=[
    ('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')
eclf1 = eclf1.fit(X, y)
print(eclf1.predict(X))

np.array_equal(eclf1.named_estimators_.lr.predict(X),
               eclf1.named_estimators_['lr'].predict(X))

eclf2 = VotingClassifier(estimators=[
    ('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft')
eclf2 = eclf2.fit(X, y)
print(eclf2.predict(X))

eclf3 = VotingClassifier(estimators=[
    ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
    voting='soft', weights=[2, 1, 1], flatten_transform=True)
eclf3 = eclf3.fit(X, y)
print(eclf3.predict(X))
print(eclf3.transform(X).shape)
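# With voting='soft' and flatten_transform=True, transform returns the
# stacked class probabilities with shape (n_samples, n_classifiers *
# n_classes), here (6, 6). A quick sketch (my addition) recovering the
# per-classifier blocks from the flat output:
probas = eclf3.transform(X)                            # (6, 6)
per_clf = probas.reshape(len(X), 3, 2).swapaxes(0, 1)  # (3, 6, 2)
print(per_clf.shape)  # one (n_samples, n_classes) block per classifier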
from scipy.optimize import minimize
from sklearn.base import (BaseEstimator, ClassifierMixin, TransformerMixin,
                          clone)
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split


class VotingWeightSearchCV(BaseEstimator, ClassifierMixin, TransformerMixin):
    """
    Soft voting classifier that chooses weights based on a held-out test
    dataset.
    """
    def __init__(self, estimators, test_size=0.33, starting_weights=None,
                 verbose=0, random_state=None, refit=False):
        self.test_size = test_size
        self.estimators = estimators
        self.verbose = verbose
        self.random_state = random_state
        self.refit = refit

        if starting_weights is not None:
            self.starting_weights = starting_weights
        else:
            self.starting_weights = [0.5] * len(estimators)

        self.best_estimator_ = None
        self.weights_ = None
        self.peak_score_ = None

    def _log(self, msg, verbosity=0):
        if self.verbose >= verbosity:
            print("{pre} {ind}{msg}".format(pre="(SW)",
                                            ind="".join([" "] * verbosity),
                                            msg=msg))

    def fit(self, X, y):
        """Train and find the optimum weights.

        https://www.kaggle.com/hsperr/otto-group-product-classification-challenge/finding-ensamble-weights/code
        https://www.kaggle.com/sushanttripathy/otto-group-product-classification-challenge/wrapper-for-models-ensemble/code
        """
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=self.test_size,
            random_state=self.random_state,
            stratify=y)
        fitted_estimators = []
        predictions = []

        def log_loss_func(weights):
            final_prediction = 0
            for weight, prediction in zip(weights, predictions):
                final_prediction += weight * prediction
            return log_loss(y_test, final_prediction)

        # Fit on train set
        self._log("Fitting on train subset...")
        for label, clf in self.estimators:
            self._log("fitting {0}...".format(label), 1)
            fitted_clf = clone(clf).fit(X_train, y_train)
            fitted_estimators.append((label, fitted_clf))

        # Predict on test set
        self._log("Predict on test subset...")
        for label, clf in fitted_estimators:
            self._log("predict using {0}...".format(label), 1)
            predictions.append(clf.predict_proba(X_test))

        # Search weights: constrain them to sum to 1 and lie in [0, 1]
        self._log("Searching weights...")
        cons = ({"type": "eq", "fun": lambda w: 1 - sum(w)})
        bounds = [(0, 1)] * len(predictions)
        res = minimize(log_loss_func, self.starting_weights,
                       method="SLSQP", bounds=bounds, constraints=cons)
        self.weights_ = list(res["x"])
        self.peak_score_ = res["fun"]
        self._log("Best weights: {0}".format(self.weights_), 1)
        self._log("Peak score: {0}".format(self.peak_score_), 1)

        # Build voting classifier
        self.best_estimator_ = VotingClassifier(estimators=self.estimators,
                                                voting="soft",
                                                weights=self.weights_)
        if self.refit:
            self._log("Refitting using best weights...")
            self.best_estimator_.fit(X, y)

        return self

    def predict(self, X):
        return self.best_estimator_.predict(X)

    def predict_proba(self, X):
        return self.best_estimator_.predict_proba(X)

    def transform(self, X):
        return self.best_estimator_.transform(X)
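# A hypothetical end-to-end run of VotingWeightSearchCV on synthetic data
# (my addition); the estimator names and sizes are illustrative assumptions.
# refit=True is needed so best_estimator_ is fitted before predict is called.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

X_demo, y_demo = make_classification(n_samples=200, random_state=0)
search = VotingWeightSearchCV(
    estimators=[("lr", LogisticRegression()), ("nb", GaussianNB())],
    random_state=0,
    refit=True)
search.fit(X_demo, y_demo)
print(search.weights_, search.peak_score_)
print(search.predict(X_demo[:5]))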
def test_set_estimator_drop():
    # VotingClassifier set_params should be able to set estimators as drop
    # Test predict
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(n_estimators=10, random_state=123)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("nb", clf3)],
        voting="hard",
        weights=[1, 0, 0.5],
    ).fit(X, y)

    eclf2 = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("nb", clf3)],
        voting="hard",
        weights=[1, 1, 0.5],
    )
    eclf2.set_params(rf="drop").fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))

    assert dict(eclf2.estimators)["rf"] == "drop"
    assert len(eclf2.estimators_) == 2
    assert all(
        isinstance(est, (LogisticRegression, GaussianNB))
        for est in eclf2.estimators_)
    assert eclf2.get_params()["rf"] == "drop"

    eclf1.set_params(voting="soft").fit(X, y)
    eclf2.set_params(voting="soft").fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))

    msg = "All estimators are dropped. At least one is required"
    with pytest.raises(ValueError, match=msg):
        eclf2.set_params(lr="drop", rf="drop", nb="drop").fit(X, y)

    # Test soft voting transform
    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    eclf1 = VotingClassifier(
        estimators=[("rf", clf2), ("nb", clf3)],
        voting="soft",
        weights=[0, 0.5],
        flatten_transform=False,
    ).fit(X1, y1)

    eclf2 = VotingClassifier(
        estimators=[("rf", clf2), ("nb", clf3)],
        voting="soft",
        weights=[1, 0.5],
        flatten_transform=False,
    )
    eclf2.set_params(rf="drop").fit(X1, y1)
    assert_array_almost_equal(
        eclf1.transform(X1),
        np.array([[[0.7, 0.3], [0.3, 0.7]], [[1.0, 0.0], [0.0, 1.0]]]),
    )
    assert_array_almost_equal(eclf2.transform(X1),
                              np.array([[[1.0, 0.0], [0.0, 1.0]]]))

    eclf1.set_params(voting="hard")
    eclf2.set_params(voting="hard")
    assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
    assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
import logging

import catboost
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.ensemble import VotingClassifier

# `factorize` and `today` are project-local helpers not shown here.


def vote(debug=False):
    """
    Local FPTP score is around 0.60; the online score is expected to be
    lower.
    ===
    :param debug:
    :return:
    """
    train_path = 'data/atec_anti_fraud_train.csv'
    test_path = 'data/atec_anti_fraud_test_a.csv'
    if debug:
        nrows = 100000
    else:
        nrows = 10000 * 10000
    logging.info('begin main')
    train_df = pd.read_csv(train_path, nrows=nrows)
    train_df = train_df[train_df['label'] != -1]
    test_df = pd.read_csv(test_path, nrows=nrows)
    test_df['label'] = -2
    df = factorize(train_df, test_df)

    # Special handling for the rate features.
    df['frate_1'] = df['f83'] / (df['f84'] + 1)
    df['frate_2'] = df['f85'] / (df['f84'] + 1)
    df['frate_3'] = df['f86'] / (df['f84'] + 1)
    df['frate_82_84'] = df['f82'] / (df['f84'] + 1)
    df['frate_4'] = df['f82'] / (df['f85'] + 1)
    df['frate_5'] = df['f82'] / (df['f86'] + 1)
    df['frate_6'] = df['f85'] / (df['f86'] + 1)

    train_df = df[df['label'] != -2]
    test_df = df[df['label'] == -2]
    logging.info('traindf shape = {}'.format(train_df.shape))
    logging.info('testdf shape = {}'.format(test_df.shape))

    y = train_df.pop('label')
    train_df.pop('id')
    train_df.pop('date')

    test_id = test_df.pop('id')
    test_df.pop('date')
    test_df.pop('label')

    X = train_df
    logging.info("will train-test split")
    logging.info("fitting..")
    cls1 = lgb.LGBMClassifier(objective='binary', n_estimators=100,
                              subsample=0.8, subsample_freq=1,
                              colsample_bytree=0.8, num_leaves=31,
                              learning_rate=0.05, silent=False)
    rf1 = lgb.LGBMClassifier(boosting_type='rf', objective='binary',
                             n_estimators=200, subsample=0.8,
                             subsample_freq=1, colsample_bytree=0.8,
                             num_leaves=31, learning_rate=0.05, silent=False)
    rf2 = lgb.LGBMClassifier(boosting_type='rf', objective='binary',
                             n_estimators=400, subsample=0.8,
                             subsample_freq=1, colsample_bytree=0.8,
                             num_leaves=31, learning_rate=0.05, silent=False)
    cb = catboost.CatBoostClassifier(iterations=100, learning_rate=0.05,
                                     depth=6, loss_function='Logloss')
    vc = VotingClassifier(estimators=[('cb', cb), ('gbdt', cls1),
                                      ('rf200', rf1), ('rf400', rf2)],
                          voting='soft', flatten_transform=False)
    vc.fit(X, y)

    # With flatten_transform=False, the transform output has shape
    # (n_classifiers, n_samples, n_classes).
    vc_score = vc.transform(test_df)  # type: np.ndarray
    test_labels = vc.predict(test_df)
    n_classifier, n_sample, n_classes = vc_score.shape
    logging.info('orig shape = {}, test_labels = {}, test_mean = {}'.format(
        vc_score.shape, test_labels.shape, np.mean(test_labels)))
    vc_score = vc_score.swapaxes(0, 1)
    logging.info('swapaxes shape = {}'.format(vc_score.shape))

    # For each sample, average the positive-class probability over the
    # classifiers that agree with the ensemble's predicted label.
    score = []
    for i in range(n_sample):
        buf = []
        for j in range(n_classifier):
            k = np.argmax(vc_score[i][j], 0)
            if k == test_labels[i]:
                buf.append(vc_score[i][j][1])
        s = np.mean(buf, axis=0)
        score.append(s)

    day = today()
    pd.DataFrame({
        'id': test_id,
        'score': score
    }).to_csv('{}.{}.submit.csv'.format(day, 'vc'), index=False,
              float_format='%.6f')
    logging.info('done')
import logging

import joblib
import numpy
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# `config` and `load_classifier` are project-local modules not shown here.


class LabelClassifier:
    """Class implements various label classifiers."""

    def __init__(self, categoryToClassify: list, pretrained=None):
        """Constructor for the label classifier.

        Args:
            categoryToClassify (list): data to save
            pretrained ([type], optional): pretrained classifier. Defaults to None.
        """
        if not categoryToClassify:
            raise ValueError("no categories to classify have been provided")
        self.category: list = categoryToClassify
        self.estimators = [
            ('MultinomialNB', MultinomialNB()),
            ('SGDClassifier', SGDClassifier(loss='modified_huber',
                                            penalty='l2', alpha=1e-3,
                                            random_state=100, max_iter=200)),
            ('sigmoidSVM', SVC(kernel='sigmoid', gamma=1.0)),
            ('RandomForest', RandomForestClassifier(200, bootstrap=False)),
            ('LogisticRegression', LogisticRegression(solver='sag',
                                                      random_state=100))]
        self.trainedEstimator = pretrained
        self.fileLocation: str = self.generateFilename()
        self.stackingEstimator = None
        self.rbfKernel = None

    def trainingClassifier(self, X_train: numpy.ndarray,
                           y_train: numpy.ndarray):
        """Trains the voting classifier.

        Args:
            X_train (numpy.ndarray): training documents
            y_train (numpy.ndarray): labels for training documents
        """
        if not X_train.size:
            raise ValueError("No X_train data was provided")
        if not y_train.size:
            raise ValueError("No y_train data was provided")
        logging.info("> training classifier")
        voting = None
        if config.getValueFromConfig("classifier loadClassifier") == True:
            try:
                self.trainedEstimator = joblib.load(self.fileLocation)
                voting = load_classifier.getVotingClassifier()
            except Exception:
                raise RuntimeError("load voting classifier failed")
        else:
            self.trainedEstimator = VotingClassifier(self.estimators,
                                                     voting='hard')
            # fit_transform returns the per-estimator predictions, which are
            # used below to train the stacking kernel.
            voting = self.trainedEstimator.fit_transform(X_train, y_train)
            if config.getValueFromConfig("classifier saveClassifier") == True:
                joblib.dump(self.trainedEstimator, self.fileLocation,
                            compress=9)
                joblib.dump(
                    voting,
                    '../classifier/trained_classifiers/voting_classifier',
                    compress=9)
                logging.info("> dumped Classifier: {}".format(
                    self.fileLocation))
        self.trainKernelApproxSvgOnVoting(voting, y_train)

    def predict(self, X_test: numpy.ndarray) -> numpy.ndarray:
        """Labels the given data.

        Args:
            X_test (numpy.ndarray): X_test data

        Returns:
            numpy.ndarray: trained estimator's prediction
        """
        if not X_test.size:
            raise ValueError("No test documents were provided")
        logging.info("> predicting")
        prediction = self.trainedEstimator.predict(X_test)
        assert prediction.size, "No documents were predicted"
        return prediction

    def generateFilename(self) -> str:
        """Generates the filename for the classifier.

        Returns:
            str: filename as string
        """
        folder = config.getValueFromConfig("classifier path saveFolder")
        if folder is None:
            raise ValueError("No folder name was provided")
        if len(self.category) < 2 or len(self.category) > 3:
            raise ValueError("Too few or too many categories")
        if len(self.category) == 3:
            return "{}ensembleClassifier_{}-{}-{}.joblib.pkl".format(
                folder, self.category[0], self.category[1], self.category[2])
        return "{}ensembleClassifier_{}-{}.joblib.pkl".format(
            folder, self.category[0], self.category[1])

    def accuracy(self, X_test: numpy.ndarray, y_test: numpy.ndarray,
                 predicted: numpy.ndarray):
        """Plots the accuracy of the trained classifier.

        Args:
            X_test (numpy.ndarray): the test documents
            y_test (numpy.ndarray): the results for the test documents
            predicted (numpy.ndarray): the predicted test values

        Raises:
            AssertionError: thrown if the classifier has not been trained yet
        """
        if not X_test.size:
            raise ValueError("X_test was empty")
        if not y_test.size:
            raise ValueError("y_test was empty")
        if not predicted.size:
            raise ValueError("predicted was empty")
        if self.trainedEstimator is None:
            raise AssertionError("Classifier has not been trained yet")
        logging.info("\n ->> ensemble-score:{}\n".format(
            numpy.mean(predicted == y_test)))
        plot_confusion_matrix(self.trainedEstimator, X_test, y_test,
                              normalize="all",
                              display_labels=[self.category[0],
                                              self.category[1]])
        plt.show()

    def trainKernelApproxSvgOnVoting(self, X_predicted: numpy.ndarray,
                                     y: numpy.ndarray):
        """Trains the stacking kernel for the classifier.

        Args:
            X_predicted (numpy.ndarray): the predictions of the other classifiers
            y (numpy.ndarray): the real labels
        """
        if not X_predicted.size:
            raise ValueError("No X_predicted data was provided")
        if not y.size:
            raise ValueError("No y data was provided")
        logging.info("training stacking classifier")
        self.rbfKernel = RBFSampler(gamma=1, random_state=1)
        X_features = self.rbfKernel.fit_transform(X_predicted)
        self.stackingEstimator = SGDClassifier(
            max_iter=config.getValueFromConfig("SGDClassifierIterations"))
        self.stackingEstimator.fit(X_features, y)
        logging.info("stacking-classifier: " +
                     str(self.stackingEstimator.score(X_features, y)))

    def stackingPrediction(self, X_test: numpy.ndarray) -> numpy.ndarray:
        """Predicts the result using another classifier, so-called "stacking".

        Args:
            X_test (numpy.ndarray): the vectorized documents to test on

        Returns:
            numpy.ndarray: the prediction for the labels using stacking
        """
        if not X_test.size:
            raise ValueError("No X_test data was provided")
        voting = self.trainedEstimator.transform(X_test)
        influencedVoting = self.rbfKernel.transform(voting)
        prediction = self.stackingEstimator.predict(influencedVoting)
        assert prediction.size
        return prediction
def test_set_estimator_drop():
    # VotingClassifier set_params should be able to set estimators as drop
    # Test predict
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(n_estimators=10, random_state=123)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard', weights=[1, 0, 0.5]).fit(X, y)

    eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard', weights=[1, 1, 0.5])
    with pytest.warns(None) as record:
        with warnings.catch_warnings():
            # scipy 1.3.0 uses tostring which is deprecated in numpy
            warnings.filterwarnings("ignore", "tostring", DeprecationWarning)
            eclf2.set_params(rf='drop').fit(X, y)
    assert not record
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))

    assert dict(eclf2.estimators)["rf"] == 'drop'
    assert len(eclf2.estimators_) == 2
    assert all(isinstance(est, (LogisticRegression, GaussianNB))
               for est in eclf2.estimators_)
    assert eclf2.get_params()["rf"] == 'drop'

    eclf1.set_params(voting='soft').fit(X, y)
    with pytest.warns(None) as record:
        with warnings.catch_warnings():
            # scipy 1.3.0 uses tostring which is deprecated in numpy
            warnings.filterwarnings("ignore", "tostring", DeprecationWarning)
            eclf2.set_params(voting='soft').fit(X, y)
    assert not record
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    msg = 'All estimators are dropped. At least one is required'
    with pytest.warns(None) as record:
        with pytest.raises(ValueError, match=msg):
            eclf2.set_params(lr='drop', rf='drop', nb='drop').fit(X, y)
    assert not record

    # Test soft voting transform
    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    eclf1 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft', weights=[0, 0.5],
                             flatten_transform=False).fit(X1, y1)

    eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft', weights=[1, 0.5],
                             flatten_transform=False)
    with pytest.warns(None) as record:
        with warnings.catch_warnings():
            # scipy 1.3.0 uses tostring which is deprecated in numpy
            warnings.filterwarnings("ignore", "tostring", DeprecationWarning)
            eclf2.set_params(rf='drop').fit(X1, y1)
    assert not record
    assert_array_almost_equal(eclf1.transform(X1),
                              np.array([[[0.7, 0.3], [0.3, 0.7]],
                                        [[1., 0.], [0., 1.]]]))
    assert_array_almost_equal(eclf2.transform(X1),
                              np.array([[[1., 0.], [0., 1.]]]))
    eclf1.set_params(voting='hard')
    eclf2.set_params(voting='hard')
    assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
    assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
# Performance evaluation
for clf in (clf1, clf2, clf3, eclf2):
    clf.fit(X, y)
    y_pred = clf.predict(X)
    print(clf.__class__.__name__, accuracy_score(y, y_pred))
print()

# predicted result
print('eclf2.predict(X) = \n{0}\n'.format(eclf2.predict(X)))

print('---< fitting: voting="soft", weight=[2, 1, 1], '
      'flatten_transform=True >---')
eclf3 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                     ('gnb', clf3)],
                         voting='soft', weights=[2, 1, 1],
                         flatten_transform=True)

# Performance evaluation
for clf in (clf1, clf2, clf3, eclf3):
    clf.fit(X, y)
    y_pred = clf.predict(X)
    print(clf.__class__.__name__, accuracy_score(y, y_pred))
print()

# predicted result
print('eclf3.predict(X) = \n{0}\n'.format(eclf3.predict(X)))
print('eclf3.transform(X).shape = {0}\n'.format(eclf3.transform(X).shape))