def test_set_estimator_none(drop):
    """VotingClassifier.set_params should accept ``drop`` to disable estimators.

    ``drop`` is supplied by the caller (presumably a pytest parametrization
    defined outside this block -- TODO confirm).  When it is ``None`` every
    ``fit`` below is expected to record a warning; for any other value no
    warning may be recorded.
    """

    def check_warnings(caught):
        # The same rule applies after every fit: warnings only for ``None``.
        if drop is None:
            assert caught
        else:
            assert not caught

    # Test predict
    logreg = LogisticRegression(random_state=123)
    forest = RandomForestClassifier(n_estimators=10, random_state=123)
    bayes = GaussianNB()
    all_three = [('lr', logreg), ('rf', forest), ('nb', bayes)]

    # Reference ensemble: the forest stays in place but carries a zero weight.
    reference = VotingClassifier(estimators=list(all_three),
                                 voting='hard',
                                 weights=[1, 0, 0.5]).fit(X, y)

    # Candidate ensemble: the forest is replaced by ``drop`` before fitting.
    candidate = VotingClassifier(estimators=list(all_three),
                                 voting='hard',
                                 weights=[1, 1, 0.5])
    with pytest.warns(None) as record:
        candidate.set_params(rf=drop).fit(X, y)
    check_warnings(record)
    assert_array_equal(reference.predict(X), candidate.predict(X))

    assert dict(candidate.estimators)["rf"] is drop
    assert len(candidate.estimators_) == 2
    assert all(isinstance(fitted, (LogisticRegression, GaussianNB))
               for fitted in candidate.estimators_)
    assert candidate.get_params()["rf"] is drop

    # Same comparison with soft voting.
    reference.set_params(voting='soft').fit(X, y)
    with pytest.warns(None) as record:
        candidate.set_params(voting='soft').fit(X, y)
    check_warnings(record)
    assert_array_equal(reference.predict(X), candidate.predict(X))
    assert_array_almost_equal(reference.predict_proba(X),
                              candidate.predict_proba(X))

    # Disabling every estimator must be rejected at fit time.
    msg = 'All estimators are dropped. At least one is required'
    with pytest.warns(None) as record:
        with pytest.raises(ValueError, match=msg):
            candidate.set_params(lr=drop, rf=drop, nb=drop).fit(X, y)
    check_warnings(record)

    # Test soft voting transform
    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    pair = [('rf', forest), ('nb', bayes)]
    reference = VotingClassifier(estimators=list(pair),
                                 voting='soft',
                                 weights=[0, 0.5],
                                 flatten_transform=False).fit(X1, y1)
    candidate = VotingClassifier(estimators=list(pair),
                                 voting='soft',
                                 weights=[1, 0.5],
                                 flatten_transform=False)
    with pytest.warns(None) as record:
        candidate.set_params(rf=drop).fit(X1, y1)
    check_warnings(record)

    expected_both = np.array([[[0.7, 0.3], [0.3, 0.7]],
                              [[1., 0.], [0., 1.]]])
    expected_single = np.array([[[1., 0.], [0., 1.]]])
    assert_array_almost_equal(reference.transform(X1), expected_both)
    assert_array_almost_equal(candidate.transform(X1), expected_single)

    # Switching to hard voting changes what transform returns.
    reference.set_params(voting='hard')
    candidate.set_params(voting='hard')
    assert_array_equal(reference.transform(X1), np.array([[0, 0], [1, 1]]))
    assert_array_equal(candidate.transform(X1), np.array([[0], [1]]))
def ensembleModel(self, list_of_models, train, cross_validation):
    """Fit a soft-voting ensemble, save it and print evaluation figures.

    Parameters
    ----------
    list_of_models : list
        ``(name, estimator)`` pairs handed to ``VotingClassifier``.  The
        weights are fixed to ``[1, 5]`` below, so two models are expected.
    train, cross_validation : DataFrame-like
        Tables indexed by column; each must contain ``self.target`` and is
        stripped of ``self.ID`` and ``self.target`` to build the features.

    Returns
    -------
    The fitted ``VotingClassifier``.
    """
    logging.info(
        "preparing Target Variable for train and cross validation")
    labels_train = train[self.target]
    labels_cv = cross_validation[self.target]

    logging.info("preparing train and CV data")
    feature_cols_train = train.columns.difference([self.ID, self.target])
    features_train = train[feature_cols_train]
    feature_cols_cv = cross_validation.columns.difference(
        [self.ID, self.target])
    features_cv = cross_validation[feature_cols_cv]

    voter = VotingClassifier(estimators=list_of_models,
                             voting='soft',
                             weights=[1, 5])
    voter.fit(features_train, labels_train)
    self.saveModel(voter, "rfc_and_xgb_model")

    # Accuracy on the hold-out split, then AUC on both splits using the
    # probability of the second class column.
    cv_accuracy = voter.score(features_cv, labels_cv)
    print("accurary on cross_validation set", cv_accuracy)

    train_auc = roc_auc_score(labels_train,
                              voter.predict_proba(features_train)[:, 1])
    print('Overall RFC AUC on whole train set:', train_auc)

    cv_auc = roc_auc_score(labels_cv,
                           voter.predict_proba(features_cv)[:, 1])
    print('Overall RFC AUC on whole cross_validation set:', cv_auc)
    return voter
def test_set_params():
    """Estimators and their nested parameters can be changed via set_params."""
    logreg = LogisticRegression(random_state=123, C=1.0)
    forest = RandomForestClassifier(random_state=123, max_depth=None)
    bayes = GaussianNB()

    first = VotingClassifier([('lr', logreg), ('rf', forest)],
                             voting='soft',
                             weights=[1, 2])
    # Before fitting, 'lr' is reachable both as a key and as an attribute.
    assert_true('lr' in first.named_estimators)
    assert_true(first.named_estimators.lr is first.estimators[0][1])
    assert_true(first.named_estimators.lr is first.named_estimators['lr'])

    first.fit(X, y)
    # After fitting, the same holds for the fitted-estimator view.
    assert_true('lr' in first.named_estimators_)
    assert_true(first.named_estimators_.lr is first.estimators_[0])
    assert_true(first.named_estimators_.lr is first.named_estimators_['lr'])

    # Swap the 'nb' slot for the forest and check both ensembles agree.
    second = VotingClassifier([('lr', logreg), ('nb', bayes)],
                              voting='soft',
                              weights=[1, 2])
    second.set_params(nb=forest).fit(X, y)
    assert_false(hasattr(second, 'nb'))

    assert_array_equal(first.predict(X), second.predict(X))
    assert_array_almost_equal(first.predict_proba(X),
                              second.predict_proba(X))
    for slot, expected in enumerate((logreg, forest)):
        assert_equal(second.estimators[slot][1].get_params(),
                     expected.get_params())

    # Nested parameters use the ``<name>__<param>`` spelling.
    first.set_params(lr__C=10.0)
    second.set_params(nb__max_depth=5)

    assert_true(first.estimators[0][1].get_params()['C'] == 10.0)
    assert_true(second.estimators[1][1].get_params()['max_depth'] == 5)
    assert_equal(first.get_params()["lr__C"],
                 first.get_params()["lr"].get_params()['C'])
def test_sample_weight():
    """Tests sample_weight parameter of VotingClassifier"""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = SVC(probability=True, random_state=123)

    # Unit sample weights must give the same model as no weights at all.
    eclf1 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('svc', clf3)],
        voting='soft').fit(X, y, sample_weight=np.ones((len(y),)))
    eclf2 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('svc', clf3)],
        voting='soft').fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    # Probabilities are floats: compare with a tolerance rather than
    # bit-for-bit, as the other sample-weight tests in this file do.
    assert_array_almost_equal(eclf1.predict_proba(X),
                              eclf2.predict_proba(X))

    # A one-estimator ensemble must match that estimator fitted directly
    # with the same (non-trivial) weights.
    sample_weight = np.random.RandomState(123).uniform(size=(len(y),))
    eclf3 = VotingClassifier(estimators=[('lr', clf1)], voting='soft')
    eclf3.fit(X, y, sample_weight)
    clf1.fit(X, y, sample_weight)
    assert_array_equal(eclf3.predict(X), clf1.predict(X))
    assert_array_almost_equal(eclf3.predict_proba(X),
                              clf1.predict_proba(X))

    # An estimator that cannot take sample weights must be reported by name.
    clf4 = KNeighborsClassifier()
    eclf3 = VotingClassifier(estimators=[
        ('lr', clf1), ('svc', clf3), ('knn', clf4)],
        voting='soft')
    msg = ('Underlying estimator \'knn\' does not support sample weights.')
    assert_raise_message(ValueError, msg, eclf3.fit, X, y, sample_weight)
def test_sample_weight():
    """Check how VotingClassifier.fit handles the sample_weight argument."""
    logreg = LogisticRegression(random_state=123)
    forest = RandomForestClassifier(random_state=123)
    svc = SVC(gamma='scale', probability=True, random_state=123)
    trio = [('lr', logreg), ('rf', forest), ('svc', svc)]

    # All-ones weights versus no weights: predictions must coincide.
    unit_weights = np.ones((len(y), ))
    weighted = VotingClassifier(estimators=list(trio),
                                voting='soft').fit(X, y,
                                                   sample_weight=unit_weights)
    unweighted = VotingClassifier(estimators=list(trio),
                                  voting='soft').fit(X, y)
    assert_array_equal(weighted.predict(X), unweighted.predict(X))
    assert_array_almost_equal(weighted.predict_proba(X),
                              unweighted.predict_proba(X))

    # Single-estimator ensemble versus the bare estimator, both fitted
    # with the same random weights.
    sample_weight = np.random.RandomState(123).uniform(size=(len(y), ))
    solo = VotingClassifier(estimators=[('lr', logreg)], voting='soft')
    solo.fit(X, y, sample_weight)
    logreg.fit(X, y, sample_weight)
    assert_array_equal(solo.predict(X), logreg.predict(X))
    assert_array_almost_equal(solo.predict_proba(X),
                              logreg.predict_proba(X))

    # Adding an estimator without sample-weight support must fail loudly.
    knn = KNeighborsClassifier()
    with_knn = VotingClassifier(
        estimators=[('lr', logreg), ('svc', svc), ('knn', knn)],
        voting='soft')
    msg = "Underlying estimator 'knn' does not support sample weights."
    assert_raise_message(ValueError, msg, with_knn.fit, X, y, sample_weight)
def test_set_params():
    """Estimators and their nested parameters can be changed via set_params."""
    logreg = LogisticRegression(random_state=123, C=1.0)
    forest = RandomForestClassifier(random_state=123, max_depth=None)
    bayes = GaussianNB()

    first = VotingClassifier([('lr', logreg), ('rf', forest)],
                             voting='soft',
                             weights=[1, 2])
    # Unfitted view: lookup by key and by attribute give the same object.
    assert 'lr' in first.named_estimators
    assert first.named_estimators.lr is first.estimators[0][1]
    assert first.named_estimators.lr is first.named_estimators['lr']

    first.fit(X, y)
    # Fitted view: same checks against ``estimators_``.
    assert 'lr' in first.named_estimators_
    assert first.named_estimators_.lr is first.estimators_[0]
    assert first.named_estimators_.lr is first.named_estimators_['lr']

    # Replace the 'nb' entry by the forest; both ensembles must then agree.
    second = VotingClassifier([('lr', logreg), ('nb', bayes)],
                              voting='soft',
                              weights=[1, 2])
    second.set_params(nb=forest).fit(X, y)
    assert not hasattr(second, 'nb')

    assert_array_equal(first.predict(X), second.predict(X))
    assert_array_almost_equal(first.predict_proba(X),
                              second.predict_proba(X))
    for slot, expected in enumerate((logreg, forest)):
        assert_equal(second.estimators[slot][1].get_params(),
                     expected.get_params())

    # Nested parameters are addressed as ``<name>__<param>``.
    first.set_params(lr__C=10.0)
    second.set_params(nb__max_depth=5)

    assert first.estimators[0][1].get_params()['C'] == 10.0
    assert second.estimators[1][1].get_params()['max_depth'] == 5
    assert_equal(first.get_params()["lr__C"],
                 first.get_params()["lr"].get_params()['C'])
def test_set_estimator_none():
    """An estimator set to None through set_params is left out of the vote."""
    # Test predict
    logreg = LogisticRegression(random_state=123)
    forest = RandomForestClassifier(random_state=123)
    bayes = GaussianNB()
    all_three = [('lr', logreg), ('rf', forest), ('nb', bayes)]

    # Reference: the forest is present but weighted zero.
    reference = VotingClassifier(estimators=list(all_three),
                                 voting='hard',
                                 weights=[1, 0, 0.5]).fit(X, y)

    # Candidate: the forest is set to None before fitting.
    candidate = VotingClassifier(estimators=list(all_three),
                                 voting='hard',
                                 weights=[1, 1, 0.5])
    candidate.set_params(rf=None).fit(X, y)
    assert_array_equal(reference.predict(X), candidate.predict(X))

    assert_true(dict(candidate.estimators)["rf"] is None)
    assert_true(len(candidate.estimators_) == 2)
    has_forest = any(isinstance(fitted, RandomForestClassifier)
                     for fitted in candidate.estimators_)
    assert_true(not has_forest)
    assert_true(candidate.get_params()["rf"] is None)

    # Same comparison with soft voting.
    reference.set_params(voting='soft').fit(X, y)
    candidate.set_params(voting='soft').fit(X, y)
    assert_array_equal(reference.predict(X), candidate.predict(X))
    assert_array_almost_equal(reference.predict_proba(X),
                              candidate.predict_proba(X))

    # With every estimator set to None, fit must raise.
    msg = 'All estimators are None. At least one is required to be a classifier!'
    fit_without_estimators = candidate.set_params(lr=None, rf=None,
                                                  nb=None).fit
    assert_raise_message(ValueError, msg, fit_without_estimators, X, y)

    # Test soft voting transform
    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    pair = [('rf', forest), ('nb', bayes)]
    reference = VotingClassifier(estimators=list(pair),
                                 voting='soft',
                                 weights=[0, 0.5],
                                 flatten_transform=False).fit(X1, y1)
    candidate = VotingClassifier(estimators=list(pair),
                                 voting='soft',
                                 weights=[1, 0.5],
                                 flatten_transform=False)
    candidate.set_params(rf=None).fit(X1, y1)

    expected_both = np.array([[[0.7, 0.3], [0.3, 0.7]],
                              [[1., 0.], [0., 1.]]])
    expected_single = np.array([[[1., 0.], [0., 1.]]])
    assert_array_almost_equal(reference.transform(X1), expected_both)
    assert_array_almost_equal(candidate.transform(X1), expected_single)

    # Hard voting: transform output changes accordingly.
    reference.set_params(voting='hard')
    candidate.set_params(voting='hard')
    assert_array_equal(reference.transform(X1), np.array([[0, 0], [1, 1]]))
    assert_array_equal(candidate.transform(X1), np.array([[0], [1]]))
def test_estimator_weights_format():
    """Weights given as a list or as an ndarray must lead to the same model."""
    # Test estimator weights inputs as list and array
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    eclf1 = VotingClassifier(estimators=[("lr", clf1), ("rf", clf2)],
                             weights=[1, 2],
                             voting="soft")
    eclf2 = VotingClassifier(estimators=[("lr", clf1), ("rf", clf2)],
                             weights=np.array((1, 2)),
                             voting="soft")
    eclf1.fit(X, y)
    eclf2.fit(X, y)
    # Probabilities are floats; compare with a tolerance instead of requiring
    # bit-identical arrays, matching the other version of this test in the
    # file.
    assert_array_almost_equal(eclf1.predict_proba(X),
                              eclf2.predict_proba(X))
def classification_results(train,test):
    #Derivation of NBDriver using training data
    """
    Train soft-voting SVC+KDE ensembles on an undersampled training set and
    print test-set metrics for several feature-subset sizes.

    Arguments:
        train = feature matrix derived from Brown et al.
                (must contain a 'Label' column)
        test= feature matrix derived from Martelotto et al.
                (must contain a 'Label' column)
    Returns:
        best_model = Best ensemble model derived using the training data
                     (the model fitted in the LAST loop iteration)
        X_red= Dataframe derived after sampling that was used to train the model
               (again from the last iteration)
        scores= probability based classification scores
                (one roc_auc_score per candidate threshold, last iteration)
    """
    # NOTE(review): these lists are never appended to; `sen` and `spe` are
    # rebound to scalars inside the loop, the others are unused in this block.
    sen=[];spe=[];acc=[];auc=[];c=[];m=[];s=[]
    train_x=train.drop('Label',axis=1);train_y=train['Label'];
    test_x=test.drop('Label',axis=1);test_y=test['Label'];
    #Undersampling (RepeatedEditedNearestNeighbours) to reduce the majority class size
    samp=RepeatedEditedNearestNeighbours(random_state=42)
    X_samp,y_samp=samp.fit_resample(train_x,train_y)
    # Restore the original column names on the resampled features.
    X_samp = pd.DataFrame(X_samp, columns = train_x.columns)
    #Experimenting with different numbers of top features derived from the tree-based feature extraction method
    top_n_feats=[30,40,50,60,70]
    # presumably returns the columns ordered by importance -- verify against
    # feature_reduction_using_trees, which is defined outside this block
    X_r=feature_reduction_using_trees(X_samp,y_samp)
    cols=X_r.columns
    for n in top_n_feats:
        print("For top: ",n," features")
        # Keep the first n columns of the reduced feature table.
        X_red=X_r[cols[0:n]]
        sv=SVC(kernel="linear",probability=True,C=0.01,random_state=42) #chosen from 5foldCV based grid search
        kde=KDEClassifier(bandwidth=1.27) #chosen from 5foldCV based grid search
        best_model = VotingClassifier(estimators=[('sv', sv), ('kde', kde)], voting='soft',weights=[4, 7]) #best combination of weights selected by a brute force search (possible weights 1-10) using a cross-validation approach on the training data
        best_model.fit(X_red,y_samp)
        # Probability of the second class column for every test row.
        y_probs = best_model.predict_proba(test_x[X_red.columns])[:,1]
        # Scan thresholds 0.000 .. 0.999 and keep the one with the best AUC.
        # NOTE(review): the threshold is chosen on the test labels themselves.
        thresholds = arange(0, 1, 0.001)
        scores = [roc_auc_score(test_y, to_labels(y_probs, t)) for t in thresholds]
        ix= argmax(scores)
        # Labels are coded 2 (above threshold) and 1 (otherwise).
        y_test_predictions = np.where(best_model.predict_proba(test_x[X_red.columns])[:,1] > thresholds[ix], 2, 1)
        print("Thresh: ",thresholds[ix])
        # NOTE(review): sensi/speci are computed but not used afterwards; the
        # reported values come from the confusion matrix below.
        sensi= sensitivity_score(test_y, y_test_predictions, pos_label=2)
        speci=specificity_score(test_y,y_test_predictions,pos_label=2)
        accu=accuracy_score(test_y,y_test_predictions)
        auro=roc_auc_score(test_y,y_test_predictions)
        mcc=metrics.matthews_corrcoef(test_y,y_test_predictions)
        # Unpacking into four values assumes a 2x2 confusion matrix.
        tn, fp, fn, tp = confusion_matrix(test_y, y_test_predictions).ravel()
        ppv=tp/(tp+fp)
        npv=tn/(tn+fn)
        sen=tp/(tp+fn)
        spe=tn/(tn+fp)
        # Simple aggregate: sum of the four rates.
        score=ppv+npv+sen+spe
        # Length of the first column NAME of `train` is printed as kmer size.
        print("For kmer size: ",len(train.columns[0]))
        print("for top ",n," features")
        print(list(X_red.columns.values),"\n")
        score_dict={"Sen":sen,"Spe":spe,"PPV":ppv,"NPV":npv,"AUC":auro,"MCC":mcc,"ACC":accu}
        print(score)
        print(score_dict)
    # NOTE(review): the original indentation was lost; the two statements
    # below are placed after the loop, where they have no effect on the
    # returned values -- confirm against the upstream file.
    df=pd.DataFrame(y_test_predictions)
    y_samp = pd.DataFrame(y_samp, columns = ['x'])
    return best_model,X_red,scores
def test_set_estimator_none():
    """An estimator set to None through set_params is left out of the vote."""
    # Test predict
    logreg = LogisticRegression(random_state=123)
    forest = RandomForestClassifier(random_state=123)
    bayes = GaussianNB()
    all_three = [('lr', logreg), ('rf', forest), ('nb', bayes)]

    # Reference ensemble keeps the forest with a zero weight.
    zero_weighted = VotingClassifier(estimators=list(all_three),
                                     voting='hard',
                                     weights=[1, 0, 0.5]).fit(X, y)

    # Second ensemble has the forest removed via set_params.
    forest_removed = VotingClassifier(estimators=list(all_three),
                                      voting='hard',
                                      weights=[1, 1, 0.5])
    forest_removed.set_params(rf=None).fit(X, y)
    assert_array_equal(zero_weighted.predict(X), forest_removed.predict(X))

    assert_true(dict(forest_removed.estimators)["rf"] is None)
    assert_true(len(forest_removed.estimators_) == 2)
    forest_survived = any(isinstance(fitted, RandomForestClassifier)
                          for fitted in forest_removed.estimators_)
    assert_true(not forest_survived)
    assert_true(forest_removed.get_params()["rf"] is None)

    # Repeat the comparison with soft voting.
    zero_weighted.set_params(voting='soft').fit(X, y)
    forest_removed.set_params(voting='soft').fit(X, y)
    assert_array_equal(zero_weighted.predict(X), forest_removed.predict(X))
    assert_array_almost_equal(zero_weighted.predict_proba(X),
                              forest_removed.predict_proba(X))

    # Nothing left to vote: fit has to raise.
    msg = 'All estimators are None. At least one is required to be a classifier!'
    empty_fit = forest_removed.set_params(lr=None, rf=None, nb=None).fit
    assert_raise_message(ValueError, msg, empty_fit, X, y)

    # Test soft voting transform
    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    pair = [('rf', forest), ('nb', bayes)]
    zero_weighted = VotingClassifier(estimators=list(pair),
                                     voting='soft',
                                     weights=[0, 0.5],
                                     flatten_transform=False).fit(X1, y1)
    forest_removed = VotingClassifier(estimators=list(pair),
                                      voting='soft',
                                      weights=[1, 0.5],
                                      flatten_transform=False)
    forest_removed.set_params(rf=None).fit(X1, y1)

    two_estimator_output = np.array([[[0.7, 0.3], [0.3, 0.7]],
                                     [[1., 0.], [0., 1.]]])
    one_estimator_output = np.array([[[1., 0.], [0., 1.]]])
    assert_array_almost_equal(zero_weighted.transform(X1),
                              two_estimator_output)
    assert_array_almost_equal(forest_removed.transform(X1),
                              one_estimator_output)

    # After switching to hard voting, transform output changes accordingly.
    zero_weighted.set_params(voting='hard')
    forest_removed.set_params(voting='hard')
    assert_array_equal(zero_weighted.transform(X1),
                       np.array([[0, 0], [1, 1]]))
    assert_array_equal(forest_removed.transform(X1), np.array([[0], [1]]))
def test_parallel_predict():
    """Check parallel backend of VotingClassifier on toy dataset."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])

    # Same ensemble fitted with one and with two jobs.
    eclf1 = VotingClassifier(estimators=[("lr", clf1), ("rf", clf2),
                                         ("gnb", clf3)],
                             voting="soft",
                             n_jobs=1).fit(X, y)
    eclf2 = VotingClassifier(estimators=[("lr", clf1), ("rf", clf2),
                                         ("gnb", clf3)],
                             voting="soft",
                             n_jobs=2).fit(X, y)

    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    # Probabilities are floats; compare with a tolerance instead of requiring
    # bit-identical arrays, matching the parallel-fit test in this file.
    assert_array_almost_equal(eclf1.predict_proba(X),
                              eclf2.predict_proba(X))
def test_estimator_weights_format():
    """Weights given as a list or as an ndarray must lead to the same model."""
    logreg = LogisticRegression(random_state=123)
    forest = RandomForestClassifier(random_state=123)
    members = [('lr', logreg), ('rf', forest)]

    list_weighted = VotingClassifier(estimators=list(members),
                                     weights=[1, 2],
                                     voting='soft')
    array_weighted = VotingClassifier(estimators=list(members),
                                      weights=np.array((1, 2)),
                                      voting='soft')
    list_weighted.fit(X, y)
    array_weighted.fit(X, y)

    assert_array_almost_equal(list_weighted.predict_proba(X),
                              array_weighted.predict_proba(X))
def my_classifier_predictions(X_train, Y_train, X_test, X_testt, Y_testt):
    """Fit a soft-voting GBC + GaussianNB ensemble and score ``X_test``.

    Prints ``roc_auc_score`` computed from hard predictions on the training
    data and on the labelled hold-out pair ``(X_testt, Y_testt)``.

    Returns the flattened second-column class probabilities for ``X_test``.
    """
    # TODO: complete this
    members = [('gbc', GradientBoostingClassifier()), ('gnb', GaussianNB())]
    voter = VotingClassifier(estimators=members, voting='soft')
    voter = voter.fit(X_train, Y_train)

    train_labels = voter.predict(X_train)
    print('train:')
    print(roc_auc_score(Y_train, train_labels))

    test_probabilities = voter.predict_proba(X_test)[:, 1]

    print('test:')
    holdout_labels = voter.predict(X_testt)
    print(roc_auc_score(Y_testt, holdout_labels))

    return test_probabilities.flatten()
def test_voting_classifier_set_params():
    """Replacing an estimator via set_params matches building it directly."""
    logreg = LogisticRegression(random_state=123, C=1.0)
    forest = RandomForestClassifier(random_state=123, max_depth=None)
    bayes = GaussianNB()

    # Built directly with the forest in the second slot.
    direct = VotingClassifier([('lr', logreg), ('rf', forest)],
                              voting='soft',
                              weights=[1, 2]).fit(X, y)

    # Built with naive Bayes, then the 'nb' slot is swapped for the forest.
    swapped = VotingClassifier([('lr', logreg), ('nb', bayes)],
                               voting='soft',
                               weights=[1, 2])
    swapped.set_params(nb=forest).fit(X, y)

    assert_array_equal(direct.predict(X), swapped.predict(X))
    assert_array_almost_equal(direct.predict_proba(X),
                              swapped.predict_proba(X))
    for slot, expected in enumerate((logreg, forest)):
        assert swapped.estimators[slot][1].get_params() == \
            expected.get_params()
class VotingEnsemble(BaseEnsembleModel):
    """Soft-voting ensemble fitted on a random subset of the training data.

    Each learner is registered under the name ``"<class>_<index>"``.  At fit
    time a fraction ``random_drop_rate`` of the samples is discarded using a
    freshly drawn random seed.
    """

    def __init__(self, learners, weights=None, random_drop_rate=0.5):
        super(VotingEnsemble, self).__init__(learners)
        self.weights = weights
        self.learners = learners
        self.random_drop_rate = random_drop_rate

        named_learners = []
        for position, learner in enumerate(learners):
            label = '{}_{}'.format(learner.__class__, position)
            named_learners.append((label, learner))
        self.classifier = VotingClassifier(estimators=named_learners,
                                           voting='soft',
                                           weights=weights)

    def _fit(self, X, y):
        # Only the "train" half of the split is used; the rest is dropped.
        seed = random.randint(1, 100000)
        parts = train_test_split(X,
                                 y,
                                 test_size=self.random_drop_rate,
                                 random_state=seed)
        kept_X, kept_y = parts[0], parts[2]
        self.classifier.fit(kept_X, kept_y)

    def _predict(self, X):
        return self.classifier.predict(X)

    def _predict_proba(self, X):
        return self.classifier.predict_proba(X)
class VotingClassifierImpl():
    """Thin wrapper that forwards every call to an ``SKLModel`` instance.

    The constructor arguments are stored in ``_hyperparams`` and passed
    unchanged to ``SKLModel``.
    """

    def __init__(self,
                 estimators=None,
                 voting='hard',
                 weights=None,
                 n_jobs=None,
                 flatten_transform=True):
        self._hyperparams = dict(
            estimators=estimators,
            voting=voting,
            weights=weights,
            n_jobs=n_jobs,
            flatten_transform=flatten_transform,
        )
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model (with ``y`` only when given); return self."""
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def transform(self, X):
        model = self._wrapped_model
        return model.transform(X)

    def predict(self, X):
        model = self._wrapped_model
        return model.predict(X)

    def predict_proba(self, X):
        model = self._wrapped_model
        return model.predict_proba(X)

    def decision_function(self, X):
        model = self._wrapped_model
        return model.decision_function(X)
def voting(X_train, X_test, y_train, y_test, estimators):
    """Load or train the soft-voting ensemble, print its scores, return it.

    If ``./Models/votingModel.pkl`` exists it is loaded and ``estimators`` is
    ignored; otherwise a new ``VotingClassifier`` is fitted on the training
    data and dumped to that path.  Test accuracy and AUC are printed in both
    cases; training accuracy is printed only for a loaded model.
    """
    model_path = "./Models/votingModel.pkl"
    loaded_from_disk = os.path.isfile(model_path)

    if loaded_from_disk:
        ensemble = joblib.load(model_path)
    else:
        ensemble = VotingClassifier(estimators, voting='soft')
        ensemble.fit(X_train, y_train)
        joblib.dump(ensemble, model_path)

    # AUC from the probability of the second class column.
    probabilities = ensemble.predict_proba(X_test)
    fpr, tpr, thresholds = roc_curve(y_test, probabilities[:, 1])
    auc_score = auc(fpr, tpr)

    if loaded_from_disk:
        train_acc = ensemble.score(X_train, y_train)
    acc = ensemble.score(X_test, y_test)

    if loaded_from_disk:
        print("Voting Ensemble Training Acc:", train_acc)
    print("Voting Ensemble Acc:", acc)
    print("Voting Ensemble AUC:", auc_score)
    return ensemble
def process_cell(self, df_cell_train, df_cell_test, window):
    """Classify the test rows of one cell and return their top-3 places.

    Training rows whose ``place_id`` occurs fewer than ``th`` times (a name
    defined outside this block) are discarded first.  ``window`` is accepted
    but not used here.

    Returns ``(pred_labels, row_ids)``: for each test row the three place ids
    with the highest predicted probability (best first), and the test index.
    """
    # Drop rare places from the training rows.
    occurrences = df_cell_train.place_id.value_counts()
    frequent_enough = (
        occurrences[df_cell_train.place_id.values] >= th).values
    df_cell_train = df_cell_train.loc[frequent_enough]

    # Working on df_test
    row_ids = df_cell_test.index

    # Preparing data
    encoder = LabelEncoder()
    y = encoder.fit_transform(df_cell_train.place_id.values)
    train_features = df_cell_train.drop(['place_id', ], axis=1)
    X = train_features.values.astype(int)
    X_test = df_cell_test.values.astype(int)

    # Applying the classifier
    knn = KNeighborsClassifier(n_neighbors=50,
                               weights='distance',
                               metric='manhattan')
    forest = RandomForestClassifier(n_estimators=50, n_jobs=-1)
    voter = VotingClassifier(estimators=[('knn', knn), ('rf', forest)],
                             voting='soft')
    voter.fit(X, y)
    probabilities = voter.predict_proba(X_test)

    # Sort classes by descending probability and keep the first three.
    ranked = np.argsort(probabilities, axis=1)[:, ::-1]
    top_three = ranked[:, :3]
    pred_labels = encoder.inverse_transform(top_three)
    return pred_labels, row_ids
def process_one_cell(df_train, df_test, grid_id, th):
    """Classify the test rows of grid cell ``grid_id``.

    Training rows of the cell whose ``place_id`` occurs fewer than ``th``
    times are discarded before fitting.

    Returns ``(pred_labels, row_ids)``: for each test row the three place ids
    with the highest predicted probability (best first), and the test index.
    """
    # Training rows of this cell, without rare places.
    in_cell_train = df_train.loc[df_train.grid_cell == grid_id]
    occurrences = in_cell_train.place_id.value_counts()
    frequent_enough = (
        occurrences[in_cell_train.place_id.values] >= th).values
    in_cell_train = in_cell_train.loc[frequent_enough]

    # Test rows of this cell.
    in_cell_test = df_test.loc[df_test.grid_cell == grid_id]
    row_ids = in_cell_test.index

    # Encode targets and build integer feature matrices.
    encoder = LabelEncoder()
    y = encoder.fit_transform(in_cell_train.place_id.values)
    train_features = in_cell_train.drop(['place_id', 'grid_cell'], axis=1)
    X = train_features.values.astype(int)
    test_features = in_cell_test.drop(['grid_cell'], axis=1)
    X_test = test_features.values.astype(int)

    # Soft vote between a distance-weighted kNN and a random forest.
    knn = KNeighborsClassifier(n_neighbors=25,
                               weights='distance',
                               metric='manhattan')
    forest = RandomForestClassifier(n_estimators=30, n_jobs=-1)
    voter = VotingClassifier(estimators=[('knn', knn), ('rf', forest)],
                             voting='soft')
    voter.fit(X, y)
    probabilities = voter.predict_proba(X_test)

    # Sort classes by descending probability and keep the first three.
    ranked = np.argsort(probabilities, axis=1)[:, ::-1]
    top_three = ranked[:, :3]
    pred_labels = encoder.inverse_transform(top_three)
    return pred_labels, row_ids
def ensemble(X_train, X_test, y_train):
    """Fit a weighted soft-voting ensemble of six classifiers.

    Returns ``(predictions, y_prob)``: the predicted labels and the class
    probabilities for ``X_test``.
    """
    members = [
        ('logistic', linear_model.LogisticRegression()),
        ('svm', svm.SVC(probability=True)),
        ('Gussian', GaussianNB()),
        ('KNeighbors', KNeighborsClassifier(n_neighbors=5)),
        ('MLP', MLPClassifier(hidden_layer_sizes=(30, 30, 30))),
        ('grad', GradientBoostingClassifier()),
    ]
    # One weight per member, in the order listed above.
    member_weights = [1, 1, 2, 2, 2, 2]

    voter = VotingClassifier(members, voting='soft', weights=member_weights)
    voter.fit(X_train, y_train)

    predictions = voter.predict(X_test)
    y_prob = voter.predict_proba(X_test)
    return predictions, y_prob
class VotingEnsembler(BaseModel):
    """Class that combines models using a voting method.

    A hard voting method is equivalent to a majority voting. A soft voting
    returns the class with the highest probability (calculated as the sum of
    the probabilities predicted by each model)
    """

    def __init__(self, configs, score_method, predict_as_probability,
                 voting_method):
        BaseModel.__init__(self, configs, score_method,
                           predict_as_probability)
        self.models_ = []          # (model id, sklearn pipeline) pairs
        self.model_weights_ = []   # one weight per registered model
        self.voting_method_ = voting_method
        self.ensemble_model_ = None

    def init(self):
        """Method responsible for the ensembler initialization.

        Returns False if any pipeline model fails to initialise, True once
        the ``VotingClassifier`` has been built.
        """
        if not self.register_models():
            return False
        self.ensemble_model_ = VotingClassifier(
            estimators=self.models_,
            voting=self.voting_method_,
            weights=self.model_weights_)
        return True

    def register_models(self):
        """Method used to register the prediction models.

        Reads ``self.configs_['pipeline_models']``; a model without a
        ``'weight'`` entry gets weight 1.  Returns False on the first model
        that cannot be initialised.
        """
        for model_cfg in self.configs_['pipeline_models']:
            new_model = PipelineModel(model_cfg, self.score_method_,
                                      self.predict_as_probability_)
            if not new_model.init():
                # Single-argument print(...) emits the same text under both
                # Python 2 and Python 3.
                print('Error registering model %s in ensemble'
                      % (model_cfg['model']['id'],))
                return False
            print('Model %s registered in ensemble'
                  % (new_model.get_name(),))
            self.models_.append(
                (model_cfg['model']['id'], new_model.get_sklearn_pipeline()))
            self.model_weights_.append(model_cfg.get('weight', 1)
                                       if hasattr(model_cfg, 'get')
                                       else (model_cfg['weight']
                                             if 'weight' in model_cfg else 1))
        return True

    def get_name(self):
        """Get the label associated with a model (used for printing)"""
        if 'label' in self.configs_:
            return self.configs_['label']
        return self.configs_['id']

    def fit(self, input_data, targets):
        """Train the ensemble model"""
        self.ensemble_model_.fit(input_data, targets)

    def predict(self, input_data):
        """Predict the results of an ensemble model.

        Returns class probabilities when ``predict_as_probability_`` is set,
        class labels otherwise.
        """
        if self.predict_as_probability_:
            return self.ensemble_model_.predict_proba(input_data)
        return self.ensemble_model_.predict(input_data)
def test_pipeline_voting_tfidf_svc(self):
    """Convert a soft-voting ensemble of three text pipelines to ONNX.

    The converted model is run on the training sentences and its labels and
    probabilities are compared with the scikit-learn outputs.
    """
    svc_pipeline = Pipeline([
        ('tfidf1', TfidfVectorizer()),
        ('svc', SVC(probability=True, kernel='linear'))])
    sgd_pipeline = Pipeline([
        ('tfidf2', TfidfVectorizer(norm='l2', use_idf=False)),
        ('sgd', SGDClassifier(alpha=0.0001, penalty='l2',
                              loss='modified_huber'))])
    mnb_pipeline = Pipeline([
        ('tfidf3', TfidfVectorizer()),
        ('mnb', MultinomialNB())])
    members = [('p1', svc_pipeline), ('p2', sgd_pipeline),
               ('p3', mnb_pipeline)]
    soft_voter = VotingClassifier(members, voting='soft',
                                  flatten_transform=False)

    sentences = ["first sentance",
                 "second sentence",
                 "many sentances",
                 "dummy sentance",
                 "no sentance at all"]
    data = numpy.array(sentences)
    y = numpy.array([0, 0, 1, 0, 1])
    soft_voter.fit(data, y)

    expected_label = soft_voter.predict(data)
    expected_proba = soft_voter.predict_proba(data)

    df = pandas.DataFrame(data)
    df.columns = ['text']

    # zipmap is switched off for this model so probabilities come back as
    # a plain array.
    input_types = [('text', StringTensorType([None, 1]))]
    model_onnx = convert_sklearn(
        soft_voter,
        initial_types=input_types,
        target_opset=TARGET_OPSET,
        options={id(soft_voter): {'zipmap': False}})

    sess = InferenceSession(model_onnx.SerializeToString())
    column_input = data.reshape((-1, 1))
    got = sess.run(None, {'text': column_input})

    assert_almost_equal(expected_proba, got[1], decimal=5)
    assert_almost_equal(expected_label, got[0])
def voting_model(X_train, X_test, y_train):
    """Fit an equally weighted soft-voting ensemble and score ``X_test``.

    The three member estimators ``xgb``, ``rf`` and ``lgb`` are taken from
    the enclosing scope.  Returns the second-column class probabilities for
    ``X_test``.
    """
    members = [('xgb', xgb), ('rf', rf), ('lgb', lgb)]
    soft_voter = VotingClassifier(estimators=members,
                                  voting='soft',
                                  weights=[1, 1, 1])
    soft_voter.fit(X_train, y_train)
    class_probabilities = soft_voter.predict_proba(X_test)
    return class_probabilities[:, 1]
def test_parallel_fit():
    """Fitting with one job or two jobs must give the same ensemble."""
    logreg = LogisticRegression(random_state=123)
    forest = RandomForestClassifier(random_state=123)
    bayes = GaussianNB()
    members = [("lr", logreg), ("rf", forest), ("gnb", bayes)]

    # Small toy problem defined locally.
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])

    serial = VotingClassifier(estimators=list(members),
                              voting="soft",
                              n_jobs=1).fit(X, y)
    parallel = VotingClassifier(estimators=list(members),
                                voting="soft",
                                n_jobs=2).fit(X, y)

    assert_array_equal(serial.predict(X), parallel.predict(X))
    assert_array_almost_equal(serial.predict_proba(X),
                              parallel.predict_proba(X))
def test_notfitted():
    """Unfitted voting estimators must raise NotFittedError on every call."""
    eclf = VotingClassifier(estimators=[('lr1', LogisticRegression()),
                                        ('lr2', LogisticRegression())],
                            voting='soft')
    ereg = VotingRegressor([('dr', DummyRegressor())])
    msg = ("This %s instance is not fitted yet. Call 'fit'"
           " with appropriate arguments before using this estimator.")

    # The method is looked up inside the ``raises`` block so that an error
    # on attribute access would be caught the same way as one on the call.
    classifier_msg = msg % 'VotingClassifier'
    for method_name in ('predict', 'predict_proba', 'transform'):
        with pytest.raises(NotFittedError, match=classifier_msg):
            getattr(eclf, method_name)(X)

    regressor_msg = msg % 'VotingRegressor'
    for method_name in ('predict', 'transform'):
        with pytest.raises(NotFittedError, match=regressor_msg):
            getattr(ereg, method_name)(X_r)
def ensembleClassifier(models_folder,
                       data_folder,
                       save_suffix,
                       extension,
                       x=None,
                       y_true=None,
                       x_test=None,
                       y_test=None,
                       save=True):
    """Build a soft-voting ensemble from four pickled search results.

    The best parameters of the pickled ``GBCV``, ``LRCV``, ``RFCV`` and
    ``SVMCV`` objects (read via their ``best_params_`` attribute) are used to
    create fresh estimators, which are combined and fitted on ``(x, y_true)``.

    Parameters
    ----------
    models_folder : str
        Prefix of every model path (paths are built by plain concatenation,
        so it should end with a path separator).
    data_folder, save_suffix, extension : str
        Passed to ``load_data`` when ``x`` is not given; ``save_suffix`` and
        ``extension`` are also part of every model file name.
    x, y_true, x_test, y_test : optional
        Training and test data.  When ``x`` is None all four are loaded with
        ``load_data``.
    save : bool
        Pickle the fitted ensemble to ``<models_folder>Ensembler...``.

    Returns
    -------
    (test_score, y_pred, y_pred_proba) evaluated on the test data.
    """
    # Identity test: ``x == None`` is evaluated element-wise on arrays and
    # data frames, and its truth value is then ambiguous.
    if x is None:
        x, y_true, x_test, y_test = load_data(data_folder, save_suffix,
                                              extension)

    def _load_search(prefix):
        """Unpickle one saved search object, closing the file afterwards."""
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # model files from a trusted location.
        path = "{}{}{}{}".format(models_folder, prefix, save_suffix,
                                 extension)
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    print("loading models")
    Gradient_Boost_CV = _load_search("GBCV")
    Logistic_CV = _load_search("LRCV")
    Forest_CV = _load_search("RFCV")
    SVMCV = _load_search("SVMCV")

    gb_param = Gradient_Boost_CV.best_params_
    lr_param = Logistic_CV.best_params_
    rf_param = Forest_CV.best_params_
    sv_param = SVMCV.best_params_

    gb = GradientBoostingClassifier(**gb_param)
    rf = RandomForestClassifier(**rf_param)
    lr = LogisticRegression(**lr_param)
    # Soft voting calls predict_proba on every member, which SVC only
    # provides when constructed with probability=True.
    sv = SVC(**dict(sv_param, probability=True))

    classifier = VotingClassifier(estimators=[('gb', gb), ('lr', lr),
                                              ('rf', rf), ('sv', sv)],
                                  voting="soft")
    print("start fitting my model...")
    classifier.fit(x, y_true)

    if save:
        ensemble_path = "{}Ensembler{}{}".format(models_folder, save_suffix,
                                                 extension)
        with open(ensemble_path, 'wb') as f:
            pickle.dump(classifier, f)

    # The original had this message as a bare string expression (a no-op).
    print("finishing off...")
    test_score = classifier.score(x_test, y_test)
    y_pred = classifier.predict(x_test)
    y_pred_proba = classifier.predict_proba(x_test)
    print(test_score)
    return test_score, y_pred, y_pred_proba
def process_one_cell(df_train, df_test, x_min, x_max, y_min, y_max):
    """Classify the test rows that fall inside one rectangular cell.

    Training rows are taken from a slightly enlarged rectangle (border of
    0.025 in x and 0.0125 in y) and rows whose ``place_id`` occurs fewer than
    ``th`` times are discarded.  ``th`` and the feature weights ``fw`` are
    names defined outside this block.

    Returns
    -------
    (pred_labels, row_ids)
        For each test row the three place ids with the highest predicted
        probability (best first), and the index of the test rows.
        ``(None, None)`` when the cell has no training or no test rows.
    """
    x_border_augment = 0.025
    y_border_augment = 0.0125

    # Working on df_train
    df_cell_train = df_train[(df_train['x'] >= x_min - x_border_augment) &
                             (df_train['x'] < x_max + x_border_augment) &
                             (df_train['y'] >= y_min - y_border_augment) &
                             (df_train['y'] < y_max + y_border_augment)]
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= th).values
    # Copy so the in-place scaling below never touches (or warns about)
    # the caller's frame.
    df_cell_train = df_cell_train.loc[mask].copy()

    # Working on df_test
    df_cell_test = df_test[(df_test['x'] >= x_min) &
                           (df_test['x'] < x_max) &
                           (df_test['y'] >= y_min) &
                           (df_test['y'] < y_max)].copy()
    row_ids = df_cell_test.index

    if len(df_cell_train) == 0 or len(df_cell_test) == 0:
        return None, None

    # Feature engineering on x and y
    df_cell_train.loc[:, 'x'] *= fw[0]
    df_cell_train.loc[:, 'y'] *= fw[1]
    df_cell_test.loc[:, 'x'] *= fw[0]
    df_cell_test.loc[:, 'y'] *= fw[1]

    # Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id'], axis=1).values.astype(float)

    test_features = df_cell_test
    if 'place_id' in test_features.columns:
        test_features = test_features.drop(['place_id'], axis=1)
    X_test = test_features.values.astype(float)

    # Applying the classifier
    clf1 = BaggingClassifier(KNeighborsClassifier(n_neighbors=26,
                                                  weights='distance',
                                                  metric='manhattan'),
                             n_jobs=-1,
                             n_estimators=50)
    clf2 = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    # Soft voting is required: predict_proba is not available on a
    # hard-voting VotingClassifier, and the top-3 ranking below needs it.
    eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2)],
                            voting='soft')
    eclf.fit(X, y)
    y_pred = eclf.predict_proba(X_test)

    # Sort classes by descending probability and keep the first three.
    top_three = np.argsort(y_pred, axis=1)[:, ::-1][:, :3]
    pred_labels = le.inverse_transform(top_three)
    return pred_labels, row_ids
def test_sample_weight():
    """Check how VotingClassifier.fit handles the sample_weight argument."""
    logreg = LogisticRegression(random_state=123)
    forest = RandomForestClassifier(random_state=123)
    svc = SVC(probability=True, random_state=123)
    trio = [('lr', logreg), ('rf', forest), ('svc', svc)]

    # All-ones weights versus no weights: predictions must coincide.
    unit_weights = np.ones((len(y), ))
    weighted = VotingClassifier(estimators=list(trio),
                                voting='soft').fit(X, y,
                                                   sample_weight=unit_weights)
    unweighted = VotingClassifier(estimators=list(trio),
                                  voting='soft').fit(X, y)
    assert_array_equal(weighted.predict(X), unweighted.predict(X))
    assert_array_almost_equal(weighted.predict_proba(X),
                              unweighted.predict_proba(X))

    # Single-estimator ensemble versus the bare estimator, both fitted
    # with the same random weights.
    sample_weight = np.random.RandomState(123).uniform(size=(len(y), ))
    solo = VotingClassifier(estimators=[('lr', logreg)], voting='soft')
    solo.fit(X, y, sample_weight)
    logreg.fit(X, y, sample_weight)
    assert_array_equal(solo.predict(X), logreg.predict(X))
    assert_array_almost_equal(solo.predict_proba(X),
                              logreg.predict_proba(X))

    # An estimator without sample-weight support must produce a clear
    # ValueError.
    knn = KNeighborsClassifier()
    with_knn = VotingClassifier(
        estimators=[('lr', logreg), ('svc', svc), ('knn', knn)],
        voting='soft')
    msg = ('Underlying estimator KNeighborsClassifier does not support '
           'sample weights.')
    with pytest.raises(ValueError, match=msg):
        with_knn.fit(X, y, sample_weight)

    # A TypeError raised inside ``fit`` for a reason unrelated to
    # sample_weight must surface unchanged.
    class ClassifierErrorFit(ClassifierMixin, BaseEstimator):
        def fit(self, X, y, sample_weight):
            raise TypeError('Error unrelated to sample_weight.')

    failing = ClassifierErrorFit()
    with pytest.raises(TypeError, match='Error unrelated to sample_weight'):
        failing.fit(X, y, sample_weight=sample_weight)
def main():
    """Run the SURF/k-means pipeline end to end and write ``output.csv``.

    Reads the training and test images, builds SURF features and k-means
    histograms for both, trains a weighted soft-voting ensemble (decision
    tree, k-NN, SVC) on a 70/30 split of the scaled training histograms,
    prints the hold-out score and writes per-image class probabilities for
    the test set.
    """
    train_path = csvPath + "train\\"
    test_path = csvPath + "test\\"
    data = getTrainY()

    # Read Train Data
    train_id, Xtrain_data, damaged_train_Image, Ytrain = prepareData(
        train_path, data)
    print("Read Train Data")

    # Generated Train Model
    surfFeatures, temp = generateSURFFeatures(Xtrain_data)
    centroids, histo_tr = generateTrainKmeans(surfFeatures, temp)
    print("Generated Train Model")

    # Read Test Data
    test_id, test_data, damaged_test_Image = prepareTestData(test_path)
    print("Read Test Data")
    surfTestFeatures, temp1 = generateSURFFeatures(test_data)
    histo_te = generateTestKmeans(surfTestFeatures, centroids, temp1)
    print("Generated Test Model")

    # Scaling data: fit on the training histograms only and apply the same
    # transform to the test histograms, so both live on the same scale.
    standard_scaler = StandardScaler()
    svm_trf = standard_scaler.fit_transform(histo_tr)
    svm_tef = standard_scaler.transform(histo_te)

    # Classifier
    clf1 = DecisionTreeClassifier(max_depth=3)
    clf2 = KNeighborsClassifier()
    clf3 = SVC(probability=True, decision_function_shape='ovr')
    eclf = VotingClassifier(
        estimators=[('dt', clf1), ('knn', clf2), ('svc', clf3)],
        voting='soft', weights=[2, 1, 2])

    # Cross validation
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        svm_trf, Ytrain, test_size=0.3, random_state=0)

    # Fit the data
    eclf.fit(X_train, y_train)
    print(eclf.score(X_test, y_test))

    # Predict the whole test set at once, then hand out one (1, n_classes)
    # slice per image - the same shape the per-image calls used to produce.
    test_proba = eclf.predict_proba(svm_tef)
    pred_out = [test_proba[j:j + 1] for j in range(len(test_data))]

    generateOutputCSV(test_id, pred_out, damaged_test_Image, 'output.csv')
    print("Done!!!!!!")
def voting_process(df_list, label_list, scale=False):
    """Fit one soft-voting ensemble per dataset and collect its test results.

    Args:
        df_list: feature tables, one per dataset.
        label_list: label vectors aligned with ``df_list``.
        scale: when true, every table is first passed through ``scale_df``.

    Returns:
        dict with the keys ``'prediction'``, ``'probaility'`` (spelling kept
        because it is part of the returned structure), ``'y_test'`` and
        ``'y_score'``; each maps to a list with one entry per dataset.

    Raises:
        ValueError: if scaling or the train/test split fails.
    """
    random_state = np.random.RandomState(20180213)
    vt_results = {
        'prediction': [],
        'probaility': [],
        'y_test': [],
        'y_score': []
    }

    if scale:
        try:
            df_list = [scale_df(df) for df in df_list]
        except Exception:
            raise ValueError('Failed to execute DF Scaling.')
        print('DF Scaling successful.')

    for x, y in zip(df_list, label_list):
        try:
            x_train, x_test, y_train, y_test = train_test_split(
                x, y, test_size=.2, random_state=random_state)
        except Exception:
            raise ValueError('Train/Test split failed.')

        vt = VotingClassifier(estimators=[
            ('basic_log', LogisticRegression()),
            ('et', ExtraTreesClassifier()),
            ('ada', AdaBoostClassifier()),
            ('rf', RandomForestClassifier()),
            ('gbm', GradientBoostingClassifier(n_estimators=100, max_depth=5,
                                               learning_rate=0.1))
        ], voting='soft')

        # Samples with a falsy label weigh 50, all others weigh 1.
        weights = [1 if label else 50 for label in y_train]
        vt.fit(x_train, y_train, sample_weight=weights)

        # Probability of the second class column.
        second_class_proba = vt.predict_proba(x_test)[::, 1]
        vt_results['y_test'].append(y_test)
        vt_results['prediction'].append(vt.predict(x_test))
        vt_results['probaility'].append(second_class_proba)
        try:
            y_score = vt.decision_function(x_test)
        except AttributeError:
            # No decision_function on this estimator: fall back to the
            # probability (copied so the two result lists stay independent).
            y_score = second_class_proba.copy()
        vt_results['y_score'].append(y_score)

    return vt_results
def test_notfitted():
    """Unfitted voting estimators must raise NotFittedError on every method."""
    eclf = VotingClassifier(
        estimators=[("lr1", LogisticRegression()),
                    ("lr2", LogisticRegression())],
        voting="soft",
    )
    ereg = VotingRegressor([("dr", DummyRegressor())])
    msg = ("This %s instance is not fitted yet. Call 'fit'"
           " with appropriate arguments before using this estimator.")

    # (estimator, class name in the message, data, methods to exercise)
    cases = (
        (eclf, "VotingClassifier", X, ("predict", "predict_proba", "transform")),
        (ereg, "VotingRegressor", X_r, ("predict", "transform")),
    )
    for estimator, class_name, data, method_names in cases:
        expected = msg % class_name
        for method_name in method_names:
            with pytest.raises(NotFittedError, match=expected):
                getattr(estimator, method_name)(data)
def train_voting_classifier(estimators, X_train, y_train):
    """Fit a soft-voting ensemble and report its ROC AUC.

    NOTE: ``X_test`` and ``y_test`` are not parameters; they are looked up in
    the enclosing (module) scope and must exist before this is called.

    Returns:
        ``(vc_df, voting_clf, vc_pred)``: a one-cell DataFrame holding the
        ROC AUC, the fitted ensemble, and the predicted probability of the
        second class column for ``X_test``.
    """
    seed_everything(seed=1903)

    voting_clf = VotingClassifier(estimators=estimators, voting='soft')
    voting_clf.fit(X_train, y_train)

    vc_pred = voting_clf.predict_proba(X_test)[:, 1]  # This grabs the positive class prediction
    score = roc_auc_score(y_test, vc_pred)
    print(f'voting: {score:0.5f}')

    # Reuse the score computed above instead of evaluating the metric again.
    vc_df = pd.DataFrame(data=[score],
                         columns=['Voting Classifier Score'],
                         index=["ROC AUC Score"])
    return vc_df, voting_clf, vc_pred
def voteClassification(featureMatrix, targets, testFeatureMatrix):
    """Return class probabilities from an equally weighted soft vote.

    The ensemble combines a degree-3 polynomial SVM, a 5-nearest-neighbours
    classifier and a 25-tree random forest, fitted on ``featureMatrix`` /
    ``targets`` and evaluated on ``testFeatureMatrix``.
    """
    members = [
        ('svm', svm.SVC(kernel='poly', probability=True, degree=3)),
        ('nei', KNeighborsClassifier(n_neighbors=5)),
        ('rf', RandomForestClassifier(n_estimators=25)),
    ]
    ensemble = VotingClassifier(estimators=members, voting='soft',
                                weights=[1, 1, 1])
    ensemble.fit(featureMatrix, targets)
    probabilities = ensemble.predict_proba(testFeatureMatrix)
    return probabilities
def run():
    """Train a soft-voting ensemble on ``./data/australian.csv`` and plot its ROC.

    Column 14 of the whitespace-separated file is the target.  Prints the
    classification report, ROC AUC and accuracy on a stratified 40% hold-out
    and shows the ROC curve.
    """
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as pyplot
    import seaborn as sns
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, roc_auc_score
    from sklearn.ensemble import VotingClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.ensemble import ExtraTreesClassifier

    # Raw string: '\s' is a regex class, not a Python string escape.
    df = pd.read_table("./data/australian.csv", sep=r'\s+', header=None)
    y = df[14]
    X = df.drop(columns=14)

    # Split features and target into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=1, stratify=y, test_size=0.4)

    # Instantiate the Classifiers
    estimators = [('GNB', GaussianNB()),
                  ('extc', ExtraTreesClassifier()),
                  ('KNN', KNeighborsClassifier())]
    clf = VotingClassifier(estimators=estimators, voting='soft')
    clf.fit(X_train, y_train)

    # Make predictions for the test set
    y_pred_test = clf.predict(X_test)

    # View the per-class report
    print(classification_report(y_test, y_pred_test))

    # keep probabilities for the positive outcome only
    clf_probs = clf.predict_proba(X_test)[:, 1]

    # calculate scores
    clf_auc = roc_auc_score(y_test, clf_probs)

    # summarize scores
    print('ensemble: ROC AUC=%.3f' % (clf_auc))
    print("accuracy_score is %.3f" % (accuracy_score(y_test, y_pred_test, normalize=True)))

    # calculate roc curves
    clf_fpr, clf_tpr, _ = roc_curve(y_test, clf_probs)

    # plot the roc curve for the model
    pyplot.plot(clf_fpr, clf_tpr, marker='.', label='Ensemble')
    # axis labels
    pyplot.xlabel('False Positive Rate')
    pyplot.ylabel('True Positive Rate')
    # show the legend
    pyplot.legend()
    # show the plot
    pyplot.show()
def voting_model(self, X_train, X_test, y_train, bst_xgb, bst_forest, bst_gradient, bst_lgb):
    """Soft-vote the four given models and return second-column probabilities.

    The XGBoost and LightGBM members count double (weights 2, 1, 1, 2).
    """
    members = [
        ('xgb', bst_xgb),
        ('rf', bst_forest),
        ('gbm', bst_gradient),
        ('lgb', bst_lgb),
    ]
    ensemble = VotingClassifier(estimators=members, voting='soft',
                                weights=[2, 1, 1, 2])
    ensemble.fit(X_train, y_train)
    class_probabilities = ensemble.predict_proba(X_test)
    return class_probabilities[:, 1]
def main():
    """Fit a soft-voting ensemble and write test-set probabilities to CSV.

    Features are every column whose name contains "feature"; the target is
    the last column of the training file.  The validation log-loss is
    printed and embedded in the output file name.
    """
    train_frame = pd.read_csv('data/train_data.csv')
    valid_frame = pd.read_csv('data/valid_data.csv')
    test_frame = pd.read_csv('data/test_data.csv')

    feature_cols = [name for name in list(train_frame) if "feature" in name]
    target_col = train_frame.columns[-1]

    X_train, y_train = train_frame[feature_cols], train_frame[target_col]
    X_valid, y_valid = valid_frame[feature_cols], valid_frame[target_col]
    X_test = test_frame[feature_cols]

    members = [
        ('lr', LogisticRegression(C=1e-2, penalty='l2', n_jobs=-1)),
        ('rf', RandomForestClassifier(n_jobs=-1, warm_start=True)),
        ('cb', CatBoostClassifier(learning_rate=1e-2)),
    ]
    ensemble = VotingClassifier(estimators=members, voting='soft', n_jobs=-1)

    print('Fitting...')
    started = time.time()
    ensemble.fit(X_train, y_train)
    print('Fit: {}s'.format(time.time() - started))

    valid_proba = ensemble.predict_proba(X_valid)
    loss = log_loss(y_valid, valid_proba)
    print('Loss: {}'.format(loss))

    test_proba = ensemble.predict_proba(X_test)
    predictions = pd.DataFrame({
        'id': test_frame['id'],
        'probability': test_proba[:, 1],
    })

    # File name carries the creation time and the validation loss.
    csv_path = 'predictions/predictions_{}_{}.csv'.format(
        int(time.time()), loss)
    predictions.to_csv(csv_path, columns=('id', 'probability'), index=None)
    print('Saved: {}'.format(csv_path))
def test_sample_weight():
    """Tests sample_weight parameter of VotingClassifier"""
    lr = LogisticRegression(random_state=123)
    rf = RandomForestClassifier(random_state=123)
    svc = SVC(probability=True, random_state=123)

    def soft_ensemble(*named_estimators):
        # Fresh soft-voting classifier over the given (name, estimator) pairs.
        return VotingClassifier(estimators=list(named_estimators),
                                voting='soft')

    # All-ones weights are equivalent to fitting without weights.
    ones = np.ones((len(y),))
    with_ones = soft_ensemble(('lr', lr), ('rf', rf), ('svc', svc))
    with_ones = with_ones.fit(X, y, sample_weight=ones)
    without = soft_ensemble(('lr', lr), ('rf', rf), ('svc', svc)).fit(X, y)
    assert_array_equal(with_ones.predict(X), without.predict(X))
    assert_array_almost_equal(with_ones.predict_proba(X),
                              without.predict_proba(X))

    # A one-member ensemble behaves like the member fitted on its own.
    sample_weight = np.random.RandomState(123).uniform(size=(len(y),))
    lone = soft_ensemble(('lr', lr))
    lone.fit(X, y, sample_weight)
    lr.fit(X, y, sample_weight)
    assert_array_equal(lone.predict(X), lr.predict(X))
    assert_array_almost_equal(lone.predict_proba(X), lr.predict_proba(X))

    # check that an error is raised and indicative if sample_weight is not
    # supported.
    unsupported = soft_ensemble(('lr', lr), ('svc', svc),
                                ('knn', KNeighborsClassifier()))
    msg = ('Underlying estimator KNeighborsClassifier does not support '
           'sample weights.')
    with pytest.raises(ValueError, match=msg):
        unsupported.fit(X, y, sample_weight)

    # check that _parallel_fit_estimator will raise the right error
    # it should raise the original error if this is not linked to sample_weight
    class ClassifierErrorFit(BaseEstimator, ClassifierMixin):
        def fit(self, X, y, sample_weight):
            raise TypeError('Error unrelated to sample_weight.')

    broken = ClassifierErrorFit()
    with pytest.raises(TypeError, match='Error unrelated to sample_weight'):
        broken.fit(X, y, sample_weight=sample_weight)
def main(argv):
    """Train one-vs-rest soft-voting ensembles and write ``testY.txt``.

    Training rows with more than two missing values are dropped and the
    remaining gaps are filled with column medians.  Features are
    standardized and reduced to 30 principal components; both transforms are
    fitted on the training data and then applied unchanged to the test data.
    For each of the labels 1..3 a binary ensemble is fitted and its
    probability for the +1 class is stored; the predicted label is the one
    with the highest probability.

    ``argv`` is accepted for the caller's convenience and is not used.
    """
    trainX = pd.read_csv('trainingData.txt', sep='\t', header=None)
    # The last column is dropped before use.
    trainX.drop(trainX.columns[len(trainX.columns) - 1], axis=1, inplace=True)
    trainY = pd.read_csv("trainingTruth.txt", header=None, names=['Y'])
    df = trainX.join(trainY)

    few_missing = df.isnull().sum(axis=1) <= 2
    df = df[few_missing]
    df = df.fillna(df.median())
    print(len(df))

    X = df.iloc[:, 0:-1].values
    Y = df['Y'].values

    # One column per label: +1 where the row has that label, -1 elsewhere.
    Y_binary = np.ones((len(Y), 3)) * (-1)
    for i in range(3):
        Y_binary[Y == (i + 1), i] = 1

    scaler = preprocessing.StandardScaler().fit(X)
    X_scaled = scaler.transform(X)
    pca = PCA(n_components=30).fit(X_scaled)
    X_PCA = pca.transform(X_scaled)

    clf1 = LogisticRegression(random_state=1)
    clf2 = RandomForestClassifier(random_state=1, n_estimators=20)
    clf3 = GaussianNB()
    clf4 = DecisionTreeClassifier(max_depth=4)
    clf5 = KNeighborsClassifier(n_neighbors=7)
    clf6 = SVC(kernel='rbf', probability=True)
    estimators = [('lr', clf1), ('rf', clf2), ('gnb', clf3),
                  ('dt', clf4), ('kn', clf5), ('svc', clf6)]

    testX = pd.read_csv('testData.txt', sep='\t', header=None)
    testX.drop(testX.columns[len(testX.columns) - 1], axis=1, inplace=True)
    # Handle NA in test data, although not necessary for this assignment.
    testX = testX.fillna(testX.median())
    # Reuse the training-set scaler and projection so that train and test
    # points are expressed in the same coordinates.
    testX_scaled = scaler.transform(testX.values)
    testX_PCA = pca.transform(testX_scaled)

    proba = np.zeros((len(testX), 3))
    for i in range(3):
        eclf = VotingClassifier(estimators=estimators,
                                voting='soft').fit(X_PCA, Y_binary[:, i])
        # Column 1 is the +1 ("is this label") class.
        proba[:, i] = eclf.predict_proba(testX_PCA)[:, 1]

    # Write to file
    results = pd.DataFrame(proba)
    results['prediction'] = np.argmax(proba, axis=1) + 1
    results.to_csv('testY.txt', sep='\t', header=False, index=False)
    print(results.iloc[0:10, :])
def voting_classifier(files, var):
    """Train, apply or evaluate the spam voting ensemble.

    Args:
        files: mapping whose ``'path'`` entry is the CSV to read; it must
            provide ``text`` and ``spam`` columns.
        var: mode selector, converted with ``int``:
            ``1`` trains the ensemble on the training split and returns the
            assessment on that split; ``2`` loads the stored model and
            returns a DataFrame of per-message probabilities and results;
            any other value evaluates the stored model on the test split.
    """
    def _load_artifact(path):
        # NOTE(review): if `load` unpickles, only trusted files may be used.
        with open(path, 'rb') as handle:
            return load(handle)

    def _split():
        return model_selection.train_test_split(
            datasets['text'], datasets['spam'],
            test_size=0.33, random_state=42)

    datasets = pd.read_csv(files['path'])
    datasets = datasets.dropna()
    datasets.drop_duplicates(inplace=True)
    dsets = shuffle(datasets)
    mode = int(var)

    if mode == 1:
        d_train, _, l_train, _ = _split()
        dtrain_msg = features_transform(mail=d_train, dtrain=d_train,
                                        var1='VOTE')
        # Results are not used here; the calls are kept in case they have
        # side effects (TODO confirm against `hp`).
        hp.BAG_class(files)
        hp.RFC_class(files)
        hp.AB_class(files)
        hp.MNB_class(files)

        bag_class = BaggingClassifier(n_estimators=100)
        model_rf = RandomForestClassifier(n_estimators=20,
                                          criterion='entropy')
        ada_class = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                                       n_estimators=62)
        modelMNB = naive_bayes.MultinomialNB()
        eclf = VotingClassifier(
            estimators=[('BgC', bag_class), ('RF', model_rf),
                        ('Ada', ada_class), ('MNB', modelMNB)],
            voting='soft')
        train_classifier(eclf, dtrain_msg, l_train, typ="VOTE")
        eclf.fit(dtrain_msg, l_train)
        pred_train = eclf.predict(dtrain_msg)
        return model_assessment(u_classify='Voting EM', y_data=l_train,
                                predicted_class=pred_train)

    eclf = _load_artifact('VOTE.pkl')
    vect = _load_artifact('vectVOTE.pkl')
    tf = TfidfTransformer()
    load_vect = CountVectorizer(vocabulary=vect)

    if mode == 2:
        print("Inside training phase : ")
        d_test = dsets['text']
        res_df = []
        for i in range(len(d_test)):
            message = d_test.get(i)
            if message is None:
                # Label i was removed by dropna / drop_duplicates.
                continue
            # NOTE(review): the TF-IDF transform is refitted on each single
            # message, as before - confirm this matches the training step.
            dtest_msg = tf.fit_transform(load_vect.fit_transform([message]))
            pred_test = eclf.predict(dtest_msg)
            pred = eclf.predict_proba(dtest_msg)
            res_df.append((i + 1, [pred[0][0], pred[0][1], pred_test[0]]))

        # One row per message, keyed by its 1-based position; built directly
        # because DataFrame.from_items no longer exists in pandas.
        df = pd.DataFrame([row for _, row in res_df],
                          index=[key for key, _ in res_df],
                          columns=['Class O', 'Class 1', 'Result'])
        print(df.head(15))
        return df

    _, d_test, _, l_test = _split()
    dtest_msg = tf.fit_transform(load_vect.fit_transform(d_test))
    pred_test = eclf.predict(dtest_msg)
    return model_assessment(u_classify='Voting EM', y_data=l_test,
                            predicted_class=pred_test)
def voting_class(X, training_target, Y):
    """Fit a soft-voting ensemble on the first six columns and predict ``Y``.

    Args:
        X: 2-D training array; only columns 0..5 are used.
        training_target: labels aligned with the rows of ``X``.
        Y: 2-D array to predict for; only columns 0..5 are used.

    Returns:
        ``(proba, predictions)``: class probabilities and predicted labels
        for the rows of ``Y``.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import VotingClassifier

    clf1 = LogisticRegression(random_state=1)
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()
    eclf = VotingClassifier(
        estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],
        voting='soft')

    eclf.fit(X[:, 0:6], training_target)

    test_features = Y[:, 0:6]
    proba = eclf.predict_proba(test_features)
    # predict() needs the samples; it used to be called with no argument.
    predictions = eclf.predict(test_features)
    return proba, predictions
def all_classifer(X_train, y_train, X_test, y_test):
    """Score four tree ensembles and a soft vote weighted by their scores.

    Each classifier is fitted on the training data and passed to the
    project helper ``scores`` together with its test predictions and its
    probability for the second class column.  The first element of each
    returned score is then used as that classifier's voting weight.

    Returns:
        ``[score1, score2, score3, score5, score7]`` - random forest,
        gradient boosting, extra trees, AdaBoost and the voting ensemble.
        (Logistic regression, bagging and a single decision tree were tried
        earlier and are disabled, hence the gaps in the numbering.)
    """
    def _fit_and_score(clf, label):
        # Fit first, explicitly, so predict/predict_proba never depend on
        # argument evaluation order.
        clf.fit(X_train, y_train)
        return scores(y_test,
                      clf.predict(X_test),
                      clf.predict_proba(X_test)[:, 1],
                      label)

    rf = RandomForestClassifier(n_estimators=100, class_weight='balanced')
    score1 = _fit_and_score(rf, 'RT')

    gbc = GradientBoostingClassifier(n_estimators=50, learning_rate=0.05)
    score2 = _fit_and_score(gbc, 'gbc')

    # min_samples_split must be at least 2; a node with one sample cannot be
    # split anyway, so 2 keeps the intended "split as far as possible".
    ets = ExtraTreesClassifier(n_estimators=100, max_depth=None,
                               min_samples_split=2, random_state=0)
    score3 = _fit_and_score(ets, 'ets')

    ab = AdaBoostClassifier(algorithm='SAMME.R', n_estimators=50,
                            learning_rate=0.7)
    score5 = _fit_and_score(ab, 'abboost')

    eclf = VotingClassifier(
        estimators=[('rf', rf), ('gd', gbc), ('ETs', ets), ('ab', ab)],
        voting='soft',
        weights=[score1[0], score2[0], score3[0], score5[0]])
    score7 = _fit_and_score(eclf, 'voting')
    print(eclf)

    return [score1, score2, score3, score5, score7]
class VtClassifier(Model):
    '''
    Voting Classfier

    Hard-voting ensemble over the project model wrappers selected by name.
    Accepted names are listed in ``modelIndex``.
    '''

    def __init__(self, *args):
        '''Build the ensemble from the wrapper names given in ``args``.

        Raises ValueError (from ``list.index``) for an unknown name.
        '''
        Model.__init__(self)
        self.modelIndex = ['GNB', 'SVClassifier', 'LRModel', 'ABClassifier', 'GBClassifier']
        # Wrapper class for each name, aligned by position with modelIndex.
        wrapper_classes = [Model, SVClassifier, LRModel, ABClassifier, GBClassifier]
        self.models = []
        self.estimators = []
        for arg in args:
            wrapper_class = wrapper_classes[self.modelIndex.index(arg)]
            # As before, the wrapper kept in `models` and the estimator given
            # to the ensemble come from two separate wrapper instances.
            self.models.append(wrapper_class())
            self.estimators.append((arg, wrapper_class().model))
        self.model = VotingClassifier(estimators=self.estimators, voting='hard')

    def train(self, data, target):
        '''Train every individual wrapper, then fit the voting ensemble.'''
        for model in self.models:
            model.train(data, target)
        self.model.fit(data, target)

    def predict(self, test):
        '''Return the majority-vote class labels for ``test``.

        A hard-voting VotingClassifier has no ``predict_proba``, so the
        voted labels are returned instead of probabilities.
        '''
        return self.model.predict(test)
clf.predict(test_[cols]) preds = clf.predict_proba(test_[cols]) #print(confusion_matrix(test['class'], clf.predict(test[cols]))) print (pd.crosstab(test_['TripType'], clf.predict(test_[cols]), rownames=["Actual"], colnames=["Predicted"])) print (classification_report(test_['TripType'], clf.predict(test_[cols]))) score=accuracy_score(test_['TripType'],clf.predict(test_[cols])) # table.append([name,score]) print (score) ''' clf= VotingClassifier(estimators = [('BaggingKNN', BaggingClassifier(KNeighborsClassifier(20))), ('RandomForest', RandomForestClassifier(10)), ('BaggingCART', BaggingClassifier(DecisionTreeClassifier()))], voting='soft', weights=[7,1,1]) clf.fit(train_[cols], train_["TripType"]) clf.predict(test_[cols]) preds = clf.predict_proba(test_[cols]) #print(confusion_matrix(test['class'], clf.predict(test[cols]))) print (pd.crosstab(test_['TripType'], clf.predict(test_[cols]), rownames=["Actual"], colnames=["Predicted"])) print (classification_report(test_['TripType'], clf.predict(test_[cols]))) score=accuracy_score(test_['TripType'],clf.predict(test_[cols])) #table.append([score]) print (score) eclf = VotingClassifier(estimators = [('BaggingKNN', BaggingClassifier(KNeighborsClassifier(20))), ('BaggingRandomForest', RandomForestClassifier(10)), ('BaggingCART', BaggingClassifier(DecisionTreeClassifier()))], voting='soft', weights=[7,1,1]) eclf.fit(train[cols], train["TripType"]) #use the classifier to predict predicted=eclf.predict(test[cols]) #print (accuracy_score(predicted,test['TripType']))
''' #################################### clf1 = LogisticRegression(random_state=1) clf2 = RandomForestClassifier(n_estimators=200,max_depth = 15,random_state=1) clf3 = GaussianNB() clf4 = xgb.XGBClassifier(missing=np.nan, max_depth=15, n_estimators=200, learning_rate=0.02, nthread=16, subsample=0.95, colsample_bytree=0.85, seed=4242) clf5 = AdaBoostClassifier(n_estimators=300, learning_rate=0.02,random_state=1) eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3), ('xgb', clf4),('adb',clf5)], voting='soft') print("fitting..") eclf1 = eclf1.fit(X_train, y_train) print("predicting..") rfpreds = eclf1.predict_proba(X_test) print("arrived at verdict..") ################################### x,y,thresholds =roc_curve(y_test,rfpreds[:,1],1) plt.figure() plt.plot(x,y) plt.show() print (auc(x,y)) bestMCCR =0 for threshold in thresholds: predicted = rfpreds[:,1] > threshold CCR1, CCR2, mCCR = MCCR(predicted,y_test,0,1); bestMCCR = max(bestMCCR,mCCR)
# Settings shared by all five boosters; only depth, learning rate and the
# two sampling ratios differ between them.
_shared_xgb_params = {
    'silent': 0,
    'objective': 'multi:softprob',
    'num_class': 5,
    'eval_metric': 'mlogloss',
    'reg_lambda': 1,
    'n_estimators': num_round,
}
param1 = dict(_shared_xgb_params, max_depth=7, learning_rate=0.1,
              subsample=0.75, colsample_bytree=0.85)
param2 = dict(_shared_xgb_params, max_depth=6, learning_rate=0.1,
              subsample=0.85, colsample_bytree=0.75)
param3 = dict(_shared_xgb_params, max_depth=8, learning_rate=0.03,
              subsample=0.65, colsample_bytree=0.75)
param4 = dict(_shared_xgb_params, max_depth=9, learning_rate=0.03,
              subsample=0.55, colsample_bytree=0.65)
param5 = dict(_shared_xgb_params, max_depth=12, learning_rate=0.03,
              subsample=1, colsample_bytree=1)

# Unpack the dicts into keyword arguments; passing a dict positionally does
# not configure the classifier with its contents.
bst1 = xgb.XGBClassifier(**param1)
bst2 = xgb.XGBClassifier(**param2)
bst3 = xgb.XGBClassifier(**param3)
bst4 = xgb.XGBClassifier(**param4)
bst5 = xgb.XGBClassifier(**param5)

# The soft vote replaces the earlier manual average:
# prob = (bst1.predict(test) + bst2.predict(test) + bst3.predict(test) + bst4.predict(test) + bst5.predict(test))/5
model = VotingClassifier(
    estimators=[('xgb1', bst1), ('xgb2', bst2), ('xgb3', bst3),
                ('xgb4', bst4), ('xgb5', bst5)],
    voting='soft')
model.fit(X, y)
prob = model.predict_proba(test)

sub = pd.DataFrame(prob, columns=['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer'])
sub['ID'] = IDs
sub = sub[['ID', 'Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']]
sub.to_csv('sub.csv', index=False)
def fit(self):
    """Fit the individual classifiers and assemble a soft-voting ensemble.

    Two random forests, a logistic regression and an AdaBoost model are
    fitted on ``self.X_train`` / ``self.y_train``; each one's log-loss on
    the test split is printed and the model is stored in ``self.clfs``.
    The already-fitted models are then placed into a ``VotingClassifier``
    (weights 2, 2, 2, 1) without refitting, and the result is stored in
    ``self.ensembleClf``.

    KNN, gradient boosting and multinomial naive Bayes were tried earlier
    and are disabled (the latter for bad performance).
    """
    from sklearn.preprocessing import LabelEncoder

    def _fit_and_report(clf, key, report):
        # Fit, print the test log-loss with the given template, register.
        clf.fit(self.X_train, self.y_train)
        loss = log_loss(self.y_test, clf.predict_proba(self.X_test))
        print(report.format(score=loss))
        self.clfs[key] = clf
        return clf

    # Random forests
    print("Random forest on gini")
    rfc = RandomForestClassifier(n_estimators=43, criterion='gini',
                                 random_state=4141, n_jobs=-1,
                                 max_depth=21, max_features=0.12)
    print("Fitting random forest with gini")
    _fit_and_report(rfc, 'rfc', 'RFC LogLoss {score}')

    print("Random forest with entropy")
    rfc2 = RandomForestClassifier(n_estimators=80, criterion='entropy',
                                  random_state=1337, n_jobs=-1,
                                  max_depth=36, max_features=0.06)
    print("Fitting random forest with entropy")
    _fit_and_report(rfc2, 'rfc2', 'RFC2 LogLoss {score}')

    # Logistic regression
    print("Logistic regression on logloss")
    logreg = LogisticRegression(C=1.05, penalty='l2')
    print("Fitting logistic regression")
    _fit_and_report(logreg, 'lr', 'LR LogLoss {score}')

    # Adaboost
    print("Adaboost trees")
    abc = AdaBoostClassifier(n_estimators=100, learning_rate=0.5)
    print("Fitting Adaboost trees")
    _fit_and_report(abc, 'abc', 'ABC {score}')

    # Ensemble to models
    named_clfs = [('lr', logreg), ('rf', rfc), ('rf2', rfc2), ('abc', abc)]
    eclf3 = VotingClassifier(estimators=named_clfs, voting='soft',
                             weights=[2, 2, 2, 1])
    # Inject the fitted models instead of refitting.  They are listed in the
    # same order as `estimators`, so every weight meets its own model.
    eclf3.estimators_ = [clf for _, clf in named_clfs]
    # predict() maps the winning column back through le_; provide it, and
    # classes_, so the ensemble is usable beyond predict_proba.
    eclf3.le_ = LabelEncoder().fit(self.y_train)
    eclf3.classes_ = eclf3.le_.classes_

    print("Dig into the voting classifier")
    innerClfs = eclf3.estimators_
    print("Check estimators")
    print(innerClfs)
    print('Ensemble LogLoss {score}'.format(
        score=log_loss(self.y_test, eclf3.predict_proba(self.X_test))))
    self.ensembleClf = eclf3
    print("Ensemble fitting finished")
seed=23) XGBClassifier2 = xgb.XGBClassifier(objective='binary:logistic', missing=9999999999, max_depth=8, n_estimators=1000, learning_rate=0.05, nthread=4, subsample=0.8, colsample_bytree=0.5, min_child_weight=8, seed=1313) #0.825461 classifier = VotingClassifier([('clf1', XGBClassifier1), ('clf2', XGBClassifier2)], voting='soft', weights=[1, 1]) classifier.fit(X_train, y_train) testingPreds=classifier.predict_proba(sel_test); submission = pd.DataFrame({"ID":test.index, "TARGET":testingPreds[:,1]}) submission.to_csv("XGBoostEnsembled.csv", index=False) # mapFeat = dict(zip(["f"+str(i) for i in range(len(features))],features)) # ts = pd.Series(clf.booster().get_fscore()) # #ts.index = ts.reset_index()['index'].map(mapFeat) # ts.sort_values()[-15:].plot(kind="barh", title=("features importance")) # # featp = ts.sort_values()[-15:].plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(6, 10)) # plt.title('XGBoost Feature Importance') # fig_featp = featp.get_figure() # fig_featp.savefig('feature_importance_xgb.png', bbox_inches='tight', pad_inches=1)
xgb2 = xgb.XGBClassifier(
    max_depth=11,
    n_estimators=100,
    learning_rate=0.03,
    subsample=0.96,
    colsample_bytree=0.45,
    colsample_bylevel=0.45,
    objective='binary:logistic',
    nthread=4,
    seed=1313,
)
#score = log_loss(y_test, extc.predict_proba(X_test)[:, 1])

X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    train, target, random_state=1301, test_size=0.3)

# Members of the ensemble; note that xgb2 defined above is not among them.
clfs = [
    ('etc', etc1),
    ('rf', rf1),
    ('xgb', xgb1),
    ('etc2', etc2),
]

# Equal-weight soft vote over the four members.
clf = VotingClassifier(estimators=clfs, voting='soft', weights=[1, 1, 1, 1])

# 5-fold cross-validation on the training split; the mean score is printed
# with its sign flipped.
st = time.time()
scores = cross_validation.cross_val_score(
    clf, X_train, y_train, scoring='log_loss', cv=5, verbose=2)
print(scores.mean() * -1)
print("time elaspe", time.time() - st)

# NOTE(review): the script stops here, so the final fit and the submission
# file below are never reached while this exit() is in place.
exit()

clf.fit(train, target)
print('Predict...')
y_pred = clf.predict_proba(test)
# print y_pred
submission = pd.DataFrame({"ID": id_test, "PredictedProb": y_pred[:, 1]})
submission.to_csv('data/extra_trees_1_7.csv', index=False)
bagged_rf.fit(X_train, y_train)
print("bagged rf test %s" % roc_auc_score(y_test, bagged_rf.predict_proba(X_test)[:, 1]))
#print "bagged rf train",roc_auc_score(y_train, bagged_rf.predict_proba(X_train)[:,1])

# Calibrated variants are disabled:
# print "Calibrating Bagged Decision Trees..."
# calibrated_dt.fit(X_train, y_train)
# print "calibrated_dt test:", roc_auc_score(y_test, calibrated_dt.predict_proba(X_test)[:,1])
# print "Calibrating Bagged Random Forests..."
# calibrated_rf.fit(X_train, y_train)
# print "calibrated_rf test:", roc_auc_score(y_test, calibrated_rf.predict_proba(X_test)[:,1])

print("Voting with all models....")
voted_model = VotingClassifier(
    estimators=[('one', ada), ('two', bagged_rf), ('four', bagged_dt)],
    voting='soft')
voted_model.fit(X_train, y_train)
print("Voted Model test: %s" % roc_auc_score(y_test, voted_model.predict_proba(X_test)[:, 1]))
#print "Voted Model train",roc_auc_score(y_train, voted_model.predict_proba(X_train)[:,1])

####Loading test file and saving predictions
print("Saving Voted Submission")
# From here on X_test is the submission set, not the validation split.
X_test = np.genfromtxt('test_normal_286.csv', delimiter=",")

# Extra feature column: how many entries of each row are exactly zero.
ncounts = (X_test == 0).sum(axis=1).reshape(-1, 1).astype(float)
X_test = np.append(X_test, ncounts, axis=1)

# Cluster id of every row, as a float column vector.
categories_test = clusters.predict(X_test)
cats = np.asarray(categories_test, dtype=float).reshape(-1, 1)
def main(argv):
    """Train a soft-voting ensemble on the text data files and write predictions.

    Reads ``trainingData.txt`` (whitespace-separated numbers, ``NA`` for
    missing) and ``trainingTruth.txt`` (one integer label per line), fills
    missing values with column medians, keeps only rows with few enough
    missing values for their label, standardizes, projects onto three
    principal components and fits the ensemble.  The test file goes through
    the scaler and projection fitted on the training data, and probabilities
    plus the predicted label are written to ``testY_1114.txt``.

    Uses the module-level ``N_COLS`` and ``PLACEHOLDER``.  ``argv`` is not
    used.
    """
    # --- Training features --------------------------------------------
    cols = [[] for _ in range(N_COLS)]  # observed values per column
    rows = []
    rows_na = []      # number of missing values in each row
    n_na = 0
    freq_dict = {}    # missing-value count -> number of rows with it
    with open("trainingData.txt") as f:
        for row in f:
            features = [float(number) if number != 'NA' else PLACEHOLDER
                        for number in row.split()]
            curr_n_na = 0
            for col_i in range(N_COLS):
                if features[col_i] != PLACEHOLDER:
                    cols[col_i].append(features[col_i])
                else:
                    curr_n_na += 1
            n_na += curr_n_na
            rows.append(features)
            rows_na.append(curr_n_na)
            freq_dict[curr_n_na] = freq_dict.get(curr_n_na, 0) + 1
    print ("NA distribution: ",freq_dict)
    print ("Total # of NA:", n_na)

    # Replace every placeholder with the median of its column.
    medians = [statistics.median(col) for col in cols]
    for features in rows:
        for j, feature in enumerate(features):
            if feature == PLACEHOLDER:
                features[j] = medians[j]
    X = np.array(rows)

    # --- Training labels ----------------------------------------------
    with open("trainingTruth.txt") as f:
        Y = np.array([int(row) for row in f])
    print ("# of each label:", np.bincount(Y))

    # Labels 1 and 2 keep only complete rows; label 3 tolerates one NA.
    max_na_for_label = {1: 0, 2: 0, 3: 1}
    take = [i for i in range(X.shape[0])
            if Y[i] in max_na_for_label
            and rows_na[i] <= max_na_for_label[Y[i]]]
    X = X[take]
    Y = Y[take]
    print ("# of each label after normalization:", np.bincount(Y))

    # --- Model ---------------------------------------------------------
    scaler = preprocessing.StandardScaler().fit(X)
    X_scaled = scaler.transform(X)
    pca = PCA(n_components=3).fit(X_scaled)
    X_PCA = pca.transform(X_scaled)

    clf1 = LogisticRegression(random_state=1)
    clf2 = RandomForestClassifier(random_state=1, n_estimators=20)
    clf3 = GaussianNB()
    clf4 = DecisionTreeClassifier(max_depth=4)
    clf5 = KNeighborsClassifier(n_neighbors=7)
    clf6 = SVC(kernel='rbf', probability=True)
    estimators = [('lr', clf1), ('rf', clf2), ('gnb', clf3),
                  ('dt', clf4), ('kn', clf5), ('svc', clf6)]
    eclf = VotingClassifier(estimators, voting='soft').fit(X_PCA, Y)

    # --- Test data -----------------------------------------------------
    testX = pd.read_csv('testData.txt', sep='\t', header=None)
    testX.drop(testX.columns[len(testX.columns) - 1], axis=1, inplace=True)
    # Handle NA in test data, although not necessary for this assignment.
    # NOTE(review): this line may have been commented out originally; it is
    # a no-op when the test file has no missing values.
    testX = testX.fillna(testX.median())
    # Apply the training-set scaler and projection so the test points land
    # in the space the ensemble was trained in.
    testX_scaled = scaler.transform(testX.values)
    testX_PCA = pca.transform(testX_scaled)

    proba = eclf.predict_proba(testX_PCA)
    prediction = eclf.predict(testX_PCA)

    # Write to file
    results = pd.DataFrame(proba)
    results['prediction'] = prediction
    results.to_csv('testY_1114.txt', sep='\t', header=False, index=False)
    print(results.iloc[0:10, :])
    # A per-estimator inspection loop used to follow an early `return` here;
    # it could never run and has been removed.
class VotingWeightSearchCV(BaseEstimator, ClassifierMixin, TransformerMixin):
    """
    Soft voting classifier that chooses weights based on test dataset

    The data passed to ``fit`` is split once; every estimator is fitted on
    the train part and the voting weights are chosen to minimise the
    log-loss of the weighted prediction on the held-out part.

    Parameters
    ----------
    estimators : list of (name, estimator) pairs
    test_size : held-out share passed to ``train_test_split``
    starting_weights : initial weights for the search; 0.5 each when omitted
    verbose : messages with a verbosity up to this level are printed
    random_state : forwarded to ``train_test_split``
    refit : when true, the final ensemble is refitted on all of ``X``, ``y``;
        otherwise it reuses the estimators fitted on the train part

    Attributes set by ``fit``: ``weights_``, ``peak_score_``,
    ``best_estimator_``.
    """

    def __init__(self, estimators, test_size=0.33, starting_weights=None,
                 verbose=0, random_state=None, refit=False):
        self.test_size = test_size
        self.estimators = estimators
        self.verbose = verbose
        self.random_state = random_state
        self.refit = refit
        if starting_weights is not None:
            self.starting_weights = starting_weights
        else:
            self.starting_weights = [0.5] * len(estimators)
        self.best_estimator_ = None
        self.weights_ = None
        self.peak_score_ = None

    def _log(self, msg, verbosity=0):
        """Print ``msg``, indented by ``verbosity`` spaces, if verbose enough."""
        if self.verbose >= verbosity:
            # Single-argument print(): valid on both Python 2 and 3.
            print("{pre} {ind}{msg}".format(
                pre="(SW)",
                ind=" " * verbosity,
                msg=msg
            ))

    def fit(self, X, y):
        """Train and find the optimum weights.

        https://www.kaggle.com/hsperr/otto-group-product-classification-challenge/finding-ensamble-weights/code
        https://www.kaggle.com/sushanttripathy/otto-group-product-classification-challenge/wrapper-for-models-ensemble/code
        """
        from sklearn.preprocessing import LabelEncoder

        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=self.test_size,
            random_state=self.random_state,
            stratify=y
        )

        # Fit on train set
        self._log("Fitting on train subset...")
        fitted_estimators = []
        for label, clf in self.estimators:
            self._log("fitting {0}...".format(label), 1)
            fitted_estimators.append((label, clone(clf).fit(X_train, y_train)))

        # Predict on test set
        self._log("Predict on test subset...")
        predictions = []
        for label, clf in fitted_estimators:
            self._log("predict using {0}...".format(label), 1)
            predictions.append(clf.predict_proba(X_test))

        def log_loss_func(weights):
            # Log-loss of the weighted sum of the members' probabilities.
            final_prediction = sum(
                weight * prediction
                for weight, prediction in zip(weights, predictions))
            return log_loss(y_test, final_prediction)

        # Search weights: each in [0, 1], all summing to 1.
        self._log("Searching weights...")
        cons = ({"type": "eq", "fun": lambda w: 1 - sum(w)})
        bounds = [(0, 1)] * len(predictions)
        res = minimize(
            log_loss_func,
            self.starting_weights,
            method="SLSQP",
            bounds=bounds,
            constraints=cons
        )
        self.weights_ = list(res["x"])
        self.peak_score_ = res["fun"]
        self._log("Best weights: {0}".format(self.weights_), 1)
        self._log("Peak score: {0}".format(self.peak_score_), 1)

        # Build voting classifier
        self.best_estimator_ = VotingClassifier(
            estimators=self.estimators,
            voting="soft",
            weights=self.weights_
        )
        if self.refit:
            self._log("Refitting using best weights...")
            self.best_estimator_.fit(X, y)
        else:
            # Without a refit the ensemble would be unfitted and unable to
            # predict.  Hand it the estimators already fitted on the train
            # subset, plus the label encoder predict() relies on.
            encoder = LabelEncoder().fit(y_train)
            self.best_estimator_.estimators_ = [
                clf for _, clf in fitted_estimators]
            self.best_estimator_.le_ = encoder
            self.best_estimator_.classes_ = encoder.classes_
        return self

    def predict(self, X):
        """Predict class labels with the weighted ensemble."""
        return self.best_estimator_.predict(X)

    def predict_proba(self, X):
        """Return the weighted average class probabilities."""
        return self.best_estimator_.predict_proba(X)

    def transform(self, X):
        """Delegate to the ensemble's ``transform``."""
        return self.best_estimator_.transform(X)