def run_voting_model(X, y, cv_split):
    # Using default hyper-parameters
    MLA_dict = get_algorithms()
    # Removing models without attribute 'predict_proba' (required for soft voting)
    # and models with a 1.0 correlation to another model
    clf_keys = [
        "ada", "etc", "gbc", "rfc", "gpc", "lr",
        "bnb", "gnb", "knn", "svc", "lda", "qda",
    ]
    vote_est = [(clf_name, MLA_dict[clf_name]) for clf_name in clf_keys]

    # Hard Vote or majority rules
    vote_hard = ensemble.VotingClassifier(estimators=vote_est, voting="hard")
    vote_hard_cv = model_selection.cross_validate(
        vote_hard, X, y, cv=cv_split, return_train_score=True, n_jobs=-1)
    vote_hard.fit(X, y)

    print("Hard Voting Training accuracy: {:.2f}".format(
        vote_hard_cv["train_score"].mean() * 100))
    print("Hard Voting Test accuracy: {:.2f}".format(
        vote_hard_cv["test_score"].mean() * 100))
    print("Hard Voting Test 3*std: +/- {:.2f}".format(
        vote_hard_cv["test_score"].std() * 100 * 3))
    print("-" * 10)

    # Soft Vote or weighted probabilities
    vote_soft = ensemble.VotingClassifier(estimators=vote_est, voting="soft")
    vote_soft_cv = model_selection.cross_validate(
        vote_soft, X, y, cv=cv_split, return_train_score=True, n_jobs=-1)
    vote_soft.fit(X, y)

    print("Soft Voting Training accuracy: {:.2f}".format(
        vote_soft_cv["train_score"].mean() * 100))
    print("Soft Voting Test accuracy: {:.2f}".format(
        vote_soft_cv["test_score"].mean() * 100))
    print("Soft Voting Test 3*std: +/- {:.2f}".format(
        vote_soft_cv["test_score"].std() * 100 * 3))
    print("-" * 10)

    return vote_est, vote_hard_cv, vote_soft_cv
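# The function above hand-picks keys whose models expose predict_proba.
# A minimal sketch of doing that filter programmatically instead --
# get_algorithms() and its key/model layout are assumptions carried over
# from the snippet above, not part of the original code:
def soft_vote_candidates(MLA_dict):
    """Keep only (name, model) pairs usable for soft voting."""
    return [(name, model) for name, model in MLA_dict.items()
            if hasattr(model, "predict_proba")]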
def train_and_evaluate(complete_tag_count, prediction, predicted_class, nonpredicted_class):
    tag_total = np.array(complete_tag_count)
    predicted_final = np.array(predicted_class)
    nonpredicted_final = np.array(nonpredicted_class)
    # features_total = np.array(features)

    clf1 = linear_model.LogisticRegression(n_jobs=9)
    clf2 = ensemble.RandomForestClassifier(n_estimators=100, n_jobs=9)
    # min_samples_split must be >= 2 in current scikit-learn (1 raises ValueError)
    clf3 = ensemble.ExtraTreesClassifier(n_estimators=1000, max_depth=None,
                                         min_samples_split=2, random_state=0,
                                         criterion='entropy', n_jobs=9)
    clf4 = tree.DecisionTreeClassifier(max_depth=3)
    clf5 = naive_bayes.GaussianNB()
    clf6 = naive_bayes.BernoulliNB()
    clf7 = ensemble.GradientBoostingClassifier(n_estimators=100, learning_rate=0.5,
                                               max_depth=1, random_state=0)
    clf8 = ensemble.AdaBoostClassifier(n_estimators=100)
    clf9 = OneVsRestClassifier(clf4, n_jobs=9)
    clf10 = svm.SVC(kernel='linear', probability=True, C=0.05)

    estimators = [('lr', clf1), ('rf', clf2), ('ext', clf3), ('dt', clf4),
                  ('gnb', clf5), ('bnb', clf6), ('gbc', clf7), ('ada', clf8),
                  ('1vr', clf9), ('svc', clf10)]
    eclf = ensemble.VotingClassifier(estimators=estimators, voting='soft')
    eclf2 = ensemble.VotingClassifier(estimators=estimators, voting='hard')

    # sklearn.cross_validation was removed; use model_selection, whose
    # StratifiedKFold takes the labels at split time, not construction time.
    cv = model_selection.StratifiedKFold(n_splits=10)

    labels = ['Logistic Regression', 'Random Forest', 'Extra Trees', 'Decision Tree',
              'Gaussian NB', 'Bernoulli NB', 'Gradient Boosting Classifier', 'AdaBoost',
              'One vs Rest', 'SVC Linear', 'Soft Voting Ensemble', 'Hard Voting Ensemble']
    for clf, label in zip([clf1, clf2, clf3, clf4, clf5, clf6, clf7, clf8,
                           clf9, clf10, eclf, eclf2], labels):
        if prediction == 'age':
            # Predict gender out-of-fold, then append it as an extra feature.
            results = model_selection.cross_val_predict(clf, tag_total,
                                                        nonpredicted_final, cv=cv)
            final_tags = []
            for i in range(len(tag_total)):
                user = tag_total[i]
                user_gender = results[i]
                if user_gender == 'M' or user_gender == 'MALE':
                    g = 0
                elif user_gender == 'F' or user_gender == 'FEMALE':
                    g = 1
                else:
                    g = -1  # unlabeled gender; previously left g undefined
                user = np.append(user, g)
                final_tags.append(user)
        else:
            final_tags = tag_total
        scores = model_selection.cross_val_score(clf, final_tags, predicted_final,
                                                 cv=cv, scoring='accuracy')
        print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
def define_model(modelname):
    """
    Returns a model object for the given model name.

    Input
    ----
    modelname: str
        model type abbreviation, e.g., 'LR' for Logistic Regression

    Output
    ------
    clf: model object
        scikit-learn classifier
    """
    if modelname == 'LR':
        return linear_model.LogisticRegression()
    elif modelname == 'NN':
        return neighbors.KNeighborsClassifier()
    elif modelname == 'DT':
        return tree.DecisionTreeClassifier()
    elif modelname == 'RF':
        return ensemble.RandomForestClassifier()
    elif modelname == 'NB':
        return naive_bayes.GaussianNB()
    elif modelname == 'SVM':
        return svm.SVC()
    elif modelname == 'ET':
        return ensemble.ExtraTreesClassifier()
    elif modelname == 'SGD':
        return linear_model.SGDClassifier()
    elif modelname == 'AB':
        return ensemble.AdaBoostClassifier(
            tree.DecisionTreeClassifier(max_depth=1))
    elif modelname == 'GB':
        return ensemble.GradientBoostingClassifier()
    elif modelname == 'VC':
        # min_samples_split must be >= 2 in current scikit-learn
        return ensemble.VotingClassifier(estimators=[
            ('RFC', ensemble.RandomForestClassifier(n_estimators=10, max_depth=None,
                                                    min_samples_split=2, random_state=0)),
            ('ETC', ensemble.ExtraTreesClassifier(max_depth=None, max_features=5,
                                                  n_estimators=10, random_state=0,
                                                  min_samples_split=2)),
            ('ABC', ensemble.AdaBoostClassifier())],
            voting='soft')
    elif modelname == 'VC2':
        return ensemble.VotingClassifier(estimators=[
            ('LR', linear_model.LogisticRegression(C=0.1, random_state=1)),
            ('RFC', ensemble.RandomForestClassifier(max_depth=None, n_estimators=10,
                                                    random_state=0, min_samples_split=2)),
            ('ETC', ensemble.ExtraTreesClassifier(max_depth=None, max_features=5,
                                                  n_estimators=10, random_state=0,
                                                  min_samples_split=2))],
            voting='soft')
    else:
        raise ConfigError("Can't find the model: {}".format(modelname))
def get_classifiers(feature_extraction_technique, data, label, report):
    # Four base classifiers and five ensemble classifiers using soft voting.
    # In order to replicate our experiment, we set random_state = 0 for the
    # classifiers where necessary. VotingClassifier clones its estimators on
    # fit, so the base instances can safely be shared across the ensembles.
    nb = naive_bayes.MultinomialNB(alpha=1.0)
    lr = linear_model.LogisticRegression(C=1.0, max_iter=100, random_state=0)
    svc = svm.SVC(kernel='linear', C=1.0, random_state=0, probability=True)
    rf = ensemble.RandomForestClassifier(n_estimators=100, criterion='gini',
                                         random_state=0)

    classifiers = [
        ('NB', nb),
        ('LR', lr),
        ('SVM', svc),
        ('RF', rf),
        ('SVE1', ensemble.VotingClassifier(
            estimators=[('NB', nb), ('LR', lr), ('SVM', svc), ('RF', rf)],
            voting='soft')),
        ('SVE2', ensemble.VotingClassifier(
            estimators=[('NB', nb), ('LR', lr), ('SVM', svc)],
            voting='soft')),
        ('SVE3', ensemble.VotingClassifier(
            estimators=[('LR', lr), ('SVM', svc), ('RF', rf)],
            voting='soft')),
        ('SVE4', ensemble.VotingClassifier(
            estimators=[('NB', nb), ('SVM', svc), ('RF', rf)],
            voting='soft')),
        ('SVE5', ensemble.VotingClassifier(
            estimators=[('NB', nb), ('LR', lr), ('RF', rf)],
            voting='soft')),
    ]
    for name, model in classifiers:
        training_and_evaluation(feature_extraction_technique, name, model,
                                data, label, report)
def all_classifiers():
    soft_voting_classifiers = [
        ('RF', RandomForestClassifier(n_jobs=-1)),
        ('GB', ensemble.GradientBoostingClassifier()),
        ('LR', LogisticRegression(n_jobs=-1, solver='saga'))
        # ('GNB', GaussianNB())
    ]
    hard_voting_classifiers = [
        ('RF', RandomForestClassifier(n_jobs=-1)),
        ('GB', ensemble.GradientBoostingClassifier()),
        ('GNB', GaussianNB()),
        ('LR', LogisticRegression(n_jobs=-1, solver='saga'))
    ]
    return [
        # ('BalancedRandomForest', RandomForestClassifier(max_depth=None, class_weight="balanced", n_jobs=-1)),
        # ('RandomForest', RandomForestClassifier(max_depth=None, n_jobs=-1)),
        # ('GradientBoosting', gradient_booster()),
        # ('AdaBoost', AdaBoostClassifier()),
        # ('BalancedSVM', svm.SVC(class_weight='balanced')),
        # ('SVM', svm.SVC()),
        # ('GaussianNB', GaussianNB()),
        # ('LogisticRegression', LogisticRegression(n_jobs=-1, solver='saga')),
        # ('SoftVoting', ensemble.VotingClassifier(
        #     estimators=soft_voting_classifiers,
        #     voting='soft',
        #     n_jobs=-1)),
        ('HardVoting', ensemble.VotingClassifier(estimators=hard_voting_classifiers,
                                                 voting='hard',
                                                 n_jobs=-1)),
    ]
def fit(self, X, y):
    # Split into categorical and numerical pipelines:
    self.cat_clf = pipeline.Pipeline([('cat-tf', CategoricalTransformer()),
                                      ('bnb', naive_bayes.BernoulliNB())])
    self.num_clf = pipeline.Pipeline([('num-tf', NumericalTransformer()),
                                      ('gnb', naive_bayes.GaussianNB())])
    # Grid over complementary weight pairs (a, 1 - a) for the two estimators.
    weights_range = [[a, 1.0 - a]
                     for a in [0., .1, .2, .3, .4, .5, .6, .7, .8, .9, 1.0]]
    voting_range = ['soft']
    param_grid = dict(voting=voting_range, weights=weights_range)
    print("Using param grid " + str(param_grid))
    cv = model_selection.StratifiedShuffleSplit(n_splits=5, test_size=0.2,
                                                random_state=0)
    self.clf = ensemble.VotingClassifier(
        estimators=[('num-clf', self.num_clf), ('cat-clf', self.cat_clf)])
    self.clf = model_selection.GridSearchCV(self.clf, param_grid=param_grid,
                                            cv=cv, n_jobs=7)
    self.clf.fit(X, y)
    print("Best params: " + str(self.clf.best_params_)
          + " and corresponding score is " + str(self.clf.best_score_))
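# Because GridSearchCV refits the best estimator on the full data by default
# (refit=True), the object stored in self.clf above acts as a classifier
# itself. A hypothetical companion method for the same class -- not part of
# the original snippet -- could simply delegate:
def predict(self, X):
    # delegates to the refit best VotingClassifier found by the grid search
    return self.clf.predict(X)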
def model_ensemble(train, test, label):
    # 'deviance' was renamed 'log_loss' in scikit-learn 1.1 and later removed
    cl1 = GradientBoostingClassifier(loss='log_loss', learning_rate=0.1,
                                     n_estimators=100)
    cl2 = KNeighborsClassifier(n_neighbors=3)
    # 'alpha' was removed from LabelPropagation in recent scikit-learn releases
    cl3 = sm.LabelPropagation(kernel='rbf', gamma=20, n_neighbors=3,
                              max_iter=100, tol=0.0001)
    cl4 = svm.SVC(kernel='rbf', probability=True)
    cl5 = linear_model.LogisticRegression()
    cl6 = ske.RandomForestClassifier(n_estimators=100, criterion="gini")
    rf = ske.VotingClassifier(estimators=[('gradient boost', cl1), ('knn', cl2),
                                          ('labelprop', cl3), ('svm', cl4),
                                          ('logistic reg', cl5), ('rforest', cl6)],
                              voting='soft', weights=[2, 4, 3, 5, 2, 3])
    # Note: VotingClassifier.fit refits clones of its estimators, so the
    # individual fits below only matter if the base models are used directly.
    for cl in (cl1, cl2, cl3, cl4, cl5, cl6):
        cl.fit(train[label], train["Survived"])
    rf.fit(train[label], train["Survived"])
    test_predict = pd.DataFrame.copy(test)
    test_predict["Survived"] = rf.predict(test_predict[label])
    return test_predict
def getAndScoreVotingEnsemble(self, trainingDataFrame, predictorColumns,
                              labelColumn, votingMethod="hard"):
    trainingInputs = trainingDataFrame[predictorColumns]
    #trainingInputs = preprocessing.normalize(trainingInputs, axis=0)
    trainingLabels = trainingDataFrame[labelColumn]
    cv_split = model_selection.ShuffleSplit(n_splits=10, test_size=.3,
                                            train_size=.7, random_state=0)
    voter = ensemble.VotingClassifier(estimators=self.MLA, voting=votingMethod)
    # return_train_score=True is required to read 'train_score' below
    voter_cv = model_selection.cross_validate(voter, trainingInputs, trainingLabels,
                                              cv=cv_split, return_train_score=True)
    voter.fit(trainingInputs, trainingLabels)
    print("{} Voting Training mean Score: {:.2f}".format(
        votingMethod, voter_cv['train_score'].mean() * 100))
    print("{} Voting Test mean Score: {:.2f}".format(
        votingMethod, voter_cv['test_score'].mean() * 100))
    print("{} Voting Test Score 3*std: +/- {:.2f}".format(
        votingMethod, voter_cv['test_score'].std() * 100 * 3))
    print('-' * 10)
def main(name):
    data = Data(name)
    data.readData()
    model1 = SVC()
    model2 = RandomForestClassifier(n_estimators=100)
    model3 = Ensemble.GradientBoostingClassifier(n_estimators=100)
    model4 = KNeighborsClassifier()
    # The voting ensemble and the SVC grid search are built but never fit;
    # only the gradient-boosting model is trained and used below.
    model = Ensemble.VotingClassifier(estimators=[('svm', model1), ('rf', model2),
                                                  ('gb', model3), ('kn', model4)],
                                      weights=[3, 2, 2, 1])
    grid = GridSearchCV(estimator=model1, param_grid={'C': [0.5, 2, 10]}, cv=5)
    all_feature = np.concatenate((np.array(data.feature_train, dtype=np.float32),
                                  np.array(data.feature_validation, dtype=np.float32)))
    all_label = np.concatenate((np.array(data.label_train, dtype=np.float32),
                                np.array(data.label_validation, dtype=np.float32)))
    model3.fit(all_feature, all_label)
    # model2.fit(data.feature_train, data.label_train)
    # print("Best params: ", grid.best_estimator_.get_params())
    ans = model3.predict(data.test)
    print(ans, sum(ans))
    np.save('gdbt_' + name + '.npy', ans)
def train_classify(train_file, test_file):
    train_vectors, train_class, test_vectors = feature_generation(
        train_file, test_file)
    plot_distribution(train_class, train_file + ' Before sampling')
    train_vectors, train_class = over_sample(train_vectors, train_class)
    if test:
        eclf = ensemble.VotingClassifier(estimators=[
            ('nbm', models['Multinomial NB']),
            ('tree', models['Decision Tree']),
            ('rf', models['Random Forest']),
            ('lr', models['Logistic Regression']),
        ], voting='soft')
        preds = classify(eclf, train_vectors, train_class, test_vectors)
        with open('data/' + candidate + '_predictions' + '.txt', 'w+') as f:
            for index, pred in enumerate(preds):
                f.write(str(index + 1) + ';;' + str(pred) + '\n')
    else:
        metrics = []
        for index, model in enumerate(models):
            print("Classifying using", model)
            accScore, precision, recall, f1score = classify(
                models[model], train_vectors, train_class, test_vectors)
            metrics.append({})
            metrics[index]['Classifier'] = model
            metrics[index]['accuracy'] = accScore
            metrics[index]['positive f1score'] = f1score[0]
            metrics[index]['negative f1score'] = f1score[1]
        # pd.io.json.json_normalize was removed; use pd.json_normalize
        pd.json_normalize(metrics).plot(kind='bar', x='Classifier')
        plt.title(train_file)
        plt.grid(True, axis='y')
        plt.ylim(top=1)
        plt.xticks(rotation=0)
def fit_model(X_train, Y_train, X_2, Y_2, X_3, Y_3):
    """Learn the classifier, print metrics."""
    # Gradient Boosting Classifier
    gb = Class_Fit(clf=ensemble.GradientBoostingClassifier)
    param_grid = {'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]}
    gb.grid_search(parameters=param_grid, Kfold=5)
    gb.grid_fit(X=X_train, Y=Y_train)
    # combine into an ensemble
    gb_best = ensemble.GradientBoostingClassifier(**gb.grid.best_params_)
    votingC = ensemble.VotingClassifier(estimators=[('gb', gb_best)],
                                        voting='soft')
    # ... and train it
    votingC = votingC.fit(X_train, Y_train)
    predictions_baseline = votingC.predict(X_2)  # baseline
    print("____________________")
    print('Balanced sampling baseline model metrics:')
    print_result(Y_2, predictions_baseline)
    predictions_3 = (votingC.predict_proba(X_2)[:, 1] >= 0.8).astype(bool)
    print('Balanced sampling threshold metrics:')
    print_result(Y_2, predictions_3)
    predictions_4 = (votingC.predict_proba(X_3)[:, 1] >= 0.8).astype(bool)
    print('Only prof threshold metrics:')
    print_result(Y_3, predictions_4)
    predictions_5 = votingC.predict(X_3)
    print('Only prof metrics with baseline classifier:')
    print_result(Y_3, predictions_5)
    return votingC
def voting_ensemble():
    # Last but not least, let's combine some of these models
    # to try for better predictive performance
    n_trees = 100
    models = []

    # Voting ensembles
    # Number 1: Hard Vote (predicted class labels used for majority-rule voting)
    models.append([
        'Voting Classifier 1',
        ensemble.VotingClassifier(estimators=[
            ('lr', linear_model.LogisticRegression(random_state=1)),
            ('gbm', ensemble.GradientBoostingClassifier(random_state=1)),
        ], voting='hard')
    ])

    # Number 2: Soft Vote (argmax of sums of predicted probabilities used)
    # Recommended for an ensemble of well-calibrated classifiers
    models.append([
        'Voting Classifier 2',
        ensemble.VotingClassifier(estimators=[
            ('lda', discriminant_analysis.LinearDiscriminantAnalysis()),
            ('rf', ensemble.RandomForestClassifier(random_state=1,
                                                   n_estimators=n_trees,
                                                   max_features=3))
        ], voting='soft')
    ])

    # Number 3: Soft Vote with weights -- some models will be more valuable
    # than others (see the weighted sketch after this function)

    # Fit & evaluate models
    for name, model in models:
        # Different model metrics
        for scoring in ('accuracy', 'roc_auc'):
            cross_validation(name, model, X, Y, scoring)
        # Fit model and make predictions
        fitted_model = model.fit(X_train, Y_train)
        Y_pred = fitted_model.predict(X_test)
        # Classification report & confusion matrix
        # (needs a separate training and evaluation process)
        classification_report(name, Y_test, Y_pred)
        confusion_matrix(name, Y_test, Y_pred)
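# "Number 3" above is described but never built. A minimal sketch of a
# weighted soft vote reusing the same estimator types -- the weights are
# illustrative assumptions, not tuned values:
from sklearn import discriminant_analysis, ensemble

weighted_soft = ensemble.VotingClassifier(
    estimators=[
        ('lda', discriminant_analysis.LinearDiscriminantAnalysis()),
        ('rf', ensemble.RandomForestClassifier(random_state=1, n_estimators=100)),
    ],
    voting='soft',
    weights=[1, 2],  # hypothetical: trust the forest twice as much
)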
def fit(self, X, y):
    # Split into categorical and numerical pipelines:
    self.cat_clf = pipeline.Pipeline([('cat-tf', CategoricalTransformer()),
                                      ('bnb', naive_bayes.BernoulliNB())])
    self.num_clf = pipeline.Pipeline([('num-tf', NumericalTransformer()),
                                      ('scaler', preprocessing.StandardScaler()),
                                      ('gnb', naive_bayes.GaussianNB())])
    self.clf = ensemble.VotingClassifier(
        estimators=[('num-clf', self.num_clf), ('cat-clf', self.cat_clf)])
    self.clf.fit(X, y)
def vote_comparison(vote_est, trainX, trainY):
    # Hard vote, or majority rules
    hard_vote = ensemble.VotingClassifier(estimators=vote_est, voting='hard')
    hard_vote_cv = model_selection.cross_validate(hard_vote, trainX, trainY,
                                                  cv=cv_split,
                                                  return_train_score=True)
    hard_vote.fit(trainX, trainY)
    print('Hard Voting Training w/bin score mean: {:.2f}'.format(
        hard_vote_cv['train_score'].mean() * 100))
    print('Hard Voting Test w/bin score mean: {:.2f}'.format(
        hard_vote_cv['test_score'].mean() * 100))
    print('Hard Voting Test w/bin score 3*std: {:.2f}'.format(
        hard_vote_cv['test_score'].std() * 100 * 3))
    print('-' * 10)

    # Soft vote, or weighted probabilities
    soft_vote = ensemble.VotingClassifier(estimators=vote_est, voting='soft')
    soft_vote_cv = model_selection.cross_validate(soft_vote, trainX, trainY,
                                                  cv=cv_split,
                                                  return_train_score=True)
    soft_vote.fit(trainX, trainY)
    print('Soft Voting Training w/bin score mean: {:.2f}'.format(
        soft_vote_cv['train_score'].mean() * 100))
    print('Soft Voting Test w/bin score mean: {:.2f}'.format(
        soft_vote_cv['test_score'].mean() * 100))
    print('Soft Voting Test w/bin score 3*std: {:.2f}'.format(
        soft_vote_cv['test_score'].std() * 100 * 3))
    print('-' * 10)
def model(self, **kwargs):
    svm_params = {'C': 20000, 'gamma': 1e-3, 'kernel': 'rbf'}
    rfor_params = {
        'criterion': 'entropy',
        'max_depth': 17,
        # 'auto' was removed in scikit-learn 1.3; 'sqrt' is the equivalent
        # for classifiers
        'max_features': 'sqrt',
        'n_estimators': 150,
        'random_state': 0
    }
    # Defaults to hard voting, so SVC(probability=True) is not required.
    return ensemble.VotingClassifier([
        ('svm', svm.SVC(**svm_params)),
        ('rfor', ensemble.RandomForestClassifier(**rfor_params))
    ])
def hard_vote_tune(trainX, trainY, vote_est, test):
    grid_hard = ensemble.VotingClassifier(estimators=vote_est, voting='hard')
    grid_hard_cv = model_selection.cross_validate(grid_hard, trainX, trainY,
                                                  cv=cv_split,
                                                  return_train_score=True)
    grid_hard.fit(trainX, trainY)
    # print('Hard Voting w/Tuned Hyperparameters Training w/bin score mean: {:.2f}'
    #       .format(grid_hard_cv['train_score'].mean() * 100))
    # print('Hard Voting w/Tuned Hyperparameters Test w/bin score mean: {:.2f}'
    #       .format(grid_hard_cv['test_score'].mean() * 100))
    # print('Hard Voting w/Tuned Hyperparameters Test w/bin score 3*std: {:.2f}'
    #       .format(grid_hard_cv['test_score'].std() * 100 * 3))
    # print('-' * 10)
    pre_result = grid_hard.predict(test)
    return pre_result  # 68.899%
def real_test(x_train, y_train, x_test, y_test, FINAL_ALGOS):
    results = {}
    voting_estimators = []
    # dict.iteritems() is Python 2 only; use items()
    for classifier_name, clf in FINAL_ALGOS.items():
        voting_estimators.append((classifier_name, clf))
        clf.fit(x_train, y_train)
        prediction = clf.predict(x_test)
        accuracy = metrics.accuracy_score(y_test, prediction)
        results[classifier_name] = accuracy
    clf = ensemble.VotingClassifier(estimators=voting_estimators)
    clf.fit(x_train, y_train)
    accuracy = metrics.accuracy_score(y_test, clf.predict(x_test))
    results["Voting"] = accuracy
    return results
def machine_learning(ticker):
    """Train a voting classifier on the ticker's features and report accuracy."""
    features, labels, _ = extract_features(ticker)
    features_train, features_test, labels_train, labels_test = \
        model_selection.train_test_split(features, labels, test_size=0.25)
    classifier_a = ensemble.VotingClassifier(
        [('Linear_SVC', svm.LinearSVC()),
         ('K_Neighbors', neighbors.KNeighborsClassifier()),
         ('Random_Forest', ensemble.RandomForestClassifier())])
    classifier_a.fit(features_train, labels_train)
    accuracy = classifier_a.score(features_test, labels_test)
    predictions = classifier_a.predict(features_test)
    print('Accuracy:', accuracy)
    print('Prediction Spread:', collections.Counter(predictions))
    return accuracy
def SklearnVotingClassifier(X_train, Y_train, X_test):
    """
    :type X_train: numpy.ndarray
    :type X_test: numpy.ndarray
    :type Y_train: numpy.ndarray
    :rtype: List[numpy.ndarray]
    """
    from sklearn import ensemble
    from sklearn import neighbors
    from sklearn import tree
    from sklearn import svm
    from sklearn import linear_model
    from sklearn.preprocessing import StandardScaler

    # Data normalization. Note: fitting the scaler on train + test leaks test
    # statistics into training; fitting on X_train alone avoids that (see the
    # sketch after this function).
    scaler = StandardScaler()
    norm_val = scaler.fit(np.vstack((X_train, X_test)))
    X_train = norm_val.transform(X_train)
    X_test = norm_val.transform(X_test)

    SVM_2 = svm.SVC(kernel='linear', C=1)
    SVM_2 = SVM_2.fit(X=X_train, y=Y_train)
    Logistic_model = linear_model.LogisticRegression()
    Logistic_model = Logistic_model.fit(X_train, Y_train)
    Decision_Tree = tree.DecisionTreeClassifier()
    Decision_Tree = Decision_Tree.fit(X_train, Y_train)
    KNN_2 = neighbors.KNeighborsClassifier(n_neighbors=5)
    KNN_2 = KNN_2.fit(X_train, Y_train)

    voting_classifier = ensemble.VotingClassifier(estimators=[
        ('SVM', SVM_2),
        ('LogisticRegression', Logistic_model),
        ('DecisionTree', Decision_Tree),
        ('KNN', KNN_2)
    ], voting='hard')
    voting_classifier.fit(X_train, Y_train)
    vote_pred = voting_classifier.predict(X_test)
    # The original accuracy print referenced an undefined y_true; test labels
    # are not passed into this function, so no score is computed here.
    return [vote_pred]
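# A minimal leakage-free variant of the normalization above: fit the scaler
# on the training split only, then apply the same transform to both splits.
# The function name and argument names here are illustrative, not part of the
# original snippet:
from sklearn.preprocessing import StandardScaler

def scale_train_test(X_train, X_test):
    scaler = StandardScaler().fit(X_train)  # statistics from training data only
    return scaler.transform(X_train), scaler.transform(X_test)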
def voting(X_tra, y_tra, X_val, y_val, index_no, classifier_num):
    # classifier_list = GVal.getPARA('classifier_list_PARA')
    # dVM[3400] = ['estimators', [21, 23, 25, 30, 31], [21, 23, 25, 30, 31]]
    # Build the estimator list from the configured classifier codes: each
    # entry pairs a display name with the fitted model returned by that
    # classifier's training function.
    estims = []
    for i in range(len(dVM[3400][2])):
        code = dVM[3400][2][i]
        clf_temp = (classifier_list[code][1],
                    classifier_list[int(str(code)[0:2])][0](
                        X_tra, y_tra, X_val, y_val, index_no, code)[0])
        estims.append(clf_temp)
    y_tra, X_tra, y_val, X_val, weights = dataRegulationSKL(
        y_tra, X_tra, y_val, X_val, index_no)
    clf = skemb.VotingClassifier(estimators=estims, voting=dVM[3401][2])
    clf.fit(X_tra, y_tra)
    return processLearning(clf, X_tra, y_tra, X_val, y_val)
def majority_vote(x_train, y_train):
    result_df = pd.DataFrame()
    foldnum = 0
    # sklearn.cross_validation was removed; use model_selection.KFold and
    # iterate over kf.split(...)
    kf = model_selection.KFold(n_splits=10, shuffle=True, random_state=0)
    for train, val in kf.split(x_train):
        foldnum += 1
        tr_data, val_data, tr_targets, val_targets = helper.folds_to_split(
            x_train, y_train, train, val)
        # DataFrame.as_matrix() was removed; use to_numpy()
        tr_targets = tr_targets.to_numpy().ravel()
        val_targets = val_targets.to_numpy().ravel()
        final_estimators = [(name, FINAL_ALGOS[name]) for name in FINAL_ALGOS]
        clf = ensemble.VotingClassifier(estimators=final_estimators)
        clf.fit(tr_data, tr_targets)
        prediction = clf.predict(val_data)
        accuracy = metrics.accuracy_score(val_targets, prediction)
        result_df.loc[foldnum, "Voting"] = accuracy
    return result_df
def voting_model():
    rf_est = ensemble.RandomForestClassifier(n_estimators=750, criterion='gini',
                                             max_features='sqrt', max_depth=3,
                                             min_samples_split=4,
                                             min_samples_leaf=2, n_jobs=50,
                                             random_state=42, verbose=1)
    gbm_est = ensemble.GradientBoostingClassifier(n_estimators=900,
                                                  learning_rate=0.0008,
                                                  loss='exponential',
                                                  min_samples_split=3,
                                                  min_samples_leaf=2,
                                                  max_features='sqrt',
                                                  max_depth=3, random_state=42,
                                                  verbose=1)
    et_est = ensemble.ExtraTreesClassifier(n_estimators=750, max_features='sqrt',
                                           max_depth=35, n_jobs=50,
                                           criterion='entropy', random_state=42,
                                           verbose=1)
    # n_jobs is ignored by the liblinear solver, so it is omitted here
    lr_est = LogisticRegression(penalty='l1', C=2, max_iter=100,
                                solver='liblinear')
    des_tree_est = tree.DecisionTreeClassifier(criterion="entropy")
    voting_est = ensemble.VotingClassifier(
        estimators=[('rf', rf_est), ('lr', lr_est), ('gbm', gbm_est),
                    ('et', et_est), ('ds', des_tree_est)],
        voting='soft', weights=[3, 4, 5, 2, 2], n_jobs=50)
    return voting_est
def main():
    train = pd.read_csv(sys.argv[1])
    train = handle_missing_values(train)
    train = preprocess(train)
    # All parameters were optimized using grid search with cross-validation
    # (GridSearchCV); see the sketch after this function.
    rfc = ensemble.RandomForestClassifier(n_estimators=100, max_depth=9,
                                          max_features=3, min_samples_leaf=1e-5,
                                          min_samples_split=1e-5,
                                          criterion='entropy', random_state=360)
    gbc = ensemble.GradientBoostingClassifier(n_estimators=275, max_depth=6,
                                              max_features=9,
                                              min_samples_leaf=1e-9,
                                              min_samples_split=1e-9,
                                              subsample=0.9, random_state=360)
    clf = ensemble.VotingClassifier(estimators=[('rfc', rfc), ('gbc', gbc)],
                                    weights=[3.5, 6.5], voting='soft')
    x_train = train.drop(FTR_DVCAT, axis=1).values
    y_train = train[FTR_DVCAT].values
    if VALIDATING:
        cross_validate(clf, x_train, y_train)
    else:
        test = pd.read_csv(sys.argv[2])
        test = preprocess(test)
        x_test = test.drop(FTR_DVCAT, axis=1).values
        y_test = test[FTR_DVCAT].values
        score = solve(clf, x_train, y_train, x_test, y_test)
        print(score)
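# The comment above says the parameters came from GridSearchCV. A minimal
# sketch of how the ensemble's voting weights could be searched the same way;
# the candidate weight pairs are illustrative assumptions, not the grid the
# author used:
from sklearn import model_selection

def tune_voting_weights(clf, x_train, y_train):
    param_grid = {'weights': [[3.5, 6.5], [5, 5], [6.5, 3.5]]}
    search = model_selection.GridSearchCV(clf, param_grid=param_grid, cv=5)
    search.fit(x_train, y_train)
    return search.best_params_, search.best_score_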
movie_tfidf = extract_features(movie_sentiment_data.data)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    movie_tfidf, movie_sentiment_data.target, test_size=0.30, random_state=42)

# similar to nltk.NaiveBayesClassifier.train()
clf1 = linear_model.LogisticRegression()
clf1.fit(X_train, y_train)
print('Logistic Regression performance: {}'.format(clf1.score(X_test, y_test)))

clf2 = linear_model.SGDClassifier()
clf2.fit(X_train, y_train)
print('SGDClassifier performance: {}'.format(clf2.score(X_test, y_test)))

clf3 = naive_bayes.MultinomialNB()
clf3.fit(X_train, y_train)
print('MultinomialNB performance: {}'.format(clf3.score(X_test, y_test)))

clf4 = naive_bayes.BernoulliNB()
clf4.fit(X_train, y_train)
print('BernoulliNB performance: {}'.format(clf4.score(X_test, y_test)))

voting_model = ensemble.VotingClassifier(
    estimators=[('lr', clf1), ('sgd', clf2), ('mnb', clf3), ('bnb', clf4)],
    voting='hard')
voting_model.fit(X_train, y_train)
print('Voting classifier performance: {}'.format(
    voting_model.score(X_test, y_test)))
def _make_ensemble(classifiers, voting='hard'):
    return ensemble.VotingClassifier(
        [('c' + str(i), classifier) for i, classifier in enumerate(classifiers)],
        voting=voting)
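# Example use of the helper above -- the two base models here are
# illustrative assumptions, not part of the original snippet:
from sklearn import linear_model, tree

clf = _make_ensemble([linear_model.LogisticRegression(),
                      tree.DecisionTreeClassifier()], voting='hard')
# clf.fit(X, y); clf.predict(X_new)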
result = model_selection.cross_val_score(model, X, y, cv=kfold)
print(f'Accuracy of RF: {result.mean()*100:.2f}%')

'''
2.3 ExtraTreesClassifier (ET)
Idea: ET is a variant of RF. It differs in that each individual classifier is
trained on the full dataset, and both the split point and the tree structure
are chosen at random.
'''
model = ensemble.ExtraTreesClassifier(n_estimators=100, max_features=4,
                                      random_state=3)
# random_state requires shuffle=True in current scikit-learn
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=1)
result = model_selection.cross_val_score(model, X, y, cv=kfold)
print(f'Accuracy of ET: {result.mean()*100:.2f}%')

"""
2.4 Voting
Idea: set up several different individual classifiers and predict by vote.
"""
base1 = tree.DecisionTreeClassifier()
base2 = svm.SVC(gamma='auto')
base3 = naive_bayes.GaussianNB()
bases = []
bases.append(('Decision Tree', base1))
bases.append(('SVC', base2))
bases.append(('Naive Bayes', base3))
model = ensemble.VotingClassifier(estimators=bases)
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=1)
result = model_selection.cross_val_score(model, X, y, cv=kfold)
print(f'Accuracy of Voting: {result.mean()*100:.2f}%')
inplace=False)
titanic2.shape
x_train = titanic2[0:titanic_train.shape[0]]
x_train.shape
x_train.info()
y_train = titanic_train['Survived']

# create estimators for voting classifier
dt_estimator = tree.DecisionTreeClassifier(random_state=100)      # Model 1
rf_estimator = ensemble.RandomForestClassifier(random_state=100)  # Model 2
ada_estimator = ensemble.AdaBoostClassifier(random_state=100)     # Model 3
voting_estimator = ensemble.VotingClassifier(estimators=[('dt', dt_estimator),
                                                         ('rf', rf_estimator),
                                                         ('ada', ada_estimator)],
                                             voting='soft',
                                             weights=[10, 20, 25])
voting_grid = {
    'dt__max_depth': [3, 5, 7],
    'rf__n_estimators': [20],
    'rf__max_features': [5, 6],
    'rf__max_depth': [5],
    'ada__n_estimators': [10]
}
grid_voting_estimator = model_selection.GridSearchCV(voting_estimator,
                                                     voting_grid, cv=10,
                                                     n_jobs=5)
grid_voting_estimator.fit(x_train, y_train)
# grid_scores_ was removed from GridSearchCV; use cv_results_ instead
print(grid_voting_estimator.cv_results_)
                                       max_depth=6,
                                       n_estimators=866,
                                       subsample=0.95))
RandomForest = (ensemble
                .RandomForestClassifier(max_features=4,
                                        min_samples_leaf=3,
                                        n_estimators=424))

estimators = [('svm', SVM), ('ada', AdaBoost),
              ('gb', GradientBoosting), ('rf', RandomForest)]
model = ensemble.VotingClassifier(estimators=estimators,
                                  voting='hard',
                                  n_jobs=4)
model.fit(X_train, y_train)

# There is 1 test sample without a Fare variable (sample #152). Since
# all our algorithms use the Fare variable, we'll just ignore this sample
# and mark it as not surviving
X_test = np.delete(X_test, (152), axis=0)
X_test = scale(X_test)
y_test_predict = model.predict(X_test)
y_test_predict = np.insert(y_test_predict, (152), 0, axis=0)

predictions = np.column_stack([np.array(range(892, 1310)), y_test_predict])
from sklearn import model_selection, ensemble, svm
import xgboost as xgb

# initialise classifiers
rf_clf = ensemble.RandomForestClassifier(n_estimators=100, random_state=0)
et_clf = ensemble.ExtraTreesClassifier(n_estimators=100, random_state=0)
gb_clf = ensemble.GradientBoostingClassifier(n_estimators=100, random_state=0)
ada_clf = ensemble.AdaBoostClassifier(n_estimators=100, random_state=0)
svm_clf = svm.LinearSVC(C=0.1, random_state=0)
xgb_clf = xgb.XGBClassifier(n_estimators=100)
e_clf = ensemble.VotingClassifier(estimators=[('xgb', xgb_clf), ('rf', rf_clf),
                                              ('et', et_clf), ('gbc', gb_clf),
                                              ('ada', ada_clf), ('svm', svm_clf)])

# score using cross validation
clf_list = [xgb_clf, rf_clf, et_clf, gb_clf, ada_clf, svm_clf, e_clf]
name_list = ['XGBoost', 'Random Forest', 'Extra Trees', 'Gradient Boosted',
             'AdaBoost', 'Support Vector Machine', 'Ensemble']
for clf, name in zip(clf_list, name_list):
    scores = model_selection.cross_val_score(clf, features, target, cv=10)
    print("Accuracy: %0.2f +/- %0.2f (%s 95%% CI)"
          % (scores.mean(), scores.std() * 2, name))

# **We choose SVM as our predictor:**
    ('rfc', ensemble.RandomForestClassifier()),

    # Nearest Neighbor: http://scikit-learn.org/stable/modules/neighbors.html
    ('knn', neighbors.KNeighborsClassifier()),

    # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
    ('xgb', XGBClassifier()),
    ('lgbm', LGBMClassifier())
]

seed = 123
skf = model_selection.ShuffleSplit(n_splits=10, test_size=.3,
                                   train_size=.6, random_state=seed)

# Hard Vote or majority rules
vote_hard = ensemble.VotingClassifier(estimators=vote_est, voting='hard')
vote_hard_cv = model_selection.cross_validate(vote_hard, data1_x_bin,
                                              data[Target], cv=skf,
                                              scoring='f1')
vote_hard.fit(data1_x_bin, data[Target])

# print("Hard Voting Training w/bin score mean: {:.2f}".format(vote_hard_cv['train_score'].mean()*100))
print("Hard Voting Test w/bin score mean: {:.2f}".format(
    vote_hard_cv['test_score'].mean() * 100))
print("Hard Voting Test w/bin score 3*std: +/- {:.2f}".format(
    vote_hard_cv['test_score'].std() * 100 * 3))
print('-' * 10)

# Soft Vote or weighted probabilities
vote_soft = ensemble.VotingClassifier(estimators=vote_est, voting='soft')
vote_soft_cv = model_selection.cross_validate(vote_soft, data1_x_bin,
                                              data[Target], cv=skf,
                                              scoring='f1')
vote_soft.fit(data1_x_bin, data[Target])

# print("Soft Voting Training w/bin score mean: {:.2f}".format(vote_soft_cv['train_score'].mean()*100))
print("Soft Voting Test w/bin score mean: {:.2f}".format(
    vote_soft_cv['test_score'].mean() * 100))