def ada_boost_classifier(X_train_res, X_test, y_train_res):
    """Fit a tuned AdaBoost model and report hold-out and cross-validated scores.

    The booster wraps the module-level ``dt_clf`` estimator and its test
    predictions are compared against the module-level ``y_test`` labels;
    neither of them is passed in as an argument.

    Args:
        X_train_res: Training features the model is fitted on.
        X_test: Features to predict for the kappa / accuracy computation.
        y_train_res: Training labels matching ``X_train_res``.

    Returns:
        Tuple ``(ada_accuracy, ada_f_score, ada_clf, ada_kappa)``: the mean
        10-fold accuracy, the mean 10-fold micro-F1, the fitted classifier
        and Cohen's kappa of the test predictions.
    """
    booster = AdaBoostClassifier(base_estimator=dt_clf)
    booster.set_params(
        base_estimator__criterion='gini',
        base_estimator__splitter='best',
        n_estimators=100,
    )

    # fit() returns the estimator itself, so ``fitted`` is ``booster``.
    fitted = booster.fit(X_train_res, y_train_res)
    test_predictions = fitted.predict(X_test)

    # Hold-out accuracy is computed but not part of the returned tuple.
    _holdout_accuracy = accuracy_score(y_test, test_predictions)
    kappa = cohen_kappa_score(y_test, test_predictions)

    # Two independent 10-fold runs, one per scoring metric.
    fold_accuracy = cross_val_score(booster, X_train_res, y_train_res,
                                    cv=10, scoring='accuracy')
    fold_f1 = cross_val_score(booster, X_train_res, y_train_res,
                              cv=10, scoring='f1_micro')

    return fold_accuracy.mean(), fold_f1.mean(), fitted, kappa
def load_architecture():
    """Rebuild the soft-voting AdaBoost + XGBoost ensemble from saved params.

    The parameter file names come from ``logger.config_dict`` (keys
    ``BEST_ADA_L`` and ``BEST_XGB_L``); each file is resolved through
    ``logger.get_model_file(..., "large")`` and parsed as JSON.

    Returns:
        An unfitted ``VotingClassifier`` combining both models with equal
        weights.
    """
    # AdaBoost member
    ada_file = logger.config_dict['BEST_ADA_L']
    logger.log("Loading params for ADA from {} ...".format(ada_file))
    with open(logger.get_model_file(ada_file, "large")) as handle:
        ada_params = json.load(handle)
    ada_model = AdaBoostClassifier(DecisionTreeClassifier())
    ada_model.set_params(**ada_params)

    # XGBoost member
    xgb_file = logger.config_dict['BEST_XGB_L']
    logger.log("Loading params for XGB from {} ...".format(xgb_file))
    with open(logger.get_model_file(xgb_file, "large")) as handle:
        xgb_params = json.load(handle)
    xgb_model = XGBClassifier()
    xgb_model.set_params(**xgb_params)

    members = [('ADA', ada_model), ('XGB', xgb_model)]
    comb_model = VotingClassifier(estimators=members,
                                  voting='soft',
                                  weights=[0.5, 0.5],
                                  n_jobs=-1)
    logger.log("Finish loading best architecture {}".format(comb_model))
    return comb_model
def heart(dataType):
    """Plot validation, learning and confusion charts for AdaBoost.

    Args:
        dataType: Identifier handed to ``data.createData`` and used in the
            chart title of the validation and learning curves.
    """
    package = data.createData(dataType)
    xTrain, yTrain = package.xTrain, package.yTrain
    xTest, yTest = package.xTest, package.yTest

    clf = AdaBoostClassifier()
    clf.set_params(algorithm='SAMME.R')

    # Curves are drawn on the unfitted estimator, swept over n_estimators.
    curve_title = '{0} Ada Boost'.format(dataType)
    plotter.plotValidationCurve(clf, xTrain, yTrain, 'n_estimators',
                                list(range(1, 160, 10)),
                                graphTitle=curve_title)
    plotter.plotLearningCurve(clf, title=curve_title,
                              xTrain=xTrain, yTrain=yTrain)

    # The confusion matrix needs a fitted model and uses its own title.
    clf.fit(xTrain, yTrain)
    class_names = ['Diameter narrowing ', 'Diameter not narrowing']
    plotter.plotConfusion(clf, 'Heart', class_names, xTest, yTest)
class CustomEstimator(BaseEstimator):
    """Estimator wrapper that trains and scores on module-level data.

    ``fit`` and ``score`` ignore the ``X``/``y`` they receive: training uses
    ``oversampled_global_X`` / ``oversampled_global_y`` and scoring uses rows
    of ``global_X`` / ``global_y`` selected through ``global_indices`` and
    ``global_i``.
    """

    def __init__(self, C = 1, penalty = 'l2'):
        self.C = C
        self.penalty = penalty
        booster = AdaBoost(n_estimators = 50)
        booster.set_params(C = C, penalty = penalty)
        self._model = booster

    def transform(self, X, y=None):
        """Return the score of the wrapped model (``X`` is forwarded to ``score``)."""
        return self.score(X)

    def predict(self, X):
        """Delegate prediction to the wrapped model."""
        return self._model.predict(X)

    def score(self, X, y=None):
        """Score the wrapped model on the rows selected for the current ``global_i``.

        NOTE(review): ``global_i`` is only reset to 0 here when it equals 4;
        it is presumably advanced elsewhere - confirm with the caller.
        """
        global global_i
        # Second entry of the current index pair selects the rows to score.
        selected_rows = global_indices[global_i][1]
        features = global_X.drop('target', axis = 1)
        result = self._model.score(features.ix[selected_rows],
                                   global_y.ix[selected_rows])
        if global_i == 4:
            global_i = 0
        return result

    def fit(self, X, y=None):
        """Fit the wrapped model on the module-level oversampled data."""
        self._model.fit(oversampled_global_X, oversampled_global_y)
        return self
def predict_classifier(name_dataset, name_train, classifier, name_test, metric):
    """Train the named classifier with its best saved params and store predictions.

    Args:
        name_dataset: Dataset name used to locate saved scores and to name
            the output file.
        name_train: Path of the svmlight-format training file.
        classifier: Key selecting the estimator (e.g. ``"ada_boost"``,
            ``"svm"``); see the table below for the accepted values.
        name_test: Path of the svmlight-format test file.
        metric: Metric name used to pick the best parameters and to name the
            output file.

    Raises:
        ValueError: If ``classifier`` is not one of the known keys.
    """
    # Lazy factories: only the requested estimator is ever constructed.
    factories = {
        "ada_boost": lambda: AdaBoostClassifier(
            random_state=42, base_estimator=ComplementNB(alpha=0.01)),
        "extra_tree": lambda: ExtraTreesClassifier(random_state=SEED),
        "knn": lambda: KNeighborsClassifier(),
        "logistic_regression": lambda: LogisticRegression(random_state=SEED),
        "naive_bayes": lambda: MultinomialNB(),
        "naive_bayes_complement": lambda: ComplementNB(),
        "passive_aggressive": lambda: PassiveAggressiveClassifier(
            random_state=SEED, max_iter=1000),
        "random_forest": lambda: RandomForestClassifier(random_state=SEED),
        "sgd": lambda: SGDClassifier(random_state=SEED, max_iter=1000),
        "svm": lambda: svm.LinearSVC(random_state=SEED, max_iter=1000),
    }
    if classifier not in factories:
        # Previously this surfaced much later as an unbound ``estimator``.
        raise ValueError("unknown classifier: {!r}".format(classifier))
    estimator = factories[classifier]()

    # Close both data files once parsed instead of leaking the handles.
    with open(name_train, 'rb') as train_file, \
            open(name_test, 'rb') as test_file:
        x_train, y_train, x_test, y_test = load_svmlight_files(
            [train_file, test_file])

    load_estimator = False
    if load_estimator:
        # NOTE(review): the loaded object is discarded, as in the original;
        # confirm whether it was meant to replace ``estimator``.
        joblib.load("escores/grid_" + name_dataset + "_" + classifier)
    else:
        # Known keys never contain a comma, so the single-classifier check
        # of the original always holds at this point.
        escores = cv.load_escores(name_dataset, classifier, 1)  # test score 0
        best_param_folds = cv.best_param_folds_no_frequency(
            escores, 0, metric)  # best score per fold
        estimator.set_params(**best_param_folds)

    estimator.fit(x_train, y_train)
    y_pred = estimator.predict(x_test)
    cv.save_dict_list([y_test], [y_pred],
                      'y_pred/' + name_dataset + "_" + classifier + "_" +
                      metric + "_" + cv.name_file(name_test))
class MyAdaboost(MyModel):
    """AdaBoost entry of the model collection built on ``MyModel``."""

    def __init__(self):
        super().__init__()
        self.name = "Adaboost"
        self.is_ensemble = True
        self.chinese_name = "自适应提升算法"
        self.english_name = "Adaptive Boosting"
        # Pin the random seed so every model instance is reproducible.
        booster = AdaBoostClassifier()
        booster.set_params(random_state=self.random_state)
        self.model = booster
def post_pruning_boosting_tree_performance():
    """Plot train/test accuracy of boosted post-pruned trees for 1..20 trees.

    The same AdaBoost instance is swept over the number of estimators on the
    breast-cancer split (``ccp_alpha=0.015``) and then on the spam split
    (``ccp_alpha=0.005``); the two accuracy curves are shown side by side.
    """
    tree_counts = list(range(1, 21))
    booster = AdaBoostClassifier(DecisionTreeClassifier(ccp_alpha=0.015),
                                 n_estimators=1)

    def sweep(train_x, train_y, test_x, test_y, **extra_params):
        # Refit for every ensemble size and record both accuracies.
        train_acc, test_acc = [], []
        for count in tree_counts:
            booster.set_params(n_estimators=count, **extra_params)
            booster.fit(train_x, train_y)
            train_acc.append(booster.score(train_x, train_y))
            test_acc.append(booster.score(test_x, test_y))
        return train_acc, test_acc

    def draw(position, train_acc, test_acc, heading):
        # One subplot with the train and test curves.
        plt.subplot(position)
        plt.plot(tree_counts, train_acc, label='train')
        plt.plot(tree_counts, test_acc, label='test')
        plt.xlabel('num of trees')
        plt.ylabel('accuracy')
        plt.title(heading)
        plt.legend(loc='upper right')

    cancer_train_acc, cancer_test_acc = sweep(
        *split_train_test_breast_cancer())
    # The spam sweep uses a weaker pruning strength on the shared base tree.
    spam_train_acc, spam_test_acc = sweep(
        *split_train_test_spam(), base_estimator__ccp_alpha=0.005)

    plt.figure(figsize=(10, 6))
    draw(121, cancer_train_acc, cancer_test_acc,
         'post-pruning boosting cancer classifer \nperformance vs number of boosting trees')
    draw(122, spam_train_acc, spam_test_acc,
         'post-pruning boosting spam classifer \nperformance vs number of boosting trees')
    plt.show()
def test_evaluation_function(self):
    """Refit the chosen model on the training data and print test predictions.

    Test features are read according to ``self.pre_train_chosen``. SVM and
    Naive Bayes are not run with word embeddings; a hint is printed instead.
    Only the Logistic Regression path also writes its predictions to the
    ``predicted-labels`` file.
    """
    # Load the test features with the same representation used for training.
    if self.pre_train_chosen == 'bag-of-words':
        X = self.read_data_bag_of_words_function(self.datafile_name_test)
    elif self.pre_train_chosen == 'word-embedding':
        X = self.read_data_embedding_function(self.datafile_name_test)
    else:
        X = []

    chosen = self.model_chosen
    uses_embedding = self.pre_train_chosen == 'word-embedding'

    if chosen == 'SVM':
        if uses_embedding:
            # SVM on dense embeddings is too slow; ask for another setup.
            print(
                "It's time consuming for svm model using word-embedding method, change another model"
            )
            return
        model = SVC()
    elif chosen == 'adaboost':
        base_tree = tree.DecisionTreeClassifier(random_state=11,
                                                max_features="auto",
                                                max_depth=None)
        model = AdaBoostClassifier(base_estimator=base_tree)
    elif chosen == 'Logistic Regression':
        model = LogisticRegression()
    elif chosen == 'Naive Bayes':
        if uses_embedding:
            # Multinomial NB cannot handle continuous embedding features.
            print(
                "not suitable for Naive Bayes classifier using word-embedding, because of the continuous feature"
            )
            return
        model = MultinomialNB()
    else:
        return

    # Shared tail: apply tuned hyper-parameters, refit, predict, print.
    model.set_params(**self.hyper_para)
    model.fit(self.X, self.y)
    y_pred = model.predict(X)
    print(y_pred)
    if chosen == 'Logistic Regression':
        numpy.savetxt('predicted-labels', y_pred, fmt='%d', delimiter=',')
def train_and_save_final_model(X, y, X_train, y_train, params,
                               save_model_file_path, test_data):
    """Fit the final AdaBoost model and pickle it to disk.

    Args:
        X: Full feature set, used when separate test data exists.
        y: Labels matching ``X``.
        X_train: Training split, used when no separate test data exists.
        y_train: Labels matching ``X_train``.
        params: Keyword parameters applied with ``set_params``.
        save_model_file_path: Path prefix; ``'adbc.sav'`` is appended to it.
        test_data: ``None`` to train on the training split only, anything
            else to train on the full data.
    """
    adbc = AdaBoostClassifier(random_state=0)
    adbc.set_params(**params)

    # Identity check: ``== None`` is element-wise (and ambiguous in an ``if``)
    # for arrays and data frames.
    if test_data is None:
        adbc.fit(X_train, y_train)
    else:
        adbc.fit(X, y)

    # Save the model; the context manager guarantees the file is closed.
    model_file_path = save_model_file_path + 'adbc.sav'
    with open(model_file_path, 'wb') as model_file:
        pickle.dump(adbc, model_file)
def getAdaBoostBDTClassifier(options=None):
    """Return the standard BDT classifier based on AdaBoost.

    Args:
        options: Optional mapping of estimator parameters applied on top of
            the defaults via ``set_params`` (nested names such as
            ``base_estimator__max_depth`` are allowed). ``None`` or an empty
            mapping keeps the defaults.

    Returns:
        An unfitted ``AdaBoostClassifier`` boosting depth-5 decision trees.
    """
    dt = DecisionTreeClassifier(criterion="gini",
                                max_depth=5,
                                min_samples_leaf=0.05,
                                random_state=0)
    bdt = AdaBoostClassifier(dt,
                             n_estimators=200,
                             learning_rate=0.13,
                             algorithm='SAMME',
                             random_state=0)
    # Forward the caller's overrides; the previous call passed a literal
    # ``options={}`` keyword, which is not a valid estimator parameter.
    if options:
        bdt.set_params(**options)
    return bdt
def classifier(self, scoring, cv, eval_using):
    """Tune, combine and evaluate a hard-voting ensemble for each scorer.

    For every entry of ``scoring`` the base classifiers and their bagging /
    boosting wrappers are tuned with ``param_tuner``, combined in a
    ``VotingClassifier``, fitted on the scaled training data and evaluated
    on the scaled test data.

    Args:
        scoring: Iterable of scoring names handed to ``param_tuner``.
        cv: Cross-validation setting handed to ``param_tuner``.
        eval_using: Positional arguments forwarded to ``report``.
    """
    adaclf = AdaBoostClassifier(algorithm='SAMME')

    # Standardise with statistics learned from the training data only, and
    # apply that same transform to the test data so both share one scale.
    scaler = StandardScaler().fit(self.xtr)
    xtr = scaler.transform(self.xtr)
    xte = scaler.transform(self.xte)

    # iterate over each grid score for param tuner
    for score in scoring:
        print('Tuning parameters of inital classifiers...')
        passive_params = param_tuner(PassiveAggressiveClassifier(),
                                     score=score, cv=cv,
                                     xtr=xtr, ytr=self.ytr)
        passclf = PassiveAggressiveClassifier().set_params(**passive_params)
        sgd_params = param_tuner(SGDClassifier(), score=score, cv=cv,
                                 xtr=xtr, ytr=self.ytr)
        sgdclf = SGDClassifier().set_params(**sgd_params)

        # cant use resampling/bagging with passive aggressive classifier
        # will raise ValueError: The number of class labels must be > 1
        # since resampling may results in training sets with 1 class.
        print('\n'+'Tuning meta-classifiers with tuned classifier/s...')
        bagsgd_params = param_tuner(BaggingClassifier(sgdclf), score=score,
                                    cv=cv, xtr=xtr, ytr=self.ytr)
        bg_sgdclf = BaggingClassifier(sgdclf).set_params(**bagsgd_params)
        # set_params returns the estimator itself, so ``ada_sgdclf`` is the
        # shared ``adaclf`` instance with the tuned values applied.
        adasgd_params = param_tuner(adaclf.set_params(base_estimator=sgdclf),
                                    score=score, cv=cv,
                                    xtr=xtr, ytr=self.ytr)
        ada_sgdclf = adaclf.set_params(**adasgd_params)

        print('Voting on meta-classifiers/classifiers then predicting...')
        vote = VotingClassifier(estimators=[('BagSGD', bg_sgdclf),
                                            ('adaboostSGD', ada_sgdclf),
                                            ('Passive', passclf)],
                                voting='hard').fit(xtr, self.ytr)

        start = time.time()
        y_true, y_pred = self.yte, vote.predict(xte)
        print('\n' + '-'*5, 'FINAL PREDICTION RESULTS', '-'*5 + '\n',
              '{0:.4f}'.format(time.time()-start)+'--prediction time(secs)')

        clf_evaluation = report(*eval_using, y_true=y_true, y_pred=y_pred)
        for reports in clf_evaluation:
            print('---', reports)
            print(clf_evaluation[reports])
def accuracy_vs_num_tree():
    """Plot train/test accuracy of boosted depth-3 trees for 1..100 trees.

    The same AdaBoost instance is swept over the number of estimators on the
    breast-cancer split and then on the spam split; the two accuracy curves
    are shown side by side.
    """
    tree_counts = list(range(1, 101))
    booster = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                                 n_estimators=1)

    def sweep(train_x, train_y, test_x, test_y):
        # Refit for every ensemble size and record both accuracies.
        train_acc, test_acc = [], []
        for count in tree_counts:
            booster.set_params(n_estimators=count)
            booster.fit(train_x, train_y)
            train_acc.append(booster.score(train_x, train_y))
            test_acc.append(booster.score(test_x, test_y))
        return train_acc, test_acc

    def draw(position, train_acc, test_acc, heading):
        # One subplot with the train and test curves.
        plt.subplot(position)
        plt.plot(tree_counts, train_acc, label='train')
        plt.plot(tree_counts, test_acc, label='test')
        plt.xlabel('num of trees')
        plt.ylabel('accuracy')
        plt.title(heading)
        plt.legend(loc='upper right')

    cancer_train_acc, cancer_test_acc = sweep(
        *split_train_test_breast_cancer())
    spam_train_acc, spam_test_acc = sweep(*split_train_test_spam())

    plt.figure(figsize=(10, 6))
    draw(121, cancer_train_acc, cancer_test_acc,
         'cancer accuracy vs number of boosting trees')
    draw(122, spam_train_acc, spam_test_acc,
         'spam accuracy vs number of boosting trees')
    plt.show()
def adbTuning(self, pX, change = 3):
    """Randomly search ``n_estimators`` for AdaBoost by cross-validated AUC.

    Args:
        pX: Feature matrix scored against ``self.y``.
        change: Number of random parameter draws to evaluate.

    Returns:
        Tuple ``(best_auc, model)``: the best mean ROC-AUC found and an
        unfitted ``AdaBoostClassifier`` built with the winning parameters.
        When no draw scores above 0 (or ``change`` is 0) the classifier is
        built with its defaults and ``best_auc`` is 0.
    """
    adb = AdaBoostClassifier()
    best_auc = 0
    best_param = None
    for _ in range(change):
        params = {
            'n_estimators': 3 + int(10 * np.random.random()),
            'random_state': 2016
        }
        adb.set_params(**params)
        auc = cross_val_score(adb, pX, self.y, scoring="roc_auc").mean()
        if auc > best_auc:
            best_auc = auc
            best_param = params
    # Single-argument call form prints identically on Python 2 and 3.
    print('adaboost ' + str(best_auc))
    # ``best_param`` stays None when nothing improved on the initial 0.
    return best_auc, AdaBoostClassifier(**(best_param or {}))
def _get_model(self, problem_transform=ClassifierChain):
    """Build the soft-voting AdaBoost + random-forest ensemble.

    Hyper-parameters are refreshed through ``_load_models_hyperparams``
    before the two members are configured.

    Args:
        problem_transform: Callable wrapping the voting classifier.

    Returns:
        ``problem_transform`` applied to the unfitted voting ensemble.
    """
    self._load_models_hyperparams()

    adaboost_model = AdaBoostClassifier(DecisionTreeClassifier())
    adaboost_model.set_params(**self.adab_hyperparams)
    randf_model = RandomForestClassifier()
    randf_model.set_params(**self.randf_hyperparams)

    members = [('ADA', adaboost_model), ('RANDF', randf_model)]
    voter = VotingClassifier(estimators=members,
                             voting='soft',
                             weights=[0.45, 0.55],
                             n_jobs=-1)
    return problem_transform(voter)
def runAdaBoostClassifier(x_train, y_train, x_test, y_test, p):
    """Fit AdaBoost with parameters ``p`` and report train/test accuracy.

    Args:
        x_train: Training features.
        y_train: Training labels.
        x_test: Test features.
        y_test: Test labels.
        p: Mapping of estimator parameters applied with ``set_params``.

    Returns:
        Tuple ``(train_score, test_score)`` of accuracy values. Progress
        messages are written to ``sys.stderr``.
    """
    clf = AdaBoostClassifier()
    clf.set_params(**p)
    clf.fit(x_train, y_train)

    # Predict the test set once; both messages below report the same score
    # (the original predicted twice and recomputed the identical value).
    dt_score = accuracy_score(y_test, clf.predict(x_test))
    print("adaboost classification accuracy on test data is " + str(dt_score),
          file=sys.stderr)
    print("accuracy score on test data: " + str(dt_score), file=sys.stderr)

    train_score = accuracy_score(y_train, clf.predict(x_train))
    print("accuracy score on training data: " + str(train_score),
          file=sys.stderr)
    return (train_score, dt_score)
# Report the best parameters found by the preceding search and keep them
# for configuring the booster below.
print('最优参数:', grid.best_params_)
bestclfParams = grid.best_params_

############################
######## Next, tune the hyper-parameters of the base learner
############################
# Unfortunately this currently has to be iterated by hand
from sklearn.model_selection import cross_val_score

mdl = DecisionTreeClassifier()
clf = AdaBoostClassifier(base_estimator=mdl, algorithm='SAMME', n_estimators=200, learning_rate=0.8, random_state=10)
clf.set_params(**bestclfParams)

# Define the tuning steps
dtcParams = {
    'max_depth': range(3, 14, 2),  # first round
    'min_samples_split': range(50, 201, 20),
    'min_samples_leaf': range(10, 60, 10)
}
# dtcParams_2 = {'max_features':np.linspace(0.5, 1.0,6)} # second round

# For each base-learner parameter, collect one single-entry dict per
# candidate value.
for var_name, var_vals in dtcParams.items():
    best_scores = []
    best_Params = []
    for val in var_vals:
        dv = {var_name: val}
        best_Params.append(dv)
import sys
from os import path
# Make the parent directory importable so the project packages resolve.
sys.path.append(path.dirname(path.dirname(path.abspath(__file__))))

from kaggle_io.extract_inputs import extract_training_data
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.externals import joblib
from CvModel import CvModel

Id, X, y = extract_training_data('data/kaggle_train_tf_idf.csv')

n_folds = 5

scaler = StandardScaler().fit(X)
ada = AdaBoostClassifier()

# The single-argument print(...) form behaves identically on Python 2 and 3,
# whereas the bare print statement is a syntax error on Python 3.
print('Training AdaBoost with n_estimators=10')
ada.set_params(n_estimators=10)
cv_ada = CvModel(n_folds, scaler, ada)
cv_ada.fit(X, y)
joblib.dump(cv_ada, 'ada1/1.pkl')

# The same ``ada`` instance is reconfigured; the first model has already
# been dumped at this point.
print('Training AdaBoost with n_estimators=50')
ada.set_params(n_estimators=50)
cv_ada = CvModel(n_folds, scaler, ada)
cv_ada.fit(X, y)
joblib.dump(cv_ada, 'ada1/2.pkl')
plt.plot(np.arange(1, len(Ein)+1), Eval, label="Validation") plt.title('AdaBoost estimators behaviour') plt.xlabel('Number of estimators') plt.ylabel('Score') plt.legend() plt.show() estimators = np.arange(MIN_ESTIMATORS, MAX_ESTIMATORS) clf = RandomizedSearchCV(estimator=ada, scoring='roc_auc', param_distributions=dict(n_estimators=estimators), n_jobs=-1, random_state=SEED, cv=kfold) model = clf.fit(X_train, y_train) best_number_estimators = model.best_estimator_.get_params()['n_estimators']""" best_number_estimators = 130 print("Best number of estimators for AdaBoost: ", best_number_estimators) ada.set_params(n_estimators=best_number_estimators) ada.fit(X_train, y_train) scoreModel(ada, X_train, y_train, 'Train') ################################################################################################################## # Random Forest (RF) ################################################################################################################## print("\nFinding optimal number of trees for Random Forest...") rf = RandomForestClassifier(n_jobs=-1, random_state=SEED, criterion='entropy', bootstrap=True, max_features='auto', class_weight='balanced',
def CreatesFeatsPipeline(pipe_name, init_params=None):
    """Load one of the pre-existing pipelines by name.

    Args:
        pipe_name: Key of the pipeline to build (``'cla_*'`` classification,
            ``'reg_*'`` regression, ``'vot_ADA'`` boosted trees).
        init_params: Optional mapping applied with ``set_params`` once the
            pipeline is built. ``'reg_FilterBank'`` additionally reads
            ``init_params['preproc__filters']`` to size its filter bank, so
            it cannot be built without it.

    Returns:
        The configured (unfitted) pipeline or estimator.

    Raises:
        AssertionError: If ``pipe_name`` is not recognized.
    """
    Pipeline = sklearn.pipeline.Pipeline

    if pipe_name == 'cla_ERP_TS_LR':
        # Xdawn covariances -> tangent space -> logistic regression
        pipeline = Pipeline([
            ('xdawn', pyriemann.estimation.XdawnCovariances()),
            ('TS', pyriemann.tangentspace.TangentSpace()),
            ('lr', sklearn.linear_model.LogisticRegression())
        ])
    elif pipe_name == 'cla_ERP_LR':
        # same as above, preceded by the epochs-to-signals step
        pipeline = Pipeline([
            ('preproc', Epochs2signals()),
            ('xdawn', pyriemann.estimation.XdawnCovariances()),
            ('TS', pyriemann.tangentspace.TangentSpace()),
            ('lr', sklearn.linear_model.LogisticRegression())
        ])
    elif pipe_name == 'cla_CSP_LR':
        pipeline = Pipeline([
            ("cov", pyriemann.estimation.Covariances(estimator='lwf')),
            ('CSP', pyriemann.spatialfilters.CSP(nfilter=12, log=False)),
            ('TS', pyriemann.tangentspace.TangentSpace()),
            ('lr', sklearn.linear_model.LogisticRegression(solver='lbfgs'))
        ])
    elif pipe_name == 'cla_CSP_MDM':
        pipeline = Pipeline([
            ("cov", pyriemann.estimation.Covariances(estimator='lwf')),
            ('CSP', pyriemann.spatialfilters.CSP(nfilter=8, log=False)),
            ('MDM', pyriemann.classification.MDM())
        ])
    elif pipe_name == 'cla_MDM':
        pipeline = Pipeline([
            ("cov", pyriemann.estimation.Covariances(estimator='lwf')),
            ('MDM', pyriemann.classification.MDM())
        ])
    elif pipe_name == 'reg_CSP':
        # CSP-filtered covariances in the tangent space (regression)
        pipeline = Pipeline([
            ("cov", pyriemann.estimation.Covariances(estimator='lwf')),
            ('CSP', pyriemann.spatialfilters.CSP(nfilter=12, log=False)),
            ('TS', pyriemann.tangentspace.TangentSpace()),
            ('LASSO', sklearn.linear_model.LassoCV())
        ])
    elif pipe_name == 'reg_ERP':
        # Xdawn covariances in the tangent space (regression)
        pipeline = Pipeline([
            ('xdawn', pyriemann.estimation.XdawnCovariances(
                estimator='lwf', xdawn_estimator='lwf')),
            ('TS', pyriemann.tangentspace.TangentSpace()),
            ('LASSO', sklearn.linear_model.LassoCV())
        ])
    elif pipe_name == 'reg_ERP_svr':
        # Xdawn covariances in the tangent space, grid-searched RBF SVR
        # (the final step keeps its historical 'LASSO' name)
        svr_search = sklearn.model_selection.GridSearchCV(
            sklearn.svm.SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1),
            cv=5,
            param_grid={
                "C": [1e0, 1e1, 1e2, 1e3],
                "gamma": np.logspace(-2, 2, 5)
            })
        pipeline = Pipeline([
            ('preproc', Epochs2signals()),
            ('xdawn', XdawnCovariancesRegression()),
            ('TS', pyriemann.tangentspace.TangentSpace()),
            ('LASSO', svr_search)
        ])
    elif pipe_name == 'reg_FilterBank':
        # One sub-pipeline per configured filter, merged by a FeatureUnion.
        f_list = range(len(init_params['preproc__filters']))
        pipFreqs = [
            ("freq" + str(freq),
             Pipeline([
                 ('CospSelector', CospSelector(f_list=[freq])),
                 ('Cov', pyriemann.estimation.Covariances(estimator='lwf')),
                 ('SPOC', pyriemann.spatialfilters.SPoC(nfilter=20,
                                                        log=False)),
                 ('cosp2Feats', Cosp2feats())
             ]))
            for freq in f_list
        ]
        union = sklearn.pipeline.FeatureUnion(pipFreqs)
        pipeline = Pipeline([
            ('preproc', Epochs2signals()),
            ('union', union),
            ('LASSO', sklearn.linear_model.LassoCV())
        ])
    elif pipe_name == 'reg_SPOC':
        pipeline = Pipeline([
            ('Cov', pyriemann.estimation.Covariances()),
            ('SPOC', pyriemann.spatialfilters.SPoC(log=False)),
            ('TS', pyriemann.tangentspace.TangentSpace()),
            ('LASSO', sklearn.linear_model.LassoCV())
        ])
    elif pipe_name == "vot_ADA":
        pipeline = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                                      algorithm="SAMME",
                                      n_estimators=200)
    else:
        print('no pipeline recognized')
        # Explicit raise instead of ``assert False``: assertions are removed
        # under ``python -O``, which let an unknown name fall through.
        raise AssertionError('no pipeline recognized')

    # initialize parameters of the pipeline
    if init_params is not None:
        pipeline.set_params(**init_params)
    else:
        print('CreatesFeatsPipeline: ' + pipe_name + ' not initialized!')
    return pipeline
class AdaBoost(object):
    """Parameter search and evaluation driver for ``AdaBoostClassifier``."""

    def __init__(self, dataset_x, dataset_y):
        self.dataset_x = dataset_x
        self.dataset_y = dataset_y
        self.clf = AdaBoostClassifier()
        self.best_parameter = {}

    def startAdaBoost(self):
        """Run the search step; the grid search is the active strategy."""
        print("------------------ AdaBoost Classifier -------------------")
        # Alternatives kept available: findBestParameters(), randomSearch()
        self.gridSearch()

    def findBestParameters(self):
        """
        Cross-validate a default classifier and print its accuracy
        :return:
        """
        self.clf = AdaBoostClassifier()
        scores = cross_val_score(self.clf, self.dataset_x, self.dataset_y,
                                 cv=10, scoring="accuracy")
        print(scores)
        print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    def test(self):
        """
        Evaluate a fresh classifier configured with the best parameters
        stored by randomSearch() or gridSearch()
        :return:
        """
        tuned = AdaBoostClassifier()
        tuned.set_params(**self.best_parameter)
        self.clf = tuned
        print("*** Test Result for AdaBoost ***")
        ModelEvaluation.evaluateModelWithCV(self.clf, self.dataset_x,
                                            self.dataset_y, cv=10)

    def randomSearch(self):
        """Randomized search over base estimators, size and learning rate."""
        search_space = {
            'base_estimator': [DecisionTreeClassifier(),
                               LogisticRegression(),
                               MultinomialNB()],
            'n_estimators': [50, 100, 150],
            'learning_rate': [0.5, 1.0, 1.5],
            'algorithm': ['SAMME']
        }
        self.best_parameter = SearchParameters.randomSearch(
            classifier=self.clf,
            parameters=search_space,
            cv=10,
            n_iter=30,
            train_x=self.dataset_x,
            train_y=self.dataset_y)

    def gridSearch(self):
        """Exhaustive search over size and learning rate with a tree base."""
        search_space = {
            'base_estimator': [DecisionTreeClassifier()],
            'n_estimators': [50, 100, 150],
            'learning_rate': [0.5, 1.0, 1.5],
            'algorithm': ['SAMME']
        }
        self.best_parameter = SearchParameters.gridSearch(
            classifier=self.clf,
            parameters=search_space,
            cv=10,
            train_x=self.dataset_x,
            train_y=self.dataset_y)
### The parameters below were selected with this grid search, which is left
### out of the normal run to keep training fast (as recommended by the
### reviewer):
# search = GridSearchCV(AdaBoostClassifier(DecisionTreeClassifier(random_state=42), random_state=42),
#                       grid, make_scorer(f1_score), cv=StratifiedKFold(labels), n_jobs=-1)
# search.fit(features, labels)
# clf = search.best_estimator_   # see search.best_score_ / search.best_params_
best_params = dict(
    n_estimators=4,
    base_estimator__criterion='gini',
    base_estimator__max_depth=3,
    base_estimator__min_samples_leaf=11,
)
# set_params returns the estimator itself, so it can be chained directly.
clf = AdaBoostClassifier(DecisionTreeClassifier(random_state=42),
                         random_state=42).set_params(**best_params)

## Task 6: Dump your classifier, dataset, and features_list so anyone can
## check your results. You do not need to change anything below, but make sure
## that the version of poi_id.py that you submit can be run on its own and
## generates the necessary .pkl files for validating your results.
dump_classifier_and_data(clf, my_dataset, features_list)
def without_penalty(X, y):
    """Fit an AdaBoost classifier configured from the module-level ``params``.

    Args:
        X: Training features.
        y: Training labels.

    Returns:
        The fitted classifier.
    """
    # set_params() and fit() both return the estimator, so they chain.
    return AdaBoostClassifier().set_params(**params).fit(X, y)
def boosting(X, y, split_amount, plot=True, X_test=None, y_test=None):
    """Tune, evaluate and optionally plot an AdaBoost classifier.

    The number of estimators is chosen from a 5-fold validation curve, a
    learning curve is computed for the chosen size, and the final model is
    fitted and passed to ``measure_performance``.

    Args:
        X: Features; split into train/test unless test data is supplied.
        y: Labels matching ``X``.
        split_amount: Test fraction used when the data is split here.
        plot: When true, draw the validation and learning curves.
        X_test: Optional pre-made test features.
        y_test: Optional pre-made test labels.
    """
    training_amount = 1 - split_amount
    if X_test is None and y_test is None:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=split_amount, train_size=training_amount,
            shuffle=True)
    else:
        # Caller supplied the test data: train on everything in X / y.
        X_train, y_train = X, y

    def fold_stats(fold_scores):
        # Mean and spread across the cross-validation folds.
        return np.mean(fold_scores, axis=1), np.std(fold_scores, axis=1)

    boost_classifier = AdaBoostClassifier(algorithm='SAMME.R', n_estimators=60)

    # --- validation curve over the ensemble size ---
    estimators_range = range(50, 150, 5)
    train_scores, test_scores = validation_curve(
        boost_classifier, X_train, y_train, param_name='n_estimators',
        param_range=estimators_range, cv=5, n_jobs=1)
    train_scores_mean, train_scores_std = fold_stats(train_scores)
    test_scores_mean, test_scores_std = fold_stats(test_scores)

    # First position holding the highest mean validation score.
    best_position = list(test_scores_mean).index(max(test_scores_mean))
    best_num_estimators = estimators_range[best_position]
    boost_classifier.set_params(n_estimators=best_num_estimators)

    # --- learning curve for the chosen size ---
    train_sizes, train_scores_learn, test_scores_learn = learning_curve(
        boost_classifier, X_train, y_train,
        train_sizes=np.linspace(.1, 1.0, 5), cv=5)
    train_scores_learn_mean, train_scores_learn_std = fold_stats(
        train_scores_learn)
    test_scores_learn_mean, test_scores_learn_std = fold_stats(
        test_scores_learn)

    boost_classifier.fit(X_train, y_train)
    measure_performance(X_test, y_test, boost_classifier)

    if not plot:
        return

    lw = 2
    plt.figure()
    plt.grid()
    plt.title("Boosting Validation Curve")
    plt.plot(estimators_range, train_scores_mean,
             label='training_score', color='darkorange')
    plt.fill_between(estimators_range,
                     train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.2, color="darkorange", lw=lw)
    plt.plot(estimators_range, test_scores_mean,
             label='cross_validation_score', color='navy')
    plt.fill_between(estimators_range,
                     test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std,
                     alpha=0.2, color="navy", lw=lw)
    plt.legend()
    plt.xlabel('Number of Estimators')
    plt.ylabel('Score')

    title = ("Boosting Learning Curve (n_estimators = "
             + str(best_num_estimators) + " )")
    plt.figure(2)
    plt.grid()
    plt.title(title)
    plt.fill_between(train_sizes,
                     train_scores_learn_mean - train_scores_learn_std,
                     train_scores_learn_mean + train_scores_learn_std,
                     alpha=0.1, color="r")
    plt.fill_between(train_sizes,
                     test_scores_learn_mean - test_scores_learn_std,
                     test_scores_learn_mean + test_scores_learn_std,
                     alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_learn_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_learn_mean, 'o-', color="g",
             label="Test score")
    plt.xlabel('Training Sizes')
    plt.ylabel('Score')
    plt.legend()
    plt.show()
def train_BTree(filename, X_train, X_test, y_train, y_test, full_param=False, debug=False, numFolds=10, njobs=-1, scalar=1, make_graphs=False, pBTree={}):
    """Train a boosted decision tree and score it with weighted one-vs-rest ROC-AUC.

    When ``pBTree`` is empty a grid search selects the parameters (and its
    results are saved through ``util.save_gridsearch_to_csv``); otherwise
    ``pBTree`` is applied directly. The model is then fitted, scored on the
    train and test sets, and, when requested, a series of validation and
    learning curves is produced through ``util``.

    Args:
        filename: Name whose last four characters are stripped
            (``filename[:-4]``) before being passed to the ``util`` helpers.
        X_train, X_test: Train / test features.
        y_train, y_test: Train / test labels.
        full_param: Use the larger parameter grid for the search.
        debug: Passed as ``verbose`` to the grid search and as ``debug`` to
            the ``util`` helpers.
        numFolds: Number of cross-validation folds for the grid search.
        njobs: Parallel job count for the grid search and ``util`` helpers.
        scalar: Forwarded to ``util.save_gridsearch_to_csv``.
        make_graphs: When true, generate the curves at the end.
        pBTree: Pre-selected estimator parameters; empty triggers the search.
            NOTE(review): mutable default; it is only read here.

    Returns:
        Tuple ``(elapsed, train_score, test_score)`` with both scores rounded
        to 4 decimals. NOTE(review): ``start`` is reset several times, so
        ``elapsed`` is measured from just before the test-set scoring (and
        includes graph generation), not from the start of training.
    """
    np.random.seed(1)
    start = time.time()
    algo = 'Boosted Tree'

    if len(pBTree) == 0:
        # No parameters supplied: pick them with a grid search.
        if full_param:
            param_grid = [{'base_estimator__criterion' : ['gini', 'entropy'],
                           'base_estimator__max_depth' : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 50, 100],
                           # 'base_estimator__min_samples_split': [2, 3, 5, 6, 8, 10],
                           # 'base_estimator__min_samples_leaf' : [1, 2, 3, 5, 6, 8, 10],
                           # 'base_estimator__max_features' : [0.9, 1.0], # 0.1, 0.3, 0.5,
                           'base_estimator__max_leaf_nodes': [10, 100], # 2, 4, 5, 7,
                           'base_estimator__ccp_alpha' : [0.0, 0.005, 0.01], # 0.015, 0.02, 0.025, 0.030, 0.035, 0.04],
                           "base_estimator__splitter" : ["best"], # "random"],
                           "n_estimators" : [1, 50, 100, 150, 200, 250, 300],
                           "learning_rate" : [0.1, 0.5, 1],
                           'random_state' : [1]
                           }]
        else:
            param_grid = [{'base_estimator__criterion': ['gini', 'entropy'],
                           'base_estimator__max_depth': [3, 5, 7, 10],
                           'base_estimator__ccp_alpha': [0.0, 0.005, 0.01, 0.035],
                           # 'base_estimator__min_samples_split': [3, 5, 7, 10],
                           # 'base_estimator__ccp_alpha' : [0.0, 0.005, 0.015, 0.025, 0.35, 0.04],
                           "n_estimators" : [1, 50, 100, 150],
                           # "learning_rate" : [0.1, 0.5, 1],
                           'random_state' : [1]
                           }]

        DTC = DecisionTreeClassifier(random_state=11)
        adaTree = AdaBoostClassifier(base_estimator=DTC)

        # run grid search
        grid_search = GridSearchCV(adaTree, param_grid=param_grid, cv=numFolds, scoring='roc_auc_ovr_weighted', return_train_score=True, n_jobs=njobs, verbose=debug)
        grid_search.fit(X_train, y_train)

        cvres = grid_search.cv_results_
        best_params = grid_search.best_params_

        util.save_gridsearch_to_csv(cvres, algo, filename[:-4], scalar)

        # Fresh booster configured with the winning parameters.
        btree_classifier = AdaBoostClassifier(base_estimator=DTC)
        btree_classifier.set_params(**best_params)
    else:
        # Parameters supplied by the caller: skip the search.
        DTC = DecisionTreeClassifier()
        btree_classifier = AdaBoostClassifier(base_estimator=DTC)
        btree_classifier.set_params(**pBTree)

    # Time the final fit on its own (the timer is restarted here).
    start = time.time()
    btree_classifier.fit(X_train, y_train)
    print('BTree Fit Time: ', time.time() - start)

    start = time.time()
    y_prob = btree_classifier.predict_proba(X_train)
    train_score = roc_auc_score(y_train, y_prob, multi_class="ovr", average="weighted")
    print('BTree Train Score Time: ', time.time() - start)

    start = time.time()
    y_prob = btree_classifier.predict_proba(X_test)
    test_score = roc_auc_score(y_test, y_prob, multi_class="ovr", average="weighted")
    print('BTree Test Score Time: ', time.time() - start)

    # Unfitted booster carrying only the caller-supplied parameters; it is
    # handed to the curve helpers alongside the fitted model.
    DTC = DecisionTreeClassifier()
    test_class = AdaBoostClassifier(base_estimator=DTC)
    test_class.set_params(**pBTree)

    if make_graphs:
        util.boost_lr_vs_nest(X_train, y_train, pBTree, njobs, filename[:-4], train_score)
        util.compute_vc(algo, 'n_estimators', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 1000], X_train, y_train, X_test, y_test, btree_classifier, filename[:-4], test_class, pBTree, log=True, njobs=njobs, debug=debug, extraText='log')

        util.plot_learning_curve(btree_classifier, algo, filename[:-4], X_train, y_train, ylim=(0.0, 1.05), cv=10, n_jobs=njobs, debug=debug)
        util.compute_vc(algo, 'base_estimator__max_depth', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 30, 40, 50, 60, 70, 80, 90, 100], X_train, y_train, X_test, y_test, btree_classifier, filename[:-4], test_class, pBTree, log=True, njobs=njobs, debug=debug)

        util.compute_vc(algo, 'base_estimator__max_leaf_nodes', [2, 3, 4, 5, 6, 7, 8, 9, 10, 25, 50, 75, 100, 200, 500, 1000, 10000], X_train, y_train, X_test, y_test, btree_classifier, filename[:-4], test_class, pBTree, log=True, njobs=njobs)

        # compute model complexity / validation curves
        util.compute_vc(algo, 'base_estimator__criterion', ['gini', 'entropy'], X_train, y_train, X_test, y_test, btree_classifier, filename[:-4], test_class, pBTree, log=False, njobs=njobs)

        util.compute_vc(algo, 'n_estimators', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 1000], X_train, y_train, X_test, y_test, btree_classifier, filename[:-4], test_class, pBTree, log=False, njobs=njobs, debug=debug)
        # NOTE(review): same arguments as the first n_estimators call above.
        util.compute_vc(algo, 'n_estimators', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 1000], X_train, y_train, X_test, y_test, btree_classifier, filename[:-4], test_class, pBTree, log=True, njobs=njobs, debug=debug, extraText='log')
        util.compute_vc(algo, 'learning_rate', [0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1], X_train, y_train, X_test, y_test, btree_classifier, filename[:-4], test_class, pBTree, log=True, njobs=njobs, debug=debug)

        util.compute_vc(algo, 'base_estimator__ccp_alpha', [0.000001, 0.00001, 0.00002, 0.00003, 0.00004, 0.00005, 0.00006, 0.00007, 0.00008, 0.00009, 0.0001, 0.00011, 0.00012, 0.00013, 0.00014, 0.00015, 0.00016, 0.00017, 0.00018, 0.00019, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008, 0.0009, 0.001, 0.01, 0.1, 1], X_train, y_train, X_test, y_test, btree_classifier, filename[:-4], test_class, pBTree, log=True, njobs=njobs)
        util.compute_vc(algo, 'base_estimator__min_samples_split', [2, 3, 5, 6, 8, 10], X_train, y_train, X_test, y_test, btree_classifier, filename[:-4], test_class, pBTree, log=False, njobs=njobs)
        util.compute_vc(algo, 'base_estimator__min_samples_leaf', [1, 2, 3, 5, 6, 8, 10, 25, 50, 75, 100, 250, 500, 750, 1000], X_train, y_train, X_test, y_test, btree_classifier, filename[:-4], test_class, pBTree, log=True, njobs=njobs)
        util.compute_vc(algo, 'base_estimator__max_features', [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.8, 0.9, 0.99999, 1.0], X_train, y_train, X_test, y_test, btree_classifier, filename[:-4], test_class, pBTree, log=False, njobs=njobs)
        util.compute_vc(algo, 'base_estimator__splitter', ["best", "random"], X_train, y_train, X_test, y_test, btree_classifier, filename[:-4], test_class, pBTree, log=False, njobs=njobs)

    return time.time() - start, round(train_score, 4), round(test_score, 4)
class AdaBoost(Classifier):
    r"""AdaBoost classifier component wrapping ``AdaBoostClassifier``.

    Date:
        2020

    Author:
        Luka Pečnik

    License:
        MIT

    Reference:
        Y. Freund, R. Schapire, “A Decision-Theoretic Generalization of on-Line Learning and an Application to Boosting”, 1995.

    Documentation:
        https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html

    See Also:
        * :class:`niaaml.classifiers.Classifier`
    """
    Name = 'AdaBoost'

    def __init__(self, **kwargs):
        r"""Create the wrapped estimator and declare its tunable parameters.

        Keyword arguments are accepted but not read by this constructor.
        """
        # Install an "ignore" filter for every warning category listed here,
        # in the same order the filters were originally registered.
        silenced_categories = (
            ChangedBehaviorWarning,
            ConvergenceWarning,
            DataConversionWarning,
            DataDimensionalityWarning,
            EfficiencyWarning,
            FitFailedWarning,
            NonBLASDotWarning,
            UndefinedMetricWarning,
        )
        for category in silenced_categories:
            warnings.filterwarnings(action='ignore', category=category)

        # Search space: n_estimators bounded by MinMax(10, 111) as np.uint,
        # algorithm chosen from the two listed variants.
        self._params = {
            'n_estimators': ParameterDefinition(MinMax(min=10, max=111),
                                                np.uint),
            'algorithm': ParameterDefinition(['SAMME', 'SAMME.R']),
        }
        self.__ada_boost = AdaBoostClassifier()

    def set_parameters(self, **kwargs):
        r"""Forward the given keyword arguments to the estimator's ``set_params``.
        """
        self.__ada_boost.set_params(**kwargs)

    def fit(self, x, y, **kwargs):
        r"""Train the wrapped estimator.

        Arguments:
            x (pandas.core.frame.DataFrame): n samples to classify.
            y (pandas.core.series.Series): n classes of the samples in the x array.
        """
        self.__ada_boost.fit(x, y)

    def predict(self, x, **kwargs):
        r"""Return the wrapped estimator's prediction for every row of x.

        Arguments:
            x (pandas.core.frame.DataFrame): n samples to classify.

        Returns:
            pandas.core.series.Series: n predicted classes.
        """
        predictions = self.__ada_boost.predict(x)
        return predictions

    def to_string(self):
        r"""Render a user friendly description of this component.

        Returns:
            str: The base class template filled with this component's name
            and the wrapped estimator's current parameters.
        """
        template = Classifier.to_string(self)
        rendered_args = self._parameters_to_string(
            self.__ada_boost.get_params())
        return template.format(name=self.Name, args=rendered_args)
#!/usr/bin/python2.7
# Evaluate an AdaBoost classifier with k-fold cross-validation.
#
# Usage: <script> SAMPLES.npy [FOLDS]
#   SAMPLES.npy  array of samples, loaded with numpy.load
#   FOLDS        optional number of folds (defaults to 10)
# The labels are always read from "labels.npy" in the working directory.
from numpy import average, logspace, load
import sys
from sklearn.ensemble import AdaBoostClassifier
from ML import kfold as kf, GridSearch as gs

# load data
samples = load(sys.argv[1])
labels = load("labels.npy")

# The fold count is optional: indexing sys.argv[2] directly raised IndexError
# when it was omitted, so check the length before reading it.
folds = int(sys.argv[2]) if len(sys.argv) > 2 and sys.argv[2] else 10

clf = AdaBoostClassifier()

estimators_range = range(5, 110, 20)
param_grid = dict(n_estimators=estimators_range)

# Grid search is currently disabled; n_estimators is fixed at 100 instead of
# being taken from best_param['n_estimators'].
# gridsearh = gs(clf, param_grid, samples, labels)
# best_param = gridsearh.search()
clf.set_params(n_estimators=100)
#print best_param['n_estimators']

kfold = kf(clf, samples, labels, folds)
kfold.fit()

# Report the accuracy of every fold, followed by their average.
accuracies = [score.get_accuracy() for score in kfold.results['scores']]
for accuracy in accuracies:
    print(accuracy)
print(average(accuracies))
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Hyper-parameters applied to the boosted trees below; the
# "base_estimator__" prefix routes a value to the inner decision tree.
best_params = {
    'n_estimators': 4,
    'base_estimator__criterion': 'gini',
    'base_estimator__max_depth': 3,
    'base_estimator__min_samples_leaf': 11
}
# Both the tree and the booster are seeded with 42.
clf = AdaBoostClassifier(DecisionTreeClassifier(random_state=42),
                         random_state=42)
clf.set_params(**best_params)

# Example starting point. Try investigating other evaluation techniques!
# sklearn.cross_validation was removed in scikit-learn 0.20; prefer its
# replacement and fall back only on installations that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
# accuracy_score is documented as (y_true, y_pred); the printed value is the
# same either way because accuracy is symmetric.
print(accuracy_score(labels_test, pred))

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
train_labels, problem_name, plot_dir) param_dist = { 'n_estimators': [25, 50, 75, 100, 150, 200, 300], 'learning_rate': stats.uniform(0.75, 0.25) } scoring_metric = 'f1' opt_param_set_from_random_search = perf_random_search_for_best_hyper_params( clf, train_features, train_labels, scoring_metric, param_dist, n_iter_search=20, n_jobs=4, cv=5) clf.set_params(**opt_param_set_from_random_search) plot_learning_curves_helper(clf, train_features, train_labels, scoring_metric, plot_dir, problem_name) clf.fit(train_features, train_labels.values.ravel()) plot_opt_model_perf(clf, test_features, test_labels, [0, 1], problem_name, plot_dir) store_model(clf, model_path)
# 10-fold cross-validated grid search for bdt_real (reported as SAMME.R).
# Only the best parameters and score are printed; they are not applied.
grid_search = GridSearchCV(bdt_real, param_grid=param_grid, cv=10)
grid_search.fit(X_train, y_train)
print('Best parameters of Adaboost SAMME.R: {}'.format(
    grid_search.best_params_))
print('Best scrore of Adaboost SAMME.R: {}'.format(grid_search.best_score_))

# Same search for bdt_discrete (reported as SAMME).
grid_search = GridSearchCV(bdt_discrete, param_grid=param_grid, cv=10)
grid_search.fit(X_train, y_train)
print('Best parameters of Adaboost SAMME: {}'.format(
    grid_search.best_params_))
print('Best scrore of Adaboost SAMME: {}'.format(grid_search.best_score_))

# Refit both models with one boosting round per training sample.
num_estimators = X_train.shape[0]
bdt_real.set_params(n_estimators=num_estimators)
bdt_discrete.set_params(n_estimators=num_estimators)

bdt_real.fit(X_train, y_train)
bdt_discrete.fit(X_train, y_train)

real_test_errors = []
discrete_test_errors = []

ypred_r = bdt_real.predict(X_test)
ypred_e = bdt_discrete.predict(X_test)

# Score against the true test labels. Scoring a model against its own
# predictions (as was done before) always reports an accuracy of 1.0.
# NOTE(review): assumes the held-out labels are available as y_test.
print('Accuracy of SAMME.R: {} '.format(bdt_real.score(X_test, y_test)))
print('Accuracy of SAMME: {}'.format(bdt_discrete.score(X_test, y_test)))

print("--- %s seconds ---" % (time.time() - start_time))
# The performance of the model on new, held-out data is now measured on the
# **test set**:

# In[16]:


#test_score = svc.score(X_test, y_test)
test_score = abc.score(X_test_scaled, y_test)
print('test_score')
print(test_score)
print('abc')
print(abc)

# Replace the base estimator with a depth-5 DC and fit again.
params = {'base_estimator': DC(max_depth=5)}

print('changing base estimator')
abc.set_params(**params)
#abc.base_estimator = DC(max_depth=5, min_samples_leaf=0.1*len(X_train))
abc.fit(X_train_scaled, y_train)
print('new train score')
new_train_score = abc.score(X_train_scaled, y_train)
print(new_train_score)


# This score is clearly not as good as expected! The model cannot generalize
# so well to new, unseen data.
#
# - Whenever the **test** data score is **not as good as** the **train** score
#   the model is **overfitting**
#
# - Whenever the **train score is not close to 100%** accuracy the model is
#   **underfitting**
#
# Ideally **we want to neither overfit nor underfit**:
# `test_score ~= train_score ~= 1.0`.

# The previous example failed to generalize well to test data because we
# naively used the default parameters of the `SVC` class:

# In[17]:
"base_estimator__max_depth": [None, 3, 5, 8, 10], "base_estimator__min_samples_leaf": [1, 2, 3, 5, 8], "learning_rate": [0.01, 0.1, 0.5, 0.8, 1.0], } searcher = GridSearchCV(adabst, param_grid, f2_score, n_jobs=2, verbose=1, cv=StratifiedKFold(labels, 10)) searcher.fit(features, labels) # Apply tuned parameters to the model adabst.set_params(**searcher.best_params_) else: # Result I got when I ran the above searching adabst.set_params(base_estimator__max_features="sqrt", base_estimator__min_samples_leaf=1, base_estimator__max_depth=3, learning_rate=0.01) sys.stdout.write("Done\n") ### Task 6: Dump your classifier, dataset, and features_list so anyone can ### check your results. You do not need to change anything below, but make sure ### that the version of poi_id.py that you submit can be run on its own and ### generates the necessary .pkl files for validating your results.
# Column labels applied to the score rows returned by KFold_Mul_Experiment.
_ADA_SCORE_COLUMNS = [
    'accuracy', 'average_precision', 'f1', 'roc_auc', 'RMSE', 'MXE',
    'APaccuracy', 'BEP_score'
]
# Metrics summarised with argmax/max, and those summarised with argmin/min.
_ADA_MAXIMISED_COLUMNS = [
    'accuracy', 'average_precision', 'f1', 'roc_auc', 'APaccuracy',
    'BEP_score'
]
_ADA_MINIMISED_COLUMNS = ['RMSE', 'MXE']


def _ada_mean_per_metric(raw_scores):
    """Return the per-metric mean of one experiment's score rows as a list."""
    return pd.DataFrame(raw_scores, columns=_ADA_SCORE_COLUMNS).mean().tolist()


def _ada_summarise(mean_rows, row_labels, summary_label, pick_high, pick_low):
    """Build the per-configuration score table and add one summary row."""
    table = pd.DataFrame(mean_rows,
                         columns=_ADA_SCORE_COLUMNS,
                         index=row_labels)

    # .copy() makes the column subsets independent frames, so adding the
    # summary row does not trigger pandas' SettingWithCopyWarning.
    maximised = table[_ADA_MAXIMISED_COLUMNS].copy()
    maximised.loc[summary_label] = maximised.apply(pick_high)

    minimised = table[_ADA_MINIMISED_COLUMNS].copy()
    minimised.loc[summary_label] = minimised.apply(pick_low)

    return pd.merge(maximised,
                    minimised,
                    left_index=True,
                    right_index=True,
                    how='outer')


def Adaboost_Classifier_Mul(X_raw_train, y_raw_train, X_test, y_test,
                            Cali_method):
    """Evaluate AdaBoost over a grid of base estimators and hyper-parameters.

    For every combination of base estimator (Gaussian naive Bayes, decision
    tree), ``n_estimators`` (20 to 100 in steps of 10) and ``learning_rate``
    (0.2 to 1 in steps of 0.2), an ``AdaBoostClassifier(algorithm='SAMME')``
    is configured and evaluated by ``KFold_Mul_Experiment`` with a 5-split
    ``KFold``. The scores of each run are averaged per metric, giving one
    table row per combination.

    Args:
        X_raw_train: Training features, forwarded to ``KFold_Mul_Experiment``.
        y_raw_train: Training labels, forwarded to ``KFold_Mul_Experiment``.
        X_test: Test features, forwarded to ``KFold_Mul_Experiment``.
        y_test: Test labels, forwarded to ``KFold_Mul_Experiment``.
        Cali_method: Forwarded unchanged to ``KFold_Mul_Experiment``.

    Returns:
        tuple: ``(mean_cv_scores_df, mean_final_scores_df)``. Both frames are
        indexed by a description of the combination. The first has an extra
        ``'best'`` row holding each column's ``argmax`` (``argmin`` for RMSE
        and MXE). The second has an extra ``'OPT-SEL'`` row holding each
        column's ``max`` (``min`` for RMSE and MXE).
    """
    index_name = []
    mean_cv_rows = []
    mean_final_rows = []

    # Use five rounds of 5-fold cross-validation.
    # NOTE(review): only the 5-split KFold is created here; any repetition
    # presumably happens inside KFold_Mul_Experiment - confirm.
    kf = KFold(n_splits=5)

    dt = DecisionTreeClassifier()
    gnb = GaussianNB()
    base_estimators = [{
        'name': 'Naive_Bayes',
        'estimator': gnb
    }, {
        'name': 'DecisionTree',
        'estimator': dt
    }]
    n_estimator = [20, 30, 40, 50, 60, 70, 80, 90, 100]
    learning_rates = [0.2, 0.4, 0.6, 0.8, 1]

    for b in base_estimators:
        for n in n_estimator:
            for r in learning_rates:
                ada = AdaBoostClassifier(algorithm='SAMME')
                ada.set_params(base_estimator=b['estimator'],
                               n_estimators=n,
                               learning_rate=r)
                name = 'base_estimator=' + b['name'] + ' n_estimators=' + str(
                    n) + ' learning_rates=' + str(r)
                index_name.append(name)

                # Evaluate the configured booster. Previously the bare base
                # estimator was passed, so `ada` was dead code and every
                # (n_estimators, learning_rate) cell scored the same model.
                cv_score, final_score = KFold_Mul_Experiment(
                    ada, kf, X_raw_train, y_raw_train, X_test, y_test,
                    Cali_method)

                # Rows are collected in lists and turned into frames once at
                # the end; DataFrame.append was removed in pandas 2.0.
                mean_cv_rows.append(_ada_mean_per_metric(cv_score))
                mean_final_rows.append(_ada_mean_per_metric(final_score))

    # cv
    mean_cv_scores_df = _ada_summarise(mean_cv_rows, index_name, 'best',
                                       lambda x: x.argmax(),
                                       lambda x: x.argmin())
    # final
    mean_final_scores_df = _ada_summarise(mean_final_rows, index_name,
                                          'OPT-SEL', lambda x: x.max(),
                                          lambda x: x.min())

    return mean_cv_scores_df, mean_final_scores_df
print 'Best scrore of Adaboost SAMME.R:', grid_search.best_score_ pdb.set_trace() grid_search = GridSearchCV(bdt_discrete, param_grid=param_grid, cv=10) grid_search.fit(X_train, y_train) print 'Best parameters of Adaboost SAMME:' , grid_search.best_params_ print 'Best scrore of Adaboost SAMME:', grid_search.best_score_ pdb.set_trace() ''' # Train on the training data set num_estimators = 600; bdt_real.set_params(n_estimators=num_estimators) bdt_discrete.set_params(n_estimators=num_estimators) bdt_real.fit(X_train, y_train) bdt_discrete.fit(X_train, y_train) real_test_errors = [] discrete_test_errors = [] # Test on the testing data set and display the accuracies ypred_r = bdt_real.predict(X_test) ypred_e = bdt_discrete.predict(X_test) print 'Accuracy of SAMME.R = ', accuracy_score(ypred_r, y_test) print 'Accuracy of SAMME = ', accuracy_score(ypred_e, y_test) # Plot the relationship between error rates and number of trees
clf.fit(X, y) #applying grid search to find the best model from sklearn.model_selection import GridSearchCV parameters = [{ 'n_estimators': [100, 200, 500, 1000], 'learning_rate': [0.01, 0.1, 0.2, 0.5] }] grid_search = GridSearchCV(estimator=clf, param_grid=parameters, scoring='accuracy', cv=2) grid_search = grid_search.fit(X, y) best_accuracy = grid_search.best_score_ best_parameters = grid_search.best_params_ clf.set_params(**best_parameters) # refit clf.fit(X, y) origin_size = True keep_top_k = 750 video_capture = cv2.VideoCapture(0) process_this_frame = True while True: # Grab a single frame of video ret, frame = video_capture.read() # Only process every other frame of video to save time if process_this_frame: # get the coordinate of the points
pst_classifier.score(data_test, label_test)


# ### Ada Boost Classifier

# In[145]:


from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import mean_squared_error

ada_classifier = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=4),
    learning_rate=0.70,
    n_estimators=120,
    random_state=49,
    algorithm="SAMME.R")
ada_classifier.fit(data_train, label_train)
print(ada_classifier.score(data_test, label_test))

ada_classifier.set_params(n_estimators=120)
# One error value per boosting stage of the fitted ensemble.
errors = [mean_squared_error(label_test, y_pred)
          for y_pred in ada_classifier.staged_predict(data_test)]
# staged_predict yields the prediction after 1, 2, ... estimators, so the
# 0-based position of the smallest error is one less than the estimator
# count. Without the "+ 1" the refit used one estimator too few, and raised
# for n_estimators=0 whenever the first stage was already the best.
bst_n_estimators = int(np.argmin(errors)) + 1
print('bst_n_estimators',bst_n_estimators)

# Refit from scratch with the selected number of estimators.
ada_best = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=4),
    learning_rate=0.70,
    n_estimators=bst_n_estimators,
    random_state=49,
    algorithm="SAMME.R")
ada_best.fit(data_train,label_train)
ada_best.score(data_test,label_test)


# ### Gradient Boost Classifier

# In[146]:


from sklearn.ensemble import GradientBoostingClassifier