class Ensemble:
    """Majority-vote ensemble of four scikit-learn classifiers.

    Building an instance fits every member on the training split of
    ``data`` and stores the voted test-split labels in ``self.pred``.
    """

    def __init__(self, data):
        self.rf = RandomForestClassifier(
            n_estimators=80,
            n_jobs=-1,
            min_samples_split=45,
            criterion='entropy')
        self.lda = LDA()
        self.dec = DecisionTreeClassifier(criterion='entropy')
        self.ada = AdaBoostClassifier(n_estimators=500, learning_rate=0.25)
        self.make_prediction(data)

    def make_prediction(self, data):
        ''' Make an ensemble prediction '''
        # Fit order is kept as rf, lda, dec, ada.
        for member in (self.rf, self.lda, self.dec, self.ada):
            member.fit(data.features_train, data.labels_train)

        self.pred = []
        ada_votes = self.ada.predict(data.features_test)
        rf_votes = self.rf.predict(data.features_test)
        lda_votes = self.lda.predict(data.features_test)
        dec_votes = self.dec.predict(data.features_test)

        # One ballot per test sample, ordered rf, lda, dec, ada.  ``max``
        # returns the first label with the highest count, so ties resolve
        # in favour of the earliest voter in that order.
        for ballot in zip(rf_votes, lda_votes, dec_votes, ada_votes):
            self.pred.append(max(ballot, key=ballot.count))
def boost_report():
    """Print the test accuracy of a linear SVM and of AdaBoost built on it.

    Training samples come from the module-level ``mit_records`` and test
    samples from ``mim_records``; each record supplies ``features`` (a
    mapping whose values form the feature vector) and ``my_class``.
    """

    def _unpack(records):
        # Single pass so one-shot iterables are consumed only once.
        vectors, classes = [], []
        for rec in records:
            vectors.append(list(rec.features.values()))
            classes.append(rec.my_class)
        return vectors, classes

    svm_train_features, svm_train_classes = _unpack(mit_records)
    svm_test_features, svm_test_classes = _unpack(mim_records)

    svm_classifier = svm.SVC(kernel="linear", C=0.1)
    svm_classifier.fit(svm_train_features, svm_train_classes)
    svm_accuracy = svm_classifier.score(svm_test_features, svm_test_classes)
    print("linear kernel svm accuracy: " + str(svm_accuracy))

    classifier = AdaBoostClassifier(
        base_estimator=svm_classifier,
        n_estimators=100,
        algorithm='SAMME')
    classifier.fit(svm_train_features, svm_train_classes)
    boosted_accuracy = classifier.score(svm_test_features, svm_test_classes)
    print("adaboost accuracy: " + str(boosted_accuracy))
def training(baseclassparameters, adaparameters, queue):
    """Train one boosted decision tree and report its best significance.

    Fits an ``AdaBoostClassifier`` wrapping a ``DecisionTreeClassifier`` on
    the module-level ``Xtrain``/``ytrain``, scores the module-level
    ``Xtest``/``ytest`` and scans 1000 evenly spaced cuts in [0, 1] on the
    predicted probability of column 1 for the cut that maximises
    ``S / sqrt(S + B)``.

    Args:
        baseclassparameters: keyword arguments for ``DecisionTreeClassifier``.
        adaparameters: keyword arguments for ``AdaBoostClassifier``.
        queue: object with a ``put`` method; receives the tuple
            ``(best_significance, baseclassparameters, adaparameters)``.
    """

    def _percent(numerator, denominator):
        # Report 0% instead of dividing by zero for an empty sample.
        if not denominator:
            return 0.0
        return 100. * numerator / denominator

    treeclassifier = DecisionTreeClassifier(**baseclassparameters)
    adaclassifier = AdaBoostClassifier(treeclassifier, **adaparameters)

    print("\nBegin calculation with {0} and {1}".format(
        str(baseclassparameters), str(adaparameters)))

    adaclassifier.fit(Xtrain, ytrain)

    # Predict with the model
    prob_predict_test = adaclassifier.predict_proba(Xtest)[:, 1]

    # Calculate maximal significance
    True_Signal_test = prob_predict_test[ytest == 1]
    True_Bkg_test = prob_predict_test[ytest == 0]

    # Defaults keep the report below well defined when no cut yields a
    # positive significance (previously the best_* names were unbound).
    best_significance = 0
    best_x = 0.0
    best_S = 0.0
    best_B = 0.0
    for x in np.linspace(0, 1, 1000):
        S = float(len(True_Signal_test[True_Signal_test > x]))
        B = float(len(True_Bkg_test[True_Bkg_test > x]))
        if S + B == 0:
            # Nothing passes this cut; S / sqrt(S + B) would be 0/0.
            continue
        significance = S / np.sqrt(S + B)
        if significance > best_significance:
            best_significance = significance
            best_x = x
            best_S = S
            best_B = B

    print("\nCalculation with {} and {} done ".format(
        str(baseclassparameters), str(adaparameters)))
    print("Best significance of {0:.2f} archived when cutting at {1:.3f}".format(
        best_significance, best_x))
    print("Signal efficiency: {0:.2f}%".format(
        _percent(best_S, len(True_Signal_test))))
    print("Background efficiency: {0:.2f}%".format(
        _percent(best_B, len(True_Bkg_test))))
    print("Purity: {0:.2f}%".format(_percent(best_S, best_S + best_B)))

    queue.put((best_significance, baseclassparameters, adaparameters))
def cvalidate():
    """Hold-out validation of AdaBoost over k-NN on ``train.csv``.

    Reads ``train.csv`` (first row skipped), uses columns 1-7 as features
    and column 8 as the label, replaces NaN features with 26.6, holds out
    30% of the rows, projects both splits with ``decomposition_pca`` and
    prints the hold-out accuracy.
    """
    from sklearn import cross_validation

    # Close the file deterministically instead of leaking the handle.
    with open('train.csv', 'r') as train_file:
        trainset = np.genfromtxt(train_file, delimiter=',')[1:]

    X = np.array([x[1:8] for x in trainset])
    y = np.array([x[8] for x in trainset])

    # Fill missing feature values in one vectorised step.
    # NOTE(review): 26.6 is the original author's fill constant; its origin
    # is not visible here.
    X[np.isnan(X)] = 26.6

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=0.3, random_state=0)
    X_train, X_test = decomposition_pca(X_train, X_test)

    bdt = AdaBoostClassifier(
        base_estimator=KNeighborsClassifier(n_neighbors=20, algorithm='auto'),
        algorithm="SAMME",
        n_estimators=200)
    bdt.fit(X_train, y_train)
    print(bdt.score(X_test, y_test))
def experiment_estimators_AdaBoostRandomForest():
    """Plot 10-fold error of AdaBoost-over-random-forest vs. estimator count.

    Reads the module-level ``mfcc`` (indexed by column per sample),
    ``labels``, ``maxLearners`` and ``maxDepth``.  For every estimator
    count in ``range(10, 150, 10)`` the mean zero-one loss over the folds
    is printed and collected, then the curve is shown with matplotlib.
    """
    k = 10
    rf = RandomForestClassifier(n_estimators=maxLearners,
                                max_depth=maxDepth,
                                warm_start=False)
    x_learners = []
    avgError = []

    for k_estimators in range(10, 150, 10):
        folds = StratifiedKFold(labels, n_folds=k)
        averageError = 0.0
        for train_index, test_index in folds:
            X_train = mfcc[:, train_index]
            X_test = mfcc[:, test_index]
            y_train = labels[train_index]
            y_test = labels[test_index]

            adb = AdaBoostClassifier(base_estimator=rf,
                                     n_estimators=k_estimators,
                                     learning_rate=0.01)
            # Samples are stored column-wise, hence the transposes.
            adb.fit(X_train.T, y_train)
            error = zero_one_loss(adb.predict(X_test.T), y_test)
            print(error)
            averageError += (1. / k) * error

        print("Average error: %4.2f%s" % (100 * averageError, '%'))
        avgError.append(averageError)
        x_learners.append(k_estimators)

    # graph the errors now.
    plt.plot(x_learners, avgError)
    plt.ylabel('Average Error (k=10)')
    plt.xlabel('Number of Estimators')
    plt.title('Error as a function of the number of estimators')
    plt.show()
def main(sc, spark):
    """Train AdaBoost on a 10% sample and print its accuracy on the corpus.

    Args:
        sc: Spark context; used for ``broadcast`` and ``accumulator``.
        spark: Spark session; forwarded to ``load_corpus``.
    """
    # Load and vectorize the corpus
    corpus = load_corpus(sc, spark)
    vector = make_vectorizer().fit(corpus)
    corpus = vector.transform(corpus)

    # Get the sample from the dataset
    sample = corpus.sample(False, 0.1).collect()
    X = [row['tfidf'] for row in sample]
    y = [row['label'] for row in sample]

    # Train a Scikit-Learn Model
    clf = AdaBoostClassifier()
    clf.fit(X, y)

    # Broadcast the Scikit-Learn Model to the cluster
    clf = sc.broadcast(clf)

    # Create accumulators for correct vs incorrect.  Both start at zero;
    # starting ``incorrect`` at 1 counted a phantom error in the accuracy.
    correct = sc.accumulator(0)
    incorrect = sc.accumulator(0)

    # Create the accuracy closure
    # NOTE(review): arguments are passed as (clf, incorrect, correct);
    # confirm this matches the parameter order of make_accuracy_closure.
    accuracy = make_accuracy_closure(clf, incorrect, correct)

    # Compute the number incorrect and correct
    corpus.foreachPartition(accuracy)

    total = correct.value + incorrect.value
    # An empty corpus yields no predictions; report 0.0 rather than divide
    # by zero.
    accuracy = float(correct.value) / float(total) if total else 0.0
    print("Global accuracy of model was {}".format(accuracy))
def test_iris():
    # Check consistency on dataset iris.
    expected_classes = np.unique(iris.target)
    n_classes = len(expected_classes)
    samme_model = samme_proba = None

    for alg in ('SAMME', 'SAMME.R'):
        model = AdaBoostClassifier(algorithm=alg)
        model.fit(iris.data, iris.target)

        assert_array_equal(expected_classes, model.classes_)

        proba = model.predict_proba(iris.data)
        if alg == "SAMME":
            # Kept for the regression check after the loop.
            samme_model, samme_proba = model, proba
        assert_equal(proba.shape[1], n_classes)
        assert_equal(model.decision_function(iris.data).shape[1], n_classes)

        score = model.score(iris.data, iris.target)
        assert score > 0.9, \
            "Failed with algorithm %s and score = %f" % (alg, score)

    # Somewhat hacky regression test: prior to
    # ae7adc880d624615a34bafdb1d75ef67051b8200,
    # predict_proba returned SAMME.R values for SAMME.
    samme_model.algorithm = "SAMME.R"
    drift = np.abs(samme_model.predict_proba(iris.data) - samme_proba)
    assert_array_less(0, drift)
def main():
    """Fit AdaBoost over k-NN on ``train.csv`` and print test predictions.

    Columns 1-7 of each file are the features and column 8 of the training
    file is the label; the first row of each file is skipped.  NaN features
    are replaced with 26.6, both sets are projected with
    ``decomposition_pca``, and one ``PassengerId,Survived`` CSV row is
    printed per test row, with ids starting at 892.
    """

    def _load_rows(path):
        # Read the CSV with the first row dropped, closing the file promptly.
        with open(path, 'r') as handle:
            return np.genfromtxt(handle, delimiter=',')[1:]

    trainset = _load_rows('train.csv')
    X = np.array([x[1:8] for x in trainset])
    y = np.array([x[8] for x in trainset])
    # NOTE(review): 26.6 is the original author's fill constant; its origin
    # is not visible here.
    X[np.isnan(X)] = 26.6

    testset = _load_rows('test.csv')
    test = np.array([x[1:8] for x in testset])
    test[np.isnan(test)] = 26.6

    X, test = decomposition_pca(X, test)

    bdt = AdaBoostClassifier(
        base_estimator=KNeighborsClassifier(n_neighbors=20, algorithm='auto'),
        algorithm="SAMME",
        n_estimators=200)
    bdt.fit(X, y)

    print('PassengerId,Survived')
    # Predict every row in one call: estimators expect a 2-D sample matrix,
    # and this avoids one predict call per row.
    predictions = bdt.predict(test) if len(test) else []
    for i, label in enumerate(predictions):
        print('%d,%d' % (i + 892, int(label)))
def test_staged_predict():
    """Check staged predictions."""
    n_stages = 10

    # AdaBoost classification
    for alg in ('SAMME', 'SAMME.R'):
        clf = AdaBoostClassifier(algorithm=alg, n_estimators=n_stages)
        clf.fit(iris.data, iris.target)

        predictions = clf.predict(iris.data)
        staged_predictions = list(clf.staged_predict(iris.data))
        proba = clf.predict_proba(iris.data)
        staged_probas = list(clf.staged_predict_proba(iris.data))
        score = clf.score(iris.data, iris.target)
        staged_scores = list(clf.staged_score(iris.data, iris.target))

        # The last stage must agree with the final, non-staged result.
        assert_equal(len(staged_predictions), n_stages)
        assert_array_almost_equal(predictions, staged_predictions[-1])
        assert_equal(len(staged_probas), n_stages)
        assert_array_almost_equal(proba, staged_probas[-1])
        assert_equal(len(staged_scores), n_stages)
        assert_array_almost_equal(score, staged_scores[-1])

    # AdaBoost regression
    reg = AdaBoostRegressor(n_estimators=n_stages)
    reg.fit(boston.data, boston.target)

    predictions = reg.predict(boston.data)
    staged_predictions = list(reg.staged_predict(boston.data))
    score = reg.score(boston.data, boston.target)
    staged_scores = list(reg.staged_score(boston.data, boston.target))

    assert_equal(len(staged_predictions), n_stages)
    assert_array_almost_equal(predictions, staged_predictions[-1])
    assert_equal(len(staged_scores), n_stages)
    assert_array_almost_equal(score, staged_scores[-1])
def test_pickle():
    # Check pickability.
    import pickle

    def _check_roundtrip(estimator, data, target):
        # Fit, pickle, unpickle, and require the same type and score.
        estimator.fit(data, target)
        score = estimator.score(data, target)
        restored = pickle.loads(pickle.dumps(estimator))
        assert_equal(type(restored), estimator.__class__)
        assert_equal(score, restored.score(data, target))

    # Adaboost classifier
    for alg in ('SAMME', 'SAMME.R'):
        _check_roundtrip(AdaBoostClassifier(algorithm=alg),
                         iris.data, iris.target)

    # Adaboost regressor
    _check_roundtrip(AdaBoostRegressor(random_state=0),
                     boston.data, boston.target)
def adaBoost(n, x, t, x_test, t_test):
    """Return the test misclassification rate of an n-estimator AdaBoost.

    Args:
        n: number of boosting estimators.
        x, t: training samples and labels.
        x_test, t_test: test samples and labels.

    Returns:
        ``1 - accuracy`` on the test set, computed from the confusion
        matrix.
    """
    clf = AdaBoostClassifier(n_estimators=n)
    clf.fit(x, t)
    predictions = clf.predict(x_test)
    X = confusion_matrix(t_test, predictions)
    # Correct predictions lie on the diagonal, so the trace covers any
    # number of classes (it equals X[0,0] + X[1,1] in the binary case).
    # The float() forces true division under Python 2 as well, where the
    # integer matrix entries would otherwise floor the rate to 0 or 1.
    classificationRate = X.trace() / float(X.sum())
    return 1 - classificationRate
def cook(): x, y, weights = load_data() n_components = 200 svd = TruncatedSVD(n_components, random_state=42) x_unweighted = svd.fit_transform(x) x_weighted = svd.fit_transform(weighted(x, weights)) for i in range(9): frac = 1 - (i * 0.01 + 0.01) print frac x_train, x_test, y_train, y_test = train_test_split(x_unweighted, y, test_size=frac) classifier = AdaBoostClassifier(n_estimators=100) classifier.fit(x_train, y_train) print "Unweighted: ", classifier.score(x_test, y_test) x_train, x_test, y_train, y_test = train_test_split(x_weighted, y, test_size=frac) classifier = AdaBoostClassifier(n_estimators=100) classifier.fit(x_train, y_train) print "Weighted: ", classifier.score(x_test, y_test) print '--------------------------' '''
def train_classifiers(X_data, y_data):
    """Fit seven classifiers on the same data and return them.

    The trailing number in each section comment is the score recorded by
    the original author for that model.

    Args:
        X_data: training samples.
        y_data: training labels.

    Returns:
        Tuple ``(clf_LSVM, clf_MNB, clf_RF, clf_ETC, clf_Ada, clf_rbf,
        clf_GBC)`` of fitted estimators.
    """
    ############ Linear SVM: 0.908 #############
    clf_LSVM = svm.SVC(kernel='linear')
    clf_LSVM.fit(X_data, y_data)

    ############ MultinomialNB: 0.875 #############
    clf_MNB = MultinomialNB()
    clf_MNB.fit(X_data, y_data)

    ############ Random Forest: 0.910 #############
    clf_RF = RandomForestClassifier(n_estimators=200, criterion='entropy')
    clf_RF.fit(X_data, y_data)

    ############ Extra Tree: 0.915 ##################
    # min_samples_split was 1, which current scikit-learn rejects (an
    # integer value must be at least 2).  A node needs two samples to be
    # split at all, so 2 grows the same trees.
    clf_ETC = ExtraTreesClassifier(n_estimators=500, max_depth=None,
                                   min_samples_split=2, random_state=0)
    clf_ETC.fit(X_data, y_data)

    ############ AdaBoost: 0.88 ##################
    clf_Ada = AdaBoostClassifier()
    clf_Ada.fit(X_data, y_data)

    ############ rbf SVM: 0.895 #############
    clf_rbf = svm.SVC(C=200, gamma=0.06, kernel='rbf')
    clf_rbf.fit(X_data, y_data)

    ############ GradientBoosting: 0.88 #############
    clf_GBC = GradientBoostingClassifier()
    clf_GBC.fit(X_data, y_data)

    return clf_LSVM, clf_MNB, clf_RF, clf_ETC, clf_Ada, clf_rbf, clf_GBC
def createAdaBoostClassifier(trainingVectors, targetValues):
    """Fit and return an AdaBoost classifier with label-derived weights.

    Each sample is weighted by ``targetValues * 10000``.
    NOTE(review): this presumably expects ``targetValues`` to be a numeric
    array, so that a label of 0 yields a zero weight - confirm this is
    intended.
    """
    booster = AdaBoostClassifier(
        base_estimator=None,
        n_estimators=50,
        learning_rate=1.0,
        algorithm='SAMME.R',
        random_state=None)
    sample_weights = targetValues * 10000
    booster.fit(trainingVectors, targetValues, sample_weight=sample_weights)
    return booster
class DomainTypeClassifier(object):
    """AdaBoost (depth-2 trees, SAMME) classifier over encoded regions.

    ``radius`` sets the region size: ``radius`` itself in region mode, or a
    sliding window of ``2 * radius + 1`` items when ``window_mode`` is set.
    """

    def __init__(self, radius, window_mode=False):
        # An alternative considered by the original author was
        # svm.SVC(kernel='rbf').
        self.classifier = AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=2),
            n_estimators=20,
            learning_rate=1,
            algorithm="SAMME")
        self.radius = radius
        self.window_mode = window_mode

    def train(self, dataset):
        """Fit the classifier on the arrays returned by ``dataset.getData``."""
        k = 2 * self.radius + 1 if self.window_mode else self.radius
        rin, rout = dataset.getData(k, self.window_mode)
        print("fitting", len(rin))
        self.classifier.fit(np.asarray(rin, float), np.asarray(rout, float))

    def predict(self, ns):
        """Return the most common predicted label over all regions of ``ns``."""
        k = 2 * self.radius + 1 if self.window_mode else self.radius
        if self.window_mode:
            # Only start positions whose full window fits inside ``ns``.
            last_start = min(len(ns), len(ns) - k + 1)
            to_predict = [encode(ns[i:i + k]) for i in range(last_start)]
        else:
            to_predict = [encode(create_region(ns, i, k))
                          for i in range(len(ns))]
        labels = self.classifier.predict(np.asarray(to_predict, float))
        winner, _ = Counter(labels).most_common(1)[0]
        return int(winner)
def cvalidate():
    """Hold-out validation of AdaBoost over a grid-searched RBF SVM.

    Reads labels from ``trainLabels.csv`` and features from ``train.csv``,
    holds out 30% of the rows, projects both splits with
    ``decomposition_pca``, grid-searches ``C`` and ``gamma`` for an RBF
    ``SVC`` on the training split, boosts the best estimator with SAMME and
    prints the hold-out accuracy.
    """
    # Read through context managers so both files are closed promptly.
    with open('trainLabels.csv', 'r') as label_file:
        targetset = np.genfromtxt(label_file, dtype='f16')
    y = [x for x in targetset]

    with open('train.csv', 'r') as train_file:
        trainset = np.genfromtxt(train_file, delimiter=',', dtype='f16')
    X = np.array([x for x in trainset])

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=0.3, random_state=0)
    X_train, X_test = decomposition_pca(X_train, X_test)

    # SVM
    c_range = 10.0 ** np.arange(6.5, 7.5, .25)
    gamma_range = 10.0 ** np.arange(-2.5, 0.5, .25)
    parameters = {'kernel': ['rbf'], 'C': c_range, 'gamma': gamma_range}
    svr = SVC()
    clf = grid_search.GridSearchCV(svr, parameters)
    clf.fit(X_train, y_train)

    bdt = AdaBoostClassifier(base_estimator=clf.best_estimator_,
                             algorithm="SAMME", n_estimators=100)
    # Alternative tried by the original author:
    # AdaBoostClassifier(base_estimator=KNeighborsClassifier(n_neighbors=10))
    bdt.fit(X_train, y_train)
    print(bdt.score(X_test, y_test))
def ANGEL_training(cds_filename, utr_filename, output_pickle, num_workers=3):
    """Train the coding/UTR AdaBoost classifier and pickle it.

    Args:
        cds_filename: FASTA file of coding sequences (label 1).
        utr_filename: FASTA file of UTR sequences (label 0).
        output_pickle: path receiving a pickled dict with keys ``'bdt'``
            (the fitted classifier) and ``'o_all'`` (the background
            feature object).
        num_workers: forwarded to ``get_data_parallel``.

    Returns:
        Tuple ``(data, target, bdt)``.
    """
    # Parse inside ``with`` blocks so the FASTA handles are closed; the
    # records are materialised into lists before the files close.
    with open(cds_filename) as cds_handle:
        coding = list(SeqIO.parse(cds_handle, 'fasta'))
    with open(utr_filename) as utr_handle:
        utr = list(SeqIO.parse(utr_handle, 'fasta'))

    o_all = c_ORFscores.CDSWindowFeat()
    add_to_background(o_all, coding)
    add_to_background(o_all, utr)

    # NOTE(review): the lists [0] and [0, 1, 2] look like reading frames to
    # sample from - confirm against get_data_parallel.
    data_pos = get_data_parallel(o_all, coding, [0], num_workers)
    data_neg = get_data_parallel(o_all, utr, [0, 1, 2], num_workers)

    # Negatives first, then positives; ``target`` follows the same order.
    data = data_neg + data_pos
    target = [0]*len(data_neg) + [1]*len(data_pos)
    data = np.array(data)

    print >> sys.stderr, "data prep done, running classifier...."
    bdt = AdaBoostClassifier(n_estimators=50)
    bdt.fit(data, target)

    print >> sys.stderr, "classifier trained. putting pickle to", output_pickle
    with open(output_pickle, 'wb') as f:
        dump({'bdt': bdt, 'o_all': o_all}, f)

    return data, target, bdt
class AdaBoostcls(object):
    """Thin best-effort wrapper around ``AdaBoostClassifier``.

    Every method catches ordinary exceptions, prints the traceback and
    returns ``None`` instead of raising.
    """

    def __init__(self):
        self.adaboost_cls = AdaBoostClassifier()
        self.prediction = None
        self.train_x = None
        self.train_y = None
        # Set by predict(); initialised here so the attribute always exists.
        self.test_x = None

    def train_model(self, train_x, train_y):
        """Remember the training data and fit the classifier on it."""
        try:
            self.train_x = train_x
            self.train_y = train_y
            self.adaboost_cls.fit(train_x, train_y)
        except Exception:
            # ``Exception`` rather than a bare except, so KeyboardInterrupt
            # and SystemExit still propagate.
            print(traceback.format_exc())

    def predict(self, test_x):
        """Predict labels for ``test_x``; returns ``None`` on failure."""
        try:
            self.test_x = test_x
            self.prediction = self.adaboost_cls.predict(test_x)
            return self.prediction
        except Exception:
            print(traceback.format_exc())

    def accuracy_score(self, test_y):
        """Score the samples last passed to ``predict`` against ``test_y``.

        Returns ``None`` on failure.
        """
        try:
            # return r2_score(test_y, self.prediction)
            return self.adaboost_cls.score(self.test_x, test_y)
        except Exception:
            print(traceback.format_exc())
def some(X, Y, X_test, Y_test): ada = AdaBoostClassifier() print "Train Model ---" t1 = time() ada.fit(X, Y) t2 = time() print "Model Trained ----------", t2 - t1 test_errors = [] cur = 1 Y_test2 = [] for k in Y_test: Y_test2.append(k[0]) print "Testing: " print Y_test2 pred = ada.predict(X_test) print pred accu = 1. - accuracy_score(y_true= Y_test2, y_pred= pred) print accu print "STAGED _____________" for test_predict in ( ada.staged_predict(X_test)): test_errors.append( 1. - accuracy_score(test_predict, Y_test2)) print "errorss : " print test_errors
def do_all_study(X,y):
    """Plot learning and validation curves and print test ROC AUC values.

    NOTE(review): the ``X`` and ``y`` parameters are never read; the body
    uses the module-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` instead - confirm that is intended.
    """
    names = ["Decision Tree", "Gradient Boosting", "Random Forest",
             "AdaBoost", "Naive Bayes"]
    classifiers = [
        # SVC(),
        DecisionTreeClassifier(max_depth=10),
        GradientBoostingClassifier(max_depth=10, n_estimators=20,
                                   max_features=1),
        RandomForestClassifier(max_depth=10, n_estimators=20, max_features=1),
        AdaBoostClassifier()]

    # zip stops at the shorter list, so the fifth name is never used.
    for name, clf in zip(names, classifiers):
        estimator, score = plot_learning_curve(clf, X_train, y_train,
                                               scoring='roc_auc')

    param_name = 'n_estimators'
    param_range = [1, 5, 10, 20, 40]
    studies = [
        ("ROC AUC GradientBoostingClassifier: %0.4f",
         GradientBoostingClassifier(max_depth=10, n_estimators=20,
                                    max_features=1)),
        ("ROC AUC AdaBoost: %0.4f", AdaBoostClassifier()),
    ]
    for report, model in studies:
        plot_validation_curve(model, X_train, y_train, param_name,
                              param_range, scoring='roc_auc')
        model.fit(X_train, y_train)
        positive_proba = model.predict_proba(X_test)[:, 1]
        print(report % roc_auc_score(y_test, positive_proba))
def ada(xtrain, ytrain, train_weight, tests, test_weight):
    """Train AdaBoost and return the weighted misclassification cost.

    ``tests[i]`` holds the test vectors whose group index is ``i``.  The
    penalty of predicting ``p`` for such a vector is the module-level
    ``pen[i][p]``; a positive penalty counts as a mistake.

    Args:
        xtrain, ytrain: training samples and labels.
        train_weight: accepted for interface compatibility; not used.
        tests: sequence of groups of test vectors.
        test_weight: ``test_weight[i][j]`` weights vector ``j`` of group
            ``i`` in the cost.

    Returns:
        The accumulated weighted cost.
    """
    # Initiate the training model
    clf = AdaBoostClassifier()
    mistakes = 0
    cost = 0

    # Fit the model
    clf.fit(xtrain, ytrain)

    vector_count = 0
    # Iterate over the tests
    for i, group in enumerate(tests):
        # Get the number of elements in each test
        vector_count += len(group)
        if len(group) == 0:
            continue
        # Predict the whole group in one call: estimators expect a 2-D
        # sample matrix, and this avoids one predict call per vector.
        predictions = clf.predict(group)
        for j, predicted in enumerate(predictions):
            penalty = pen[i][predicted]
            # Determine the cost
            cost += test_weight[i][j] * penalty
            # Count the number of mistakes
            if penalty > 0:
                mistakes += 1

    # With no test vectors at all the accuracy is reported as 0.0 instead
    # of dividing by zero.
    if vector_count:
        accuracy = (1. - float(mistakes) / float(vector_count)) * 100
    else:
        accuracy = 0.0
    print("Number of mistakes: " + str(mistakes) + " of " +
          str(vector_count) + ", " +
          str(accuracy) +
          "% accurate")
    return cost
def ada_boost_dt():
    """
    Cross-validate, fit and dump a 300-estimator AdaBoost on scaled data.

    Submission: ada_boost_dt_0707_03.csv
    E_val: 0.854350
    E_in: 0.889561
    E_out: 0.8832315976033993
    """
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.cross_validation import cross_val_score
    from sklearn.pipeline import Pipeline

    X, y = dataset.load_train()

    # The fitted scaler is reused in the submitted pipeline below.
    raw_scaler = StandardScaler()
    X_scaled = raw_scaler.fit_transform(X)

    ab = AdaBoostClassifier(n_estimators=300)

    scores = cross_val_score(ab, X_scaled, y, cv=5, n_jobs=-1)
    logger.debug('CV: %s', scores)
    logger.debug('E_val: %f', sum(scores) / len(scores))

    ab.fit(X_scaled, y)
    logger.debug('E_in: %f', Util.auc_score(ab, X_scaled, y))

    submission = Pipeline([('scale_raw', raw_scaler), ('ab', ab)])
    IO.dump_submission(submission, 'ada_boost_dt_0707_03')
def prediction(feat,label):
    """Score AdaBoost over decision trees for tree depths 1 through 9.

    Holds out 25% of the data (``random_state=0``) and fits a
    100-estimator AdaBoost for each depth.

    Returns:
        Three parallel lists: the depths tried, the hold-out accuracies,
        and the ROC AUC values computed from the predicted labels.
    """
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(
        feat, label, test_size=0.25, random_state=0)

    depths, accuracies, aucs = [], [], []
    # The original author also left a disabled variant that ran the same
    # sweep with a plain tree.DecisionTreeClassifier(max_depth=depth).
    for depth in range(1, 10):
        booster = AdaBoostClassifier(
            tree.DecisionTreeClassifier(max_depth=depth), n_estimators=100)
        booster.fit(x_train, y_train)
        predicted_labels = booster.predict(x_test)
        accuracies.append(booster.score(x_test, y_test))
        aucs.append(metrics.roc_auc_score(y_test, predicted_labels))
        depths.append(depth)
    return depths, accuracies, aucs
def ab_predictedValue():
    """Fit AdaBoost on ``train_df`` and return column 1 of the test probabilities.

    Reads the module-level ``train_df``, ``test_df``, ``features`` and
    ``NoOfEstimators``; the target column is ``'SeriousDlqin2yrs'``.
    """
    print('----------AdaBoost----------')
    booster = AdaBoostClassifier(n_estimators=NoOfEstimators)
    booster.fit(train_df[features], train_df['SeriousDlqin2yrs'])
    class_probabilities = booster.predict_proba(test_df[features])
    print('Feature Importance = %s' % booster.feature_importances_)
    return class_probabilities[:, 1]
def AB_results(): # AdaBoostClassifier print "--------------AdaBoostClassifier-----------------" rang = [60, 80] # print "--------------With HOG-----------------" # ans = [] # print "n_estimators Accuracy" # for i in rang: # clf = AdaBoostClassifier(n_estimators=i) # clf.fit(X_train_hog, y_train) # mean_accuracy = clf.score(X_test_hog, y_test) # print i, " ", mean_accuracy # ans.append('('+str(i)+", "+str(mean_accuracy)+')') # print ans # plt.plot(rang, ans, linewidth=2.0) # plt.xlabel("n_estimators") # plt.ylabel("mean_accuracy") # plt.savefig("temp_hog.png") print "\n--------------Without HOG-----------------" ans = [] print "n_estimators Accuracy" for i in rang: clf = AdaBoostClassifier(n_estimators=i) clf.fit(X_train, y_train) mean_accuracy = clf.score(X_test, y_test) print i, " ", mean_accuracy ans.append('('+str(i)+", "+str(mean_accuracy)+')') print ans plt.plot(rang, ans, linewidth=2.0) plt.xlabel("n_estimators") plt.ylabel("mean_accuracy") plt.savefig("temp_plain.png")
def ADA_Classifier(X_train, X_cv, X_test, Y_train,Y_cv,Y_test, Actual_DS):
    """Fit a 300-estimator AdaBoost and return class probabilities.

    Prints the validation accuracy, an actual-vs-predicted crosstab (labels
    decoded with the module-level ``label_enc``), the validation log loss
    and the elapsed time.

    Returns:
        Three DataFrames of predicted probabilities, for ``X_cv``,
        ``X_test`` and ``Actual_DS`` respectively.
    """
    print("***************Starting AdaBoost Classifier***************")
    started = time()

    booster = AdaBoostClassifier(n_estimators=300)
    booster.fit(X_train, Y_train)

    cv_labels = booster.predict(X_cv)
    cv_accuracy = booster.score(X_cv, Y_cv)
    print("AdaBoost Classifier - {0:.2f}%".format(100 * cv_accuracy))

    Summary = pd.crosstab(label_enc.inverse_transform(Y_cv),
                          label_enc.inverse_transform(cv_labels),
                          rownames=['actual'], colnames=['preds'])
    # NOTE(review): divide(..., axis=1) aligns the row totals with the
    # column labels; confirm whether a per-row share (axis=0) was intended.
    row_totals = Summary.sum(axis=1)
    Summary['pct'] = (Summary.divide(row_totals, axis=1)).max(axis=1)*100
    print(Summary)

    # Check with log loss function
    cv_proba = booster.predict_proba(X_cv)
    cv_log_loss = log_loss(Y_cv, cv_proba, eps=1e-15, normalize=True)
    print(cv_log_loss)

    print("done in %0.3fs" % (time() - started))

    test_proba = booster.predict_proba(X_test)
    actual_proba = booster.predict_proba(Actual_DS)

    print("***************Ending AdaBoost Classifier***************")
    return (pd.DataFrame(cv_proba), pd.DataFrame(test_proba),
            pd.DataFrame(actual_proba))
def runAdaBoost(arr):
    """Objective for a scipy minimiser: negative AMS of a SAMME AdaBoost.

    Args:
        arr: sequence ``(depth / 100, n_estimators / 100, learning_rate)``;
            the first two entries are scaled by 100 and truncated to int.

    Returns:
        ``-AMS`` of the written solution file (the optimiser minimises), or
        100 when any parameter is non-positive.
    """
    global file_dir, nEvents, solutionFile, counter

    print('iteration number ' + str(counter))
    counter += 1

    depth = int(arr[0]*100)
    n_est = int(arr[1]*100)
    lrn_rate = arr[2]
    if depth <= 0 or n_est <= 0 or lrn_rate <= 0:
        # Penalise invalid parameter combinations.
        return 100

    fname = 'ada_dep'+str(depth)+'_est'+str(n_est)+'_lrn'+str(lrn_rate)
    filename = fname

    # The learning rate is now passed to the model.  It was already
    # validated above and written into the output name, but the classifier
    # ignored it, so the optimiser was varying a parameter with no effect.
    ada = AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=depth),
                             algorithm="SAMME",
                             n_estimators=n_est,
                             learning_rate=lrn_rate)
    print("AdaBoost training")
    ada.fit(sigtr[train_input].values, sigtr['Label'].values)
    print("AdaBoost testing")
    ada_pred = ada.predict(sigtest[train_input].values)
    solnFile(filename, ada_pred, sigtest['EventId'].values)
    print("AdaBoost finished")

    # added for the scipy optimise thing
    ams_score = ams.AMS_metric(solutionFile, file_dir+fname+'.out', nEvents)
    print(ams_score)
    logfile.write(fname + ': ' + str(ams_score)+'\n')
    return -1.0*float(ams_score)  # since we are minimising
def runAdaReal(arr):
    """Objective for a minimiser: negative AMS of a default-algorithm AdaBoost.

    ``arr`` is ``(depth / 100, n_estimators / 100, learning_rate)``.
    Returns ``-AMS`` of the written solution file, or 100 when any
    parameter is non-positive.
    """
    global file_dir, nEvents, solutionFile, counter

    depth = int(arr[0]*100)
    n_est = int(arr[1]*100)
    lrn_rate = arr[2]

    print('iteration number ' + str(counter))
    counter += 1

    # Guard clause: penalise invalid parameter combinations.
    if depth <= 0 or n_est <= 0 or lrn_rate <= 0:
        print('return 100')
        return 100

    filename = 'adar_dep' + str(depth) + '_est' + str(n_est) + '_lrn' + str(lrn_rate)

    weak_learner = tree.DecisionTreeClassifier(max_depth=depth)
    bdt_real = AdaBoostClassifier(weak_learner,
                                  n_estimators=n_est,
                                  learning_rate=lrn_rate)

    print("AdaBoostReal training")
    bdt_real.fit(sigtr[train_input].values, sigtr['Label'].values)

    print("AdaBoostReal testing")
    bdt_real_pred = bdt_real.predict(sigtest[train_input].values)
    solnFile(filename, bdt_real_pred, sigtest['EventId'].values)
    print("AdaBoostReal finished")

    ams_score = ams.AMS_metric(solutionFile, file_dir + filename + '.out',
                               nEvents)
    print(ams_score)
    logfile.write(filename + ': ' + str(ams_score) + '\n')
    return -1.0 * float(ams_score)
def adaboost_skin(X_train, y_train, X_test, y_test):
    """Learn the skin data sets with AdaBoost.

    X_*: Samples (objects providing ``toarray()``).
    y_*: labels.

    Returns:
        ``(iterations, scores)``: the estimator counts tried and the test
        accuracy, in percent, for each of them.
    """
    print('AdaBoost')
    min_iter = 1
    max_iter = 200
    steps = 30
    # Floor division keeps the step - and therefore n_estimators - an
    # integer under Python 3 as well as Python 2.
    diff = (max_iter - min_iter) // steps
    iterations = [min_iter + diff * step for step in range(steps + 1)]

    # Densify once instead of on every boosting run.
    dense_train = X_train.toarray()
    dense_test = X_test.toarray()

    scores = []
    for T in iterations:
        clf = AdaBoostClassifier(
            base_estimator=DecisionTreeClassifier(max_depth=1),
            algorithm="SAMME",
            n_estimators=T)
        clf.fit(dense_train, y_train)
        scores.append(100 * clf.score(dense_test, y_test))
        print('\t%d Iterations: %.2f%%' % (T, scores[-1]))
    return iterations, scores
class Model_Adaboost(object):
    """AdaBoost model driven by a parent ``model`` object.

    ``model`` supplies ``train`` (rows whose last column is the label) and
    ``test``.  ``parameter`` maps ``"n_estimators"`` and ``"CV_size"`` to
    either plain numbers or objects exposing ``get()``.
    """

    def __init__(self, model, parameter=None):
        # A fresh dict per call avoids a shared mutable default argument.
        if parameter is None:
            parameter = {"n_estimators": 50, "CV_size": 0}
        self.train = model.train
        self.test = model.test
        self.CVsize = float(self._value(parameter["CV_size"]))
        train = np.array(self.train)
        self.X_train = train[:, :-1]
        self.y_train = train[:, -1]
        self.X_train, self.X_CV, self.y_train, self.y_CV = train_test_split(
            self.X_train, self.y_train, test_size=self.CVsize)
        # With a validation split the classifier is created by
        # crossValidation() instead.
        if self.CVsize == 0:
            self.clf = AdaBoostClassifier(
                n_estimators=int(self._value(parameter["n_estimators"])))
        self.model = model

    @staticmethod
    def _value(setting):
        # Unwrap objects exposing get(); pass plain numbers through.
        getter = getattr(setting, "get", None)
        return getter() if callable(getter) else setting

    def fit(self):
        """Fit the classifier on the training split."""
        self.clf.fit(self.X_train, self.y_train)

    def score(self):
        """Print accuracy, per-class F1 and AUC on the training split."""
        pre = self.clf.predict(self.X_train)
        truth = self.y_train
        print("score: " + str(self.clf.score(self.X_train, truth)))
        print("f1: " + str(f1_score(truth, pre, average=None)))
        print("AUC score: " + str(roc_auc_score(truth, pre)))

    def save_results(self):
        """Predict ``model.test`` and save it to a user-chosen CSV file."""
        # NOTE(review): this uses the parent model's ``clf`` rather than
        # ``self.clf`` - confirm that is intended.
        pre = self.model.clf.predict(self.model.test)
        df = pd.DataFrame({"predict": pre})
        fileName = tkFileDialog.asksaveasfilename()
        df.to_csv(fileName)

    def crossValidation(self):
        """Print the best n_estimators by accuracy and per-class F1.

        Scores are measured on the validation split.
        """
        estimatorList = [3, 5, 7, 10, 13, 15, 20, 25, 30, 50]
        bestScore = [0, 0]  # score, n_estimator
        bestF1ScoreNeg = [0, 0]
        bestF1ScorePos = [0, 0]
        for e in estimatorList:
            self.clf = AdaBoostClassifier(n_estimators=e)
            self.clf.fit(self.X_train, self.y_train)
            pre = self.clf.predict(self.X_CV)
            truth = self.y_CV
            score = self.clf.score(self.X_CV, truth)
            if score > bestScore[0]:
                bestScore[0] = score
                bestScore[1] = e
            # Per-class F1 computed once; index 0 is read as the negative
            # class and index 1 as the positive class.
            per_class_f1 = f1_score(truth, pre, average=None)
            f1pos = per_class_f1[1]
            if f1pos > bestF1ScorePos[0]:
                bestF1ScorePos[0] = f1pos
                bestF1ScorePos[1] = e
            f1neg = per_class_f1[0]
            if f1neg > bestF1ScoreNeg[0]:
                bestF1ScoreNeg[0] = f1neg
                bestF1ScoreNeg[1] = e
        print("Adaboost:")
        print("Best [score,n_estimators] on Cross Validation set: " + str(bestScore))
        print("Best [f1(pos),n_estimators] on Cross Validation set: " + str(bestF1ScorePos))
        print("Best [f1(neg),n_estimators] on Cross Validation set" + str(bestF1ScoreNeg))
def selectAdaBoostClassifier(self, fit=0):
    """Fit AdaBoost on the training matrix and classify the test data.

    Uses ``self.estimatorSize`` estimators, fits on ``self.sparse_matrix``
    with ``self.training_labels_list``, and hands the model to
    ``self.classify_test_data`` under the label ``AdaBoost_<size>``.
    The ``fit`` argument is not read.
    """
    booster = AdaBoostClassifier(n_estimators=self.estimatorSize)
    # A cross_val_score report on the same data was disabled by the
    # original author.
    booster.fit(self.sparse_matrix, self.training_labels_list)
    run_label = 'AdaBoost_%s' % (self.estimatorSize)
    self.classify_test_data(booster, run_label)
rfModel = RandomForestClassifier(n_estimators=300, max_depth=12, random_state=0) rfModel.fit(np.array(pcaTrainData), np.array(trainLabels)) predictionsMade = rfModel.predict( np.array(pcaTestData)).tolist() tempAcc = met.accuracy(predictionsMade, testLabels) #tempF1 = met.precision_score(predictionsMade, testLabels) tempF1 = f1_score(testLabels, predictionsMade, average=None) rfAccuracy += tempAcc rfF1Score += tempF1 if (useADABoost): adaModel = AdaBoostClassifier(n_estimators=300) adaModel.fit(np.array(pcaTrainData), np.array(trainLabels)) predictionsMade = adaModel.predict( np.array(pcaTestData)).tolist() tempAcc = met.accuracy(predictionsMade, testLabels) #tempF1 = met.precision_score(predictionsMade, testLabels) tempF1 = f1_score(testLabels, predictionsMade, average=None) adaAccuracy += tempAcc adaF1Score += tempF1 if (useGradBoost): gradBoostModel = GradientBoostingClassifier(n_estimators=240, learning_rate=0.5, max_depth=21) gradBoostModel.fit(np.array(pcaTrainData), np.array(trainLabels))
y_test = y_test.replace("No", 0) y_test = y_test.replace("Yes", 1) print(X_train["TotalCharges"].dtypes) print(X_test["TotalCharges"].dtypes) # -------------- from sklearn.ensemble import AdaBoostClassifier from sklearn.metrics import accuracy_score, classification_report, confusion_matrix # Code starts here print(X_train.head()) print(X_test.head()) print(y_train.head()) print(y_test.head()) ada_model = AdaBoostClassifier(random_state=0) ada_model.fit(X_train, y_train) y_pred = ada_model.predict(X_test) ada_score = accuracy_score(y_pred, y_test) print(ada_score) ada_cm = confusion_matrix(y_pred, y_test) print(ada_cm) ada_cr = classification_report(y_pred, y_test) print(ada_cr) # -------------- from xgboost import XGBClassifier from sklearn.model_selection import GridSearchCV #Parameter list parameters = { 'learning_rate': [0.1, 0.15, 0.2, 0.25, 0.3],
plt.show() #BOOSTING ALGORITHMS - ADABOOST # Import DecisionTreeClassifier from sklearn.tree import DecisionTreeClassifier # Import AdaBoostClassifier from sklearn.ensemble import AdaBoostClassifier # Instantiate dt dt = DecisionTreeClassifier(max_depth=2, random_state=1) # Instantiate ada ada = AdaBoostClassifier(base_estimator=dt, n_estimators=180, random_state=1) # Fit ada to the training set ada.fit(X_train, y_train) # Compute the probabilities of obtaining the positive class y_pred_proba = ada.predict_proba(X_test)[:, 1] # Import roc_auc_score from sklearn.metrics import roc_auc_score # Evaluate test-set roc_auc_score ada_roc_auc = roc_auc_score(y_test, y_pred_proba) # Print roc_auc_score print('ROC AUC score: {:.2f}'.format(ada_roc_auc)) #GRADIENT BOOSTING ENSEMBLE # Import GradientBoostingRegressor
x_train, x_test, y_train, y_test = train_test_split(predictors_df,target_df, test_size = 0.2,random_state=7) #decision tree dt = DecisionTreeClassifier() #storing the classifer in dt dt.fit(x_train, y_train) #fitting te model dt.score(x_test, y_test) #checking the score like accuracy dt.score(x_train, y_train) #so our model is overfitting # Ada boosting ada = AdaBoostClassifier(DecisionTreeClassifier(), n_estimators=10, learning_rate=7) ada.fit(x_train,y_train) ada.score(x_test,y_test) ada.score(x_train,y_train) #building voting model # Splitting data into training and testing data set x_train, x_test, y_train, y_test = train_test_split(predictors_df,target_df, test_size = 0.2,random_state=7) from sklearn.ensemble import VotingClassifier # Voting Classifier from sklearn.linear_model import LogisticRegression # importing logistc regression
# per la stessa feature, normalizzo i dati del dataset di training.csv singola_feature_training = pd.DataFrame( label_encoder.transform(singola_feature_training)) dataframe_training = pd.concat( [dataframe_training, singola_feature_training], axis=1) # classi_target = pd.read_csv( r'C:\\Users\\FauxL\\Desktop\\Data Science\\Project\\training.csv', usecols=['Name']) classificatore = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=150, algorithm="SAMME.R", learning_rate=0.5) classificatore.fit( dataframe_training, classi_target.values.ravel()) #fitto con adaboost tutto dataset training predict = classificatore.predict(dataframe_training) with open('resultAda.csv', 'w') as csvFile: writer = csv.writer(csvFile, delimiter=' ') writer.writerows(predict) dfAda = predict csvFile.close() uniciAda, counteggioAda = np.unique(dfAda, return_counts=True) print(uniciAda, counteggioAda) print("\nRilevanza attributi Ada") for nameAda, scoreAda in zip(COLUMNS, classificatore.feature_importances_):
def perform_learning(train_features, test_features, f_output, local, gglobal, deep=False):
    """Fit classifiers on a column subset of the rows and pass them to ``evaluate_auc``.

    The last element of every row is used as its label. ``local`` and
    ``gglobal`` choose which columns become features:

    * both set      -> ``row[1:-1]``
    * only local    -> ``row[1:5]``
    * only gglobal  -> ``row[5:-1]``

    The selected features go through ``z_scoring``. With ``deep=False`` five
    scikit-learn classifiers are fitted one after another; with ``deep=True``
    a small Keras network is trained instead.

    Args:
        train_features: Sequence of training rows.
        test_features: Sequence of test rows with the same layout.
        f_output: Writable file-like object receiving the textual report.
        local: Include the ``[1:5]`` column group.
        gglobal: Include the ``[5:-1]`` column group.
        deep: Train the Keras network instead of the scikit-learn models.

    Raises:
        ValueError: If neither ``local`` nor ``gglobal`` is set (previously
            this fell through to an UnboundLocalError on ``train``).
    """
    print(train_features[0])
    if local and gglobal:
        columns = slice(1, -1)
    elif local:
        columns = slice(1, 5)
    elif gglobal:
        columns = slice(5, -1)
    else:
        raise ValueError("at least one of 'local' or 'gglobal' must be true")

    train = [x[columns] for x in train_features]
    test = [x[columns] for x in test_features]
    train_tags = [x[-1] for x in train_features]
    test_tags = [x[-1] for x in test_features]
    train = z_scoring(train)
    test = z_scoring(test)
    print(len(train[0]))
    print(train[0])

    if not deep:
        # (name, factory) pairs: each classifier is only built when its turn
        # comes, in the same order as before.
        classifier_factories = [
            ('adaBoost', lambda: AdaBoostClassifier(n_estimators=100)),
            ('RF', lambda: RandomForestClassifier(
                n_estimators=1000, criterion="gini", min_samples_split=15,
                oob_score=True, class_weight='balanced', max_depth=3)),
            ('L-SVM', lambda: SVC(kernel='linear', class_weight="balanced",
                                  C=0.01, probability=True)),
            ('RBF-SVM', lambda: SVC(class_weight="balanced", C=0.01,
                                    probability=True)),
            # NOTE(review): `n_iter` is not accepted by recent scikit-learn
            # releases (replaced by `max_iter`) - confirm the target version.
            ('SGD', lambda: SGDClassifier(
                alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
                eta0=0.0, fit_intercept=True, l1_ratio=0.15,
                learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
                penalty='l2', power_t=0.5, random_state=None, shuffle=True,
                verbose=0, warm_start=False)),
        ]
        for algo, make_classifier in classifier_factories:
            print(algo)
            f_output.writelines(algo + '\n')
            clf = make_classifier()
            clf.fit(train, train_tags)
            if algo == 'RF':
                # Only the random forest's feature importances are reported.
                print(len(clf.feature_importances_))
                print(clf.feature_importances_)
                f_output.writelines(str(clf.feature_importances_) + '\n')
            evaluate_auc(clf, test, test_tags, train, train_tags, f_output)
    else:
        print(train[0])
        from keras.models import Sequential
        from keras.layers import Dense, Dropout
        from keras.regularizers import l2, l1_l2

        clf = Sequential()
        # NOTE(review): assumes z_scoring returns an array exposing `.shape`.
        clf.add(
            Dense(100,
                  activation="relu",
                  kernel_initializer="he_normal",
                  input_dim=train.shape[1]))
        clf.add(Dropout(0.1))
        # Same Keras 2 keyword names as the first layer (previously the
        # legacy `init` / `W_regularizer` spellings).
        clf.add(
            Dense(1,
                  kernel_initializer='uniform',
                  activation='sigmoid',
                  kernel_regularizer=l1_l2(0.2)))
        clf.compile(loss='binary_crossentropy',
                    optimizer='adam',
                    metrics=['accuracy'])
        clf.fit(train,
                train_tags,
                validation_data=(test, test_tags),
                epochs=100,
                batch_size=10,
                verbose=2)
        evaluate_auc(clf, test, test_tags, train, train_tags, f_output)
# Grid over (max_depth, n_estimators) for ExtraTrees; each combination adds
# three consecutive entries to the flat Parameters list.
Parameters = []
for max_depth in range(1, 11):
    for n_estimators in range(1, 31):
        ExtraTrees_model = ExtraTreesClassifier(n_estimators=n_estimators,
                                                max_depth=max_depth)
        ExtraTrees_model.fit(train_x, train_y)
        ExtraTrees_scores = np.mean(
            cross_val_score(ExtraTrees_model, train_x, train_y))
        Parameters.extend([max_depth, n_estimators, ExtraTrees_scores])

# Ten best combinations by mean cross-validation score.
score_table = pd.DataFrame(np.array(Parameters).reshape([-1, 3]),
                           columns=['max_depth', 'n_estimators', 'scores'])
score_table = score_table.sort_values(['scores'], ascending=False)[0:10]

# AdaBoost: one reference model on depth-3 trees.
DecisionTree_model = DecisionTreeClassifier(max_depth=3)
AdaBoost_model = AdaBoostClassifier(base_estimator=DecisionTree_model)
AdaBoost_model.fit(train_x, train_y)
AdaBoost_scores = np.mean(cross_val_score(AdaBoost_model, train_x, train_y))

# Same grid, this time over the base tree depth and the number of boosting rounds.
Parameters = []
for max_depth in range(1, 11):
    for n_estimators in range(1, 31):
        DecisionTree_model = DecisionTreeClassifier(max_depth=max_depth)
        AdaBoost_model = AdaBoostClassifier(base_estimator=DecisionTree_model,
                                            n_estimators=n_estimators)
        AdaBoost_model.fit(train_x, train_y)
        AdaBoost_scores = np.mean(
            cross_val_score(AdaBoost_model, train_x, train_y))
        Parameters.extend([max_depth, n_estimators, AdaBoost_scores])

score_table = pd.DataFrame(np.array(Parameters).reshape([-1, 3]),
                           columns=['max_depth', 'n_estimators', 'scores'])
score_table = score_table.sort_values(['scores'], ascending=False)[0:10]
X_test, y_test = X[2000:], y[2000:] X_train, y_train = X[:2000], y[:2000] dt_stump = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1) dt_stump.fit(X_train, y_train) dt_stump_err = 1.0 - dt_stump.score(X_test, y_test) dt = DecisionTreeClassifier(max_depth=9, min_samples_leaf=1) dt.fit(X_train, y_train) dt_err = 1.0 - dt.score(X_test, y_test) ada_discrete = AdaBoostClassifier(base_estimator=dt_stump, learning_rate=learning_rate, n_estimators=n_estimators, algorithm="SAMME") ada_discrete.fit(X_train, y_train) ada_real = AdaBoostClassifier(base_estimator=dt_stump, learning_rate=learning_rate, n_estimators=n_estimators, algorithm="SAMME.R") ada_real.fit(X_train, y_train) fig = plt.figure() ax = fig.add_subplot(111) ax.plot([1, n_estimators], [dt_stump_err] * 2, 'k-', label='Decision Stump Error') ax.plot([1, n_estimators], [dt_err] * 2, 'k--', label='Decision Tree Error') """
===================== AdaBoost算法属于ensemble算法的boosting分支 其核心思想就是将一些偏差比较大(比较容易欠拟合)的分类器进行组合 用随机的方式消除偏差同时减小偏差。 """ print(__doc__) from sklearn.ensemble import AdaBoostClassifier from sklearn.datasets import make_moons, make_circles, make_classification #引入训练数据 #X, y = make_circles(noise=0.2, factor=0.5, random_state=1) X, y = make_moons(noise=0.1, random_state=1) #定义AdaBoost分类器 adb = AdaBoostClassifier() #训练过程 adb.fit(X, y) #绘图库引入 import matplotlib.pyplot as plt import matplotlib as mpl import numpy as np #调整图片风格adbadb mpl.style.use('fivethirtyeight') #定义xy网格,用于绘制等值线图 x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1)) #预测可能性 Z = adb.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] Z = Z.reshape(xx.shape)
def AdaBoost(X_train, X_test, y_train, y_test):
    """Fit a default AdaBoostClassifier on the training split and hand it to print_report."""
    from sklearn.ensemble import AdaBoostClassifier

    booster = AdaBoostClassifier()
    booster.fit(X_train, y_train)
    print_report(booster, 'AdaBoost', X_train, X_test, y_train, y_test)
# scikit-learn's AdaBoostClassifier class provides the functions to define
# the model and fit it to your data.
from sklearn.ensemble import AdaBoostClassifier

model = AdaBoostClassifier()
model.fit(x_train, y_train)
model.predict(x_test)

# Hyperparameters
# The most commonly specified ones are:
#   base_estimator: the model used for the weak learners
#                   (warning: remember to import the model you choose).
#   n_estimators:   the maximum number of weak learners used.
#
# Example: a model whose weak learners are decision trees of max_depth 2,
# with at most 4 of them.
from sklearn.tree import DecisionTreeClassifier

weak_learner = DecisionTreeClassifier(max_depth=2)
model = AdaBoostClassifier(base_estimator=weak_learner, n_estimators=4)
def main():
    """Run the decision-tree, AdaBoost, gradient-boosting and SVM experiments.

    Loads the data through ``load_data`` / ``split``, then for each model
    fits it, prints reports and accuracies, and draws learning curves via the
    project helpers ``plot_learning_curve``, ``get_learning_curve`` and
    ``err_plot``. Midway the data is re-read from
    ``data/pima-indians-diabetes copy.csv`` and split again for the SVM part.
    Everything is reported through ``print`` and matplotlib; nothing is returned.
    """
    data_percentage_array = np.linspace(0.1, 1, 10)
    full_data = load_data()
    x_train, x_test, y_train, y_test, X, y = split(full_data)

    # --- Decision tree: learning curve -------------------------------------
    dt = tree.DecisionTreeClassifier(criterion="entropy", min_samples_split=20,
                                     max_depth=5, min_samples_leaf=10,
                                     random_state=0)
    train_sizes, average_train_scores, average_test_scores = plot_learning_curve(
        dt, x_train, y_train, x_test, y_test, cv=10,
        train_sizes=data_percentage_array)
    err_plot(np.linspace(0.1, 1, 10), average_train_scores, average_test_scores,
             "Decision Tree Classifier - Learning Curve")

    # --- Decision tree: second parameter set, reports -----------------------
    dt = tree.DecisionTreeClassifier(criterion="entropy", min_samples_split=4,
                                     max_leaf_nodes=18, max_depth=13,
                                     min_samples_leaf=14, random_state=1)
    dt = dt.fit(x_train, y_train)
    y_pred = dt.predict(x_test)
    print(metrics.classification_report(y_test, y_pred))
    print('train accuracy: {}'.format(dt.score(x_train, y_train)))
    print('test accuracy: {}'.format(dt.score(x_test, y_test)))
    decision_Tree_matrix = metrics.confusion_matrix(y_test, y_pred)
    print(decision_Tree_matrix)

    # --- AdaBoost (SAMME) on a decision tree, timed -------------------------
    start_time = time.time()
    # Use this one
    bdt_real = AdaBoostClassifier(
        DecisionTreeClassifier(criterion="entropy", min_samples_split=11,
                               max_depth=19, min_samples_leaf=14,
                               max_leaf_nodes=18, random_state=1),
        algorithm="SAMME", learning_rate=1)
    bdt_real.fit(x_train, y_train)
    print("--- %s seconds ---" % (time.time() - start_time))
    _, average_train_scores, average_test_scores = plot_learning_curve(
        bdt_real, x_train, y_train, x_test, y_test, cv=10,
        train_sizes=data_percentage_array)
    err_plot(np.linspace(0.1, 1, 10), average_train_scores, average_test_scores,
             "Decision Tree Not Pruned - Learning Curve")
    print('train accuracy: {}'.format(bdt_real.score(x_train, y_train)))
    print('test accuracy: {}'.format(bdt_real.score(x_test, y_test)))
    _pred = bdt_real.predict(x_test)
    print(metrics.confusion_matrix(y_test, _pred))
    y_train_pred = bdt_real.predict(x_train)
    print("test")
    print(metrics.classification_report(
        y_test, _pred, target_names=['No Diabetes', 'Diabetes']))
    print("train")
    print(metrics.classification_report(
        y_train, y_train_pred, target_names=['No Diabetes', 'Diabetes']))

    # --- Feature importance from gradient boosting --------------------------
    column_names = ["preg", "plas", "pres", "skin", "insu", "mass", "pedi",
                    "age", "class"]
    with open('data/pima-indians-diabetes copy.csv') as f:
        data = pandas.read_csv(f, sep=',', names=column_names)
    # Author's note: kNN is sensitive to irrelevant features, so the feature
    # importances are inspected to see whether that played a part.
    gbc = ensemble.GradientBoostingClassifier()
    gbc.fit(X, y)
    # Importances rescaled so the largest one is 100.
    feature_importance = gbc.feature_importances_
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5
    plt.figure(figsize=(16, 12))
    plt.barh(pos, feature_importance[sorted_idx], align='center',
             color='#7A68A6')
    plt.yticks(pos, np.asanyarray(data.columns.tolist())[sorted_idx])
    plt.xlabel('Relative Importance')
    plt.title('Variable Importance')
    plt.show()

    # --- AdaBoost with a lower learning rate and 10 estimators --------------
    bdt_discrete = AdaBoostClassifier(
        DecisionTreeClassifier(criterion="entropy", min_samples_split=11,
                               max_depth=19, min_samples_leaf=14,
                               max_leaf_nodes=18, random_state=1),
        learning_rate=.3, algorithm="SAMME", n_estimators=10)
    bdt_discrete.fit(x_train, y_train)
    y_pred = bdt_discrete.predict(x_test)

    # --- SVM: reload the CSV, re-split 70/30, timed -------------------------
    start_time = time.time()
    column_names = ["preg", "plas", "pres", "skin", "insu", "mass", "pedi",
                    "age", "class"]
    with open('data/pima-indians-diabetes copy.csv') as f:
        data = pandas.read_csv(f, sep=',', names=column_names)
    X, y = data.iloc[:, :-1], data.iloc[:, -1]
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.30, random_state=1)
    clf = svm.SVC(C=.1, kernel='linear', gamma=.001)
    clf.fit(x_train, y_train)
    learning_curve(clf, x_train, y_train)
    estimator = svm.SVC(degree=1, C=1, kernel='poly')
    plt.show()
    _, average_train_scores, average_test_scores = plot_learning_curve(
        estimator, x_train, y_train, x_test, y_test, cv=10,
        train_sizes=data_percentage_array)
    err_plot(np.linspace(0.1, 1, 10), average_train_scores, average_test_scores,
             'SVM Linear Kernel')
    train_sizes, average_train_scores, average_test_scores = get_learning_curve(
        estimator, x_train, y_train, x_test, y_test)
    err_plot(train_sizes, average_train_scores, average_test_scores)
    estimator.fit(x_train, y_train)
    # Written as single-argument calls so they behave the same as the other
    # prints in this function (these two were Python 2-only statements).
    print('train accuracy: {}'.format(estimator.score(x_train, y_train)))
    print('test accuracy: {}'.format(estimator.score(x_test, y_test)))
    print("--- %s seconds ---" % (time.time() - start_time))
def build_model():
    """Return a default AdaBoostClassifier fitted on the training part of a 70/30 split.

    Reads ``X`` and ``y`` from the enclosing module; the split uses
    ``test_size=0.3`` and ``random_state=100``.
    """
    features_fit, _, labels_fit, _ = train_test_split(
        X, y, test_size=0.3, random_state=100)
    booster = AdaBoostClassifier()
    booster.fit(features_fit, labels_fit)
    return booster
# NOTE(review): this excerpt reads `j` from an enclosing loop whose header is
# outside the visible code; the original indentation is not recoverable here.
ncells_missclass = []
abscissa.append(j)
print(j)
for i in range(10):
    training_features = training_features_vector[i]
    testing_features = testing_features_vector[i]
    training_target = training_target_vector[i]
    testing_target = testing_target_vector[i]

    # Training with boosting: AdaBoost over depth-10 decision trees.
    base_tree = DecisionTreeClassifier(max_depth=10)
    boosted = AdaBoostClassifier(base_tree, n_estimators=10)
    boosted.fit(training_features, training_target)

    # Comparing prediction with testing values.
    prediction = boosted.predict(testing_features)

    # Number of misclassified cells, taken from component j of every entry.
    lst = [item[j] for item in prediction]
    testing_target = np.array(testing_target)
    lst3t = [item[j] for item in testing_target]
    fold_error = mean_squared_error(lst, lst3t)
    ncells_missclass.append(int(150 * fold_error))

location.append(np.argmin(ncells_missclass[:10]))
minimum.append(min(ncells_missclass))
print(location)
# Feature vectors for the training emails and their labels.
# 702 labels: 0 for the first 351 entries, 1 for all remaining ones. The slice
# runs to the end of the array (the earlier 351:701 bound left the last label
# at 0), matching how the test labels below are filled.
train_labels = np.zeros(702)
train_labels[351:] = 1
train_matrix = extract_features(train_dir)

# Training SVM, Naive Bayes and ensemble classifiers.
model1 = LinearSVC()
model2 = MultinomialNB()
model3 = RandomForestClassifier()
model4 = AdaBoostClassifier()
model1.fit(train_matrix, train_labels)
model2.fit(train_matrix, train_labels)
model3.fit(train_matrix, train_labels)
model4.fit(train_matrix, train_labels)

# Test the unseen mails for spam.
test_dir = 'C:\\Users\\100557540\\Google Drive\\Pranav\\IITROPAR\\Dr. Puneet Goyal\\Main papers to implement\\text mining for phising email detetcion\\test-mails'
test_matrix = extract_features(test_dir)
test_labels = np.zeros(260)
test_labels[130:260] = 1
result1 = model1.predict(test_matrix)
result2 = model2.predict(test_matrix)
result3 = model3.predict(test_matrix)
result4 = model4.predict(test_matrix)

# Confusion matrices for the SVM and Naive Bayes models.
print(confusion_matrix(test_labels, result1))
print(confusion_matrix(test_labels, result2))
testX = [r[1:] for r in testset] testY = [r[0] for r in testset] for max_depth in x_list: newOut = [] for max_leaf_nodes in y_list: fileout = "MaxDepth" + str(max_depth)+"MaxLeaves" + str(max_leaf_nodes) clf = tree.DecisionTreeClassifier(criterion="gini", max_depth=max_depth, max_leaf_nodes=max_leaf_nodes) clf = AdaBoostClassifier(base_estimator=clf, n_estimators=1000, learning_rate=1.0, algorithm='SAMME.R') # clf = MLPClassifier(solver='sgd', alpha=1e-5, hidden_layer_sizes=(), random_state=1, max_iter=1000) # clf = svm.SVC(gamma='scale') newTrainY = [] for r in trainY: newTrainY = newTrainY + [[r]] clf = clf.fit((trainX), np.array(newTrainY).ravel()) newOut += [accuracy_score(testY, clf.predict(testX))] csvout += [[str(max_depth), str(max_leaf_nodes), str(accuracy_score(testY, clf.predict(testX)))]] print(accuracy_score(testY, clf.predict(testX))) print(str(accuracy_score(testY, clf.predict(testX), normalize=False)) + " correct of " + str(len(testX))) filename = "../letter-classification/boosting/" + fileout + "/output.txt" os.makedirs(os.path.dirname(filename), exist_ok=True) with open(filename, "w") as f: f.write(str(accuracy_score(testY, clf.predict(testX))) + "\n") f.write(str(accuracy_score(testY, clf.predict(testX), normalize=False)) + " correct of " + str(len(testX))) def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)): """
# Fit the tuned kNN model and write its predictions as a submission file.
knn_opt.fit(train_x, train_y)
test_y_knn = knn_opt.predict_proba(test_x)
# NOTE(review): knn_out is bound to the same object as `submission`; no copy
# is made here, so the column writes below also show up on `submission`.
knn_out = submission
# NOTE(review): the full predict_proba output is assigned to one column and
# then replaced by 1 - value; confirm which class column is intended.
knn_out['target'] = test_y_knn
knn_out['target'] = 1 - knn_out['target']
knn_out.to_csv('knn_predictions1.csv', index=False, float_format='%.4f')

# Same procedure for AdaBoost with 200 estimators.
ada_opt = AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0, n_estimators=200, random_state=None)
ada_opt.fit(train_x, train_y)
test_y_ada = ada_opt.predict_proba(test_x)
# NOTE(review): ada_out is again the `submission` object itself.
ada_out = submission
ada_out['target'] = test_y_ada
ada_out['target'] = 1 - ada_out['target']
ada_out.to_csv('ada_predictions1.csv', index=False, float_format='%.4f')

# Gradient boosting configuration (the call continues past this excerpt).
gb_opt = GradientBoostingClassifier(criterion='friedman_mse', init=None, learning_rate=0.1, loss='deviance', max_depth=3, max_features=None, max_leaf_nodes=None,
clf_svm.fit(train_X, train_Y)
output_svm = clf_svm.predict(test_X)

print('GradientBoosting model prediction')
clf_GB = GradientBoostingClassifier()
clf_GB.fit(train_X, train_Y)
output_GB = clf_GB.predict(test_X)

print('RandomForest model prediction')
clf_RF = RandomForestClassifier()
clf_RF.fit(train_X, train_Y)
output_RF = clf_RF.predict(test_X)

print('AdaBoost model prediction')
clf_Ada = AdaBoostClassifier()
clf_Ada.fit(train_X, train_Y)
output_Ada = clf_Ada.predict(test_X)

# Confusion-matrix counters, all starting at zero.
True_Positive = True_Negative = False_Positive = False_Negative = 0

for i in range(len(output_svm)):
    # Average the four predictions; a mean of at least 0.5 counts as class 1.
    mean_vote = float(output_svm[i] + output_Ada[i] + output_RF[i] + output_GB[i]) / 4
    ensemble_output = 1 if mean_vote >= 0.5 else 0
# 10-fold cross-validation without shuffling. random_state is not passed to
# KFold: without shuffle=True it has no effect on the folds, and recent
# scikit-learn releases raise an error for that combination.
kfold = model_selection.KFold(n_splits=10)
model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
print(results.mean())


# In[7]:

X = df.drop('electricity_consumption_category', axis=1)
y = df['electricity_consumption_category']

# train_test_split is imported from sklearn.model_selection, the same module
# used for KFold above (sklearn.cross_validation is gone from current releases).
from sklearn.model_selection import train_test_split

# Split into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(n_estimators=100)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(y_pred)


# In[8]:

# Predictions of every individual weak learner, followed by their errors.
for estimator in clf.estimators_:
    print(estimator.predict(X_test))
print(clf.estimator_errors_)
X_train, X_test, y_train, y_test = train_test_split(df_all.drop( "label", axis=1), df_all["label"], test_size=0.2) rf_model = RandomForestClassifier() adb_model = AdaBoostClassifier() et_model = ExtraTreesClassifier() lgb_model = lgb.LGBMClassifier() lr_model = LogisticRegression() gbdt_model = GradientBoostingClassifier() Dt_model = DecisionTreeClassifier() rf_model.fit(X_train, y_train) adb_model.fit(X_train, y_train) et_model.fit(X_train, y_train) lgb_model.fit(X_train, y_train) lr_model.fit(X_train, y_train) gbdt_model.fit(X_train, y_train) Dt_model.fit(X_train, y_train) print( 'rf_model:', round( metrics.f1_score(y_test, rf_model.predict(X_test), average='weighted'), 4)) print( 'adb_model:', round( metrics.f1_score(y_test,
# Concatenate the arrays
X = np.concatenate((X1, X2))  # 500*2
# print(np.shape(X1), np.shape(X2), np.shape(X))
y = np.concatenate((y1, - y2 + 1))

# Create and fit an AdaBoosted decision tree
# Author's notes on SAMME vs SAMME.R: the main difference is how the weak
# learner weights are measured. SAMME extends the binary AdaBoost algorithm
# and weights a weak learner by its classification performance on the sample
# set, whereas SAMME.R uses the predicted class probabilities on the sample
# set. Because SAMME.R works with continuous probability values it generally
# iterates faster than SAMME, which is why it is AdaBoostClassifier's default
# algorithm. The default SAMME.R is usually sufficient, but with SAMME.R the
# base_estimator must be a classifier that supports probability prediction.
# SAMME has no such restriction.
stump = DecisionTreeClassifier(max_depth=1)
bdt = AdaBoostClassifier(stump, algorithm="SAMME", n_estimators=200)
bdt.fit(X, y)

plot_colors = "br"
plot_step = 0.02
class_names = "AB"

plt.figure(figsize=(10, 5))

# Plot the decision boundaries
plt.subplot(121)
# Grid limits: the data range of each column widened by 1 on both sides.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1  # X shape 500 * 2
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
grid_x = np.arange(x_min, x_max, plot_step)
grid_y = np.arange(y_min, y_max, plot_step)
xx, yy = np.meshgrid(grid_x, grid_y)
# ravel() and flatten() both reduce a multi-dimensional array to one
# dimension; ravel returns a view, flatten returns a copy.
print("\n\n\n")
# One line per feature: its name and the importance reported by rnd_clf.
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, score)

#%% AdaBoost
from sklearn.ensemble import AdaBoostClassifier

# 200 boosting rounds over decision stumps, learning rate 0.5.
stump = DecisionTreeClassifier(max_depth=1)
ada_clf = AdaBoostClassifier(stump,
                             n_estimators=200,
                             algorithm="SAMME.R",
                             learning_rate=0.5)
ada_clf.fit(X_train, y_train)
y_pred = ada_clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("\n\nAccuracy: ", type(ada_clf).__name__, " : ", test_accuracy)

#%% Gradient Boosting
from sklearn.ensemble import GradientBoostingRegressor

# Three depth-2 trees with learning rate 1.0.
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X, y)
# Two features of the wine data; class labels are integer-encoded.
X = df_wine[['Alcohol', 'OD280/OD315 of diluted wines']].values
le = LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y)

# A single entropy stump, and AdaBoost with 500 rounds on that same stump.
tree = DecisionTreeClassifier(criterion='entropy', random_state=1, max_depth=1)
ada = AdaBoostClassifier(base_estimator=tree,
                         n_estimators=500,
                         learning_rate=0.1,
                         random_state=1)

# Stump on its own.
tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)
tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
print('Decision tree train/test accuracies %.3f/%.3f' % (tree_train, tree_test))

# Boosted ensemble.
ada = ada.fit(X_train, y_train)
y_train_pred = ada.predict(X_train)
y_test_pred = ada.predict(X_test)
ada_train = accuracy_score(y_train, y_train_pred)
ada_test = accuracy_score(y_test, y_test_pred)
print('AdaBoost train/test accuracies %.3f/%.3f' % (ada_train, ada_test))

# Plotting the decision regions: grid over the training range padded by 1.
x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 0.1))
f, axarr = plt.subplots(1, 2, sharex='col', sharey='row', figsize=(8, 3))
for idx, (clf, tt) in enumerate(zip([tree, ada], ['Decision Tree', 'AdaBoost'])):
    clf.fit(X_train, y_train)
dct_cr_zigzag.append(diag_cr) flip = not flip return np.concatenate([ np.concatenate(dct_y_zigzag), np.concatenate(dct_cb_zigzag), np.concatenate(dct_cr_zigzag) ]) actual = [ "Cubism", "Impressionism", "Pop Art", "Pop Art", "Impressionism", "Cubism", "Cubism", "Cubism", "Impressionism", "Pop Art", "Pop Art", "Impressionism", "Realism", "Realism", "Realism", "Realism" ] model = AdaBoostClassifier(n_estimators=200) model.fit(training_data, responses) i = 0 image_lbp = [] image_csd = [] data = [] labels = [] for filename in glob.glob( 'C:\Users\dutta\Desktop\Project\Images\Test\T/*.jpg'): #assuming gif image = cv2.imread(filename, 0) image_lbp.append(image) img = cv2.imread(filename) image_csd.append(img) for img1, img2 in zip(image_lbp, image_csd): h = localbinarypattern(img1)
# print different accuracy measurements print 'accuracy score: ', logit.score(x_test, y_test) print 'precision:', precision_score(y_test, logit.predict(x_test), average='weighted') print 'recall:', recall_score(y_test, logit.predict(x_test), average='weighted') print 'mean cross validation score:', np.mean( cross_val_score(logit, pd.concat([x_train, x_test]), pd.concat([y_train, y_test]))) # apply adaboost for no of estimators 1 to 20 and print accuracies for i in range(1, 20): clf = AdaBoostClassifier(n_estimators=i, base_estimator=logit) clf.fit(x_train, y_train) score = clf.score(x_test, y_test) precision = precision_score(y_test, clf.predict(x_test), average='weighted') recall = recall_score(y_test, clf.predict(x_test), average='weighted') cross_val_mean = np.mean( cross_val_score(clf, pd.concat([x_train, x_test]), pd.concat([y_train, y_test]))) print i, score, precision, recall, cross_val_mean # initial values accuracy, precision, recall, cross validation mean # 0.741935483871 0.814616332408 0.741935483871 0.724832214765 # after boosting
plt.scatter(grade_slow, bumpy_slow, color = "r", label="slow") plt.legend() plt.xlabel("bumpiness") plt.ylabel("grade") plt.show() ################################################################################ ### your code here! name your classifier object clf if you want the ### visualization code (prettyPicture) to show you the decision boundary from sklearn.ensemble import AdaBoostClassifier from time import time clf = AdaBoostClassifier() t_train = time() clf.fit(features_train, labels_train) print "Training time: %f s." % round(time() - t_train, 3) t_pred = time() pred = clf.predict(features_test) print "Predicting time: %f s." % round(time() - t_pred, 3) from sklearn.metrics import accuracy_score acc = accuracy_score(pred, labels_test) print "Accuracy: %f. " % acc try: prettyPicture(clf, features_test, labels_test) except NameError: pass
'GaussianNB', 'KNeighborsClassifier', 'LogisticRegression'] # %% for index,model in enumerate(models): try: model.fit(trainX_tf,trainY_tf) print(modelnames[index],"Accuracy =",round(model.score(testX_tf,testY_tf)*100,2),"%") except: print("Skipped",modelnames[index]) # %% # base_estimator=DecisionTreeClassifier ab = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(),n_estimators=1000) ab.fit(trainX_tf,trainY_tf) print('AdaBoost Accuracy with Decision Tree (Scaled Data)= ',(ab.score(testX_tf,testY_tf)*100)) # %% # base_estimator=RandomForest ab = AdaBoostClassifier(base_estimator=RandomForestClassifier(n_estimators=1000,random_state=10),n_estimators=1000) ab.fit(trainX_tf,trainY_tf) print('AdaBoost Accuracy with Random Forest (Scaled Data)= ',(ab.score(testX_tf,testY_tf)*100)) # %% # base_estimator=LogisticRegression ab = AdaBoostClassifier(base_estimator=LogisticRegression(max_iter=1000,solver = 'lbfgs'),n_estimators=1000) ab.fit(trainX_tf,trainY_tf) print('AdaBoost Accuracy with Logistic Reg (Scaled Data)= ',(ab.score(testX_tf,testY_tf)*100)) # %%
#KNN
from sklearn.neighbors import KNeighborsClassifier

rf6 = KNeighborsClassifier()
rf6.fit(X_train, y_train)
y_val_pred6 = rf6.predict_proba(X_val)
y_val_pred_acc6 = rf6.predict(X_val)
print(log_loss(y_val, y_val_pred6))
print(accuracy_score(y_val, y_val_pred_acc6))

#AdaBoost
from sklearn.ensemble import AdaBoostClassifier

rf7 = AdaBoostClassifier(n_estimators=250)
rf7.fit(X_train, y_train)
# Stored under the AdaBoost name (it used to overwrite y_val_pred6, which left
# y_val_pred7 unset for the log_loss call below).
y_val_pred7 = rf7.predict_proba(X_val)
y_val_pred_acc7 = rf7.predict(X_val)
print(log_loss(y_val, y_val_pred7))
print(accuracy_score(y_val, y_val_pred_acc7))

#Compare ROC of each Algorithm
import matplotlib.pyplot as plt
from sklearn import metrics

#RandomForest
# NOTE(review): roc_curve receives y_val_pred_acc1 as its first argument,
# whereas the log_loss/accuracy calls above compare against y_val - confirm
# which is intended, and that y_val_pred1 is a one-dimensional score.
fpr1, tpr1, threshold1 = metrics.roc_curve(y_val_pred_acc1, y_val_pred1)
roc_auc1 = metrics.auc(fpr1, tpr1)
plt.title('ROC of RandomForest')
plt.plot(fpr1, tpr1, 'b', label='AUC = %0.2f' % roc_auc1)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
# Header of the results file: run settings, then the CSV column names.
# NOTE(review): the original indentation is not visible in this excerpt; the
# nesting below (including whether these writes sit under WRITE_FILE) is a
# reconstruction to be confirmed.
result_file.write(f"Data ratio: {DATA_FRAC}\n")
result_file.write(f"Test ratio: {TEST_SIZE}\n")
result_file.write(
    f"Sample size (test-train): {len(test_X)}-{len(train_X)}\n")
result_file.write(f"Threshold: {THRESHOLD}\n")
result_file.write(f"\n#,parameters,accuracy,finish_time(s)\n")

#--- C and GAMMA TRAINING STARTS ---#
# The loop itself fits one AdaBoost model (depth-2 base trees) per entry of
# PARAMETERS; each entry is unpacked as keyword arguments.
i = 0
for n in PARAMETERS:
    print(f"{i}:\t{n}\tstarting...")
    clf = AdaBoostClassifier(
        base_estimator=DecisionTreeClassifier(max_depth=2), **n)
    t0 = time.time()
    clf.fit(train_X, train_y)  # training
    dT = time.time() - t0  # fit duration in seconds
    test_results = clf.predict(test_X)  # test
    # Accuracy: fraction of test predictions equal to the true labels.
    acc = float(np.count_nonzero(test_results == test_y)) / len(test_y)
    if WRITE_FILE:
        result_file.write(f"{i},{n},{acc:.5f},{dT:.2f}\n")
        # Flush after every row so partial results are on disk.
        result_file.flush()
    i += 1

# Train/test split of the tweets and extraction of feature/label arrays.
train_data, test_data = model_selection.train_test_split(
    tweets, test_size=TEST_SIZE)
train_X = np.array(train_data[X_column_names])
train_y = np.array(train_data[Y_column_name])