class Ensemble:

    def __init__(self, data):
        self.rf = RandomForestClassifier(n_estimators=80, n_jobs=-1,
                                         min_samples_split=45, criterion='entropy')
        self.lda = LDA()
        self.dec = DecisionTreeClassifier(criterion='entropy')
        self.ada = AdaBoostClassifier(n_estimators=500, learning_rate=0.25)
        self.make_prediction(data)

    def make_prediction(self, data):
        '''Make an ensemble prediction: majority vote over the four base models.'''
        self.rf.fit(data.features_train, data.labels_train)
        self.lda.fit(data.features_train, data.labels_train)
        self.dec.fit(data.features_train, data.labels_train)
        self.ada.fit(data.features_train, data.labels_train)
        pre_pred = []
        self.pred = []
        ada_pred = self.ada.predict(data.features_test)
        rf_pred = self.rf.predict(data.features_test)
        lda_pred = self.lda.predict(data.features_test)
        dec_pred = self.dec.predict(data.features_test)
        for i in range(len(rf_pred)):
            pre_pred.append([rf_pred[i], lda_pred[i], dec_pred[i], ada_pred[i]])
        for entry in pre_pred:
            # Majority vote: keep the label that occurs most often in the entry.
            self.pred.append(max(entry, key=entry.count))
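A minimal usage sketch for the class above, assuming a simple container exposing the four attributes it reads; the DataSplit namedtuple and the iris data here are illustrative stand-ins, not part of the original.

from collections import namedtuple
from sklearn.datasets import load_iris
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Hypothetical data holder with the attributes Ensemble expects.
DataSplit = namedtuple('DataSplit',
                       'features_train features_test labels_train labels_test')

X, y = load_iris(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)

ens = Ensemble(DataSplit(X_tr, X_te, y_tr, y_te))  # fits all four models and votes
print(ens.pred[:10])                               # majority-vote test predictions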
def Adaboost(TrainData, TestData):
    features = ['Time', 'Season', 'Hour', 'Minute', 'District']
    clf = AdaBoostClassifier(tree.DecisionTreeClassifier(), n_estimators=30)
    size = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    for i in range(len(size)):
        train, validation = train_test_split(TrainData, train_size=size[i])
        # Resample until every category appears in both splits.
        while len(set(train['Category'])) != len(set(validation['Category'])):
            train, validation = train_test_split(TrainData, train_size=size[i])
        clf = clf.fit(train[features], train['Category'])
        # stop = timeit.default_timer()
        # print("Running time adaboost is ", stop - start)
        predicted = np.array(clf.predict_proba(validation[features]))
        model = clf.predict(train[features])
        model1 = clf.predict(validation[features])
        # scores = cross_val_score(clf, validation[features], validation['Category'])
        # print("Scores mean is", scores.mean())
        # accuracy
        print("Training accuracy is", accuracy_score(train['Category'].values.tolist(), model))
        print("Validation accuracy is", accuracy_score(validation['Category'].values.tolist(), model1))
        print("Precision is", precision_score(validation['Category'].values.tolist(), model1, average='macro'))
        print("Recall is", recall_score(validation['Category'].values.tolist(), model1, average='macro'))
        print("Log loss is", log_loss(validation['Category'].values.tolist(), predicted,
                                      eps=1e-15, normalize=True, sample_weight=None))
        # writing to file
        # Category_new=[]
def test_classifiers2(data, ind):
    from sklearn.ensemble import AdaBoostClassifier
    clf = AdaBoostClassifier(n_estimators=100)
    clf.fit(data[ind[:1000], :-1], data[ind[:1000], -1])
    print(clf.score(data[ind[1000:], :-1], data[ind[1000:], -1]))
    out = clf.predict(data[ind[1000:], :-1])
    print(confusion_matrix(data[ind[1000:], -1], out))

    from sklearn.ensemble import GradientBoostingClassifier
    clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                     max_depth=1, random_state=0)
    clf.fit(data[ind[:1000], :-1], data[ind[:1000], -1])
    print(clf.score(data[ind[1000:], :-1], data[ind[1000:], -1]))
    out = clf.predict(data[ind[1000:], :-1])
    print(confusion_matrix(data[ind[1000:], -1], out))

    from sklearn.neural_network import MLPClassifier
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                        hidden_layer_sizes=(10, 10), random_state=1)
    clf.fit(data[ind[:1000], :-1], data[ind[:1000], -1])
    print(clf.score(data[ind[1000:], :-1], data[ind[1000:], -1]))
    out = clf.predict(data[ind[1000:], :-1])
    print(confusion_matrix(data[ind[1000:], -1], out))

    import xgboost as xgb
    xgb_model = xgb.XGBClassifier().fit(data[ind[:1000], :-1], data[ind[:1000], -1])
    out = xgb_model.predict(data[ind[1000:], :-1])
    a = confusion_matrix(data[ind[1000:], -1], out)
    print(float(a[0, 0] + a[1, 1]) / np.sum(a))
    print(a)
def test_staged_predict():
    """Check staged predictions."""
    # AdaBoost classification
    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg, n_estimators=10)
        clf.fit(iris.data, iris.target)

        predictions = clf.predict(iris.data)
        staged_predictions = [p for p in clf.staged_predict(iris.data)]
        proba = clf.predict_proba(iris.data)
        staged_probas = [p for p in clf.staged_predict_proba(iris.data)]
        score = clf.score(iris.data, iris.target)
        staged_scores = [s for s in clf.staged_score(iris.data, iris.target)]

        assert_equal(len(staged_predictions), 10)
        assert_array_almost_equal(predictions, staged_predictions[-1])
        assert_equal(len(staged_probas), 10)
        assert_array_almost_equal(proba, staged_probas[-1])
        assert_equal(len(staged_scores), 10)
        assert_array_almost_equal(score, staged_scores[-1])

    # AdaBoost regression
    clf = AdaBoostRegressor(n_estimators=10)
    clf.fit(boston.data, boston.target)

    predictions = clf.predict(boston.data)
    staged_predictions = [p for p in clf.staged_predict(boston.data)]
    score = clf.score(boston.data, boston.target)
    staged_scores = [s for s in clf.staged_score(boston.data, boston.target)]

    assert_equal(len(staged_predictions), 10)
    assert_array_almost_equal(predictions, staged_predictions[-1])
    assert_equal(len(staged_scores), 10)
    assert_array_almost_equal(score, staged_scores[-1])
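The staged_* API exercised by the test above is also useful outside tests. A small sketch (the dataset and estimator count are illustrative) that uses staged_score to watch held-out accuracy evolve per boosting round:

from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)

clf = AdaBoostClassifier(n_estimators=50).fit(X_tr, y_tr)
# staged_score yields the test score after each boosting round.
for i, s in enumerate(clf.staged_score(X_te, y_te), start=1):
    if i % 10 == 0:
        print("accuracy after %d rounds: %.3f" % (i, s))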
class Model_Adaboost(object):

    def __init__(self, model, parameter={"n_estimators": 50, "CV_size": 0}):
        self.train = model.train
        self.test = model.test
        self.CVsize = float(parameter["CV_size"].get())
        train = np.array(self.train)
        self.X_train = train[:, :-1]
        self.y_train = train[:, -1]
        self.X_train, self.X_CV, self.y_train, self.y_CV = train_test_split(
            self.X_train, self.y_train, test_size=self.CVsize)
        if self.CVsize == 0:
            self.clf = AdaBoostClassifier(n_estimators=int(parameter["n_estimators"].get()))
        self.model = model

    def fit(self):
        self.clf.fit(self.X_train, self.y_train)

    def score(self):
        pre = self.clf.predict(self.X_train)
        truth = self.y_train
        print("score: " + str(self.clf.score(self.X_train, truth)))
        print("f1: " + str(f1_score(truth, pre, average=None)))
        print("AUC score: " + str(roc_auc_score(truth, pre)))

    def save_results(self):
        pre = self.model.clf.predict(self.model.test)
        df = pd.DataFrame({"predict": pre})
        fileName = tkFileDialog.asksaveasfilename()
        df.to_csv(fileName)

    def crossValidation(self):
        estimatorList = [3, 5, 7, 10, 13, 15, 20, 25, 30, 50]
        bestScore = [0, 0]  # [score, n_estimators]
        bestF1ScoreNeg = [0, 0]
        bestF1ScorePos = [0, 0]
        # bestAUCScore = [0, 0]
        for e in estimatorList:
            self.clf = AdaBoostClassifier(n_estimators=e)
            self.clf.fit(self.X_train, self.y_train)
            pre = self.clf.predict(self.X_CV)
            truth = self.y_CV
            score = self.clf.score(self.X_CV, truth)
            if score > bestScore[0]:
                bestScore[0] = score
                bestScore[1] = e
            f1pos = f1_score(truth, pre, average=None)[1]
            if f1pos > bestF1ScorePos[0]:
                bestF1ScorePos[0] = f1pos
                bestF1ScorePos[1] = e
            f1neg = f1_score(truth, pre, average=None)[0]
            if f1neg > bestF1ScoreNeg[0]:
                bestF1ScoreNeg[0] = f1neg
                bestF1ScoreNeg[1] = e
        print("Adaboost:")
        print("Best [score, n_estimators] on Cross Validation set: " + str(bestScore))
        print("Best [f1(pos), n_estimators] on Cross Validation set: " + str(bestF1ScorePos))
        print("Best [f1(neg), n_estimators] on Cross Validation set: " + str(bestF1ScoreNeg))
def AdaBoost(xtrain, xtest, ytrain, ytest):
    n_est = 75  # number of boosting rounds of depth-1 stumps
    model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=n_est)
    model.fit(xtrain, ytrain)
    print('Adaboost with %d estimators' % n_est)
    print('Test Performance')
    eval(ytest, model.predict(xtest))
    print('Train Performance')
    eval(ytrain, model.predict(xtrain))
def ada_boost(X, y, nf=2, ne=50, lr=1):
    y = y.astype(float)
    Xs = X.astype(float)
    col_names = X.columns
    Xs_t, Xs_holdout, y_t, y_holdout = train_test_split(Xs, y, train_size=.8)
    # Reset to clean 0..n-1 indices so fold positions line up with .iloc.
    Xs_t = Xs_t.reset_index(drop=True)
    Xs_holdout = Xs_holdout.reset_index(drop=True)
    y_t = pd.DataFrame(y_t).reset_index(drop=True)
    y_holdout = pd.DataFrame(y_holdout).reset_index(drop=True)
    kf = KFold(n_splits=nf)
    output_table = []
    precisions = []
    accuracies = []
    F1s = []
    fold_count = 1
    for train_index, test_index in kf.split(Xs_t):
        results = []
        Xs_train, Xs_test = Xs_t.iloc[train_index, :], Xs_t.iloc[test_index, :]
        y_train, y_test = y_t.iloc[train_index, :], y_t.iloc[test_index, :]
        y_train = np.array(y_train).ravel()
        y_test = np.array(y_test).ravel()
        my_ada = AdaBoostClassifier(n_estimators=ne, learning_rate=lr)
        my_ada.fit(Xs_train, y_train)
        pred = np.array(my_ada.predict(Xs_test))
        output_table.append(' ')
        output_table.append("Fold " + str(fold_count) + ':')
        output_table.append("Precision Score: " + str(precision_score(y_test, pred)))
        output_table.append("Accuracy Score: " + str(accuracy_score(y_test, pred)))
        output_table.append("F1 Score: " + str(f1_score(y_test, pred)))
        precisions.append(precision_score(y_test, pred))
        accuracies.append(accuracy_score(y_test, pred))
        F1s.append(f1_score(y_test, pred))
        fold_count += 1
    pred_holdout = np.array(my_ada.predict(Xs_holdout))
    # sklearn's confusion_matrix layout is [[TN, FP], [FN, TP]].
    cm = confusion_matrix(y_holdout, pred_holdout)
    TN = cm[0][0]
    FP = cm[0][1]
    FN = cm[1][0]
    TP = cm[1][1]
    print("Mean Precision: ", np.mean(precisions))
    print("Mean F1s: ", np.mean(F1s))
    print("True Positive Rate (Sensitivity): ", TP * 1. / (TP + FN))
    print("True Negative Rate (Specificity): ", TN * 1. / (TN + FP))
    print("Precision: ", TP * 1. / (TP + FP))
    print("Accuracy: ", (TP + TN) * 1. / (TP + TN + FP + FN))
    indices = np.argsort(my_ada.feature_importances_)
    figure = plt.figure(figsize=(10, 7))
    plt.barh(np.arange(len(col_names)), my_ada.feature_importances_[indices],
             align='center', alpha=.5)
    plt.yticks(np.arange(len(col_names)), np.array(col_names)[indices], fontsize=14)
    plt.xticks(fontsize=14)
    _ = plt.xlabel('Relative importance', fontsize=18)
    return my_ada
def eval(ds, testNum, p, splitProportion=0.2):
    # testNum = 1
    # splitProportion = 0.2
    allFeaturesF1 = []
    allFeaturesRecall = []
    allFeaturesPrecision = []
    featureSelctedF1 = []
    featureSelctedRecall = []
    featureSelctedPrecision = []
    for _ in range(testNum):
        tstdata, trndata = ds.splitWithProportion(splitProportion)
        X, Y = labanUtil.fromDStoXY(trndata)
        X_test, Y_test = labanUtil.fromDStoXY(tstdata)
        # localF1s = []
        # localRecalls = []
        # localPercisions = []
        for y, y_test in zip(Y, Y_test):
            if all(v == 0 for v in y):
                continue
            # Alternative classifiers tried in this experiment:
            # clf = LinearSVC()  # fit_intercept=True, C=p
            # clf.sparsify()
            # clf = RandomForestClassifier()  # criterion='entropy'
            # clf = tree.DecisionTreeClassifier()  # max_depth=p
            clf = AdaBoostClassifier()
            # clf = GradientBoostingClassifier()  # learning_rate=lr
            # clf = ExtraTreesClassifier(n_estimators=p)
            # svc = LinearSVC()
            # selector = RFE(estimator=svc, n_features_to_select=p*19, step=0.2)
            selector = SelectPercentile(chooser, percentile=p)
            selector.fit(X, y)
            name = str(clf).split()[0].split('(')[0]
            clf.fit(selector.transform(X), y)
            pred = clf.predict(selector.transform(X_test))
            featureSelctedF1.append(metrics.f1_score(y_test, pred))
            featureSelctedRecall.append(metrics.recall_score(y_test, pred))
            featureSelctedPrecision.append(metrics.precision_score(y_test, pred))
            clf.fit(X, y)
            pred = clf.predict(X_test)
            allFeaturesF1.append(metrics.f1_score(y_test, pred))
            allFeaturesRecall.append(metrics.recall_score(y_test, pred))
            allFeaturesPrecision.append(metrics.precision_score(y_test, pred))
    return np.mean(allFeaturesF1), np.mean(featureSelctedF1), \
        np.mean(allFeaturesRecall), np.mean(featureSelctedRecall), \
        np.mean(allFeaturesPrecision), np.mean(featureSelctedPrecision), \
        name
def AdaBC(train, test, train_target, test_target, weights=None, n=500, lr=1):
    abc = AdaBoostClassifier(n_estimators=n, learning_rate=lr)
    abc.fit(train, train_target, sample_weight=weights)
    res = abc.predict(train)
    print('*************************** AdaBC ****************')
    print(classification_report(train_target, res))
    res1 = abc.predict(test)
    print(classification_report(test_target, res1))
    return abc
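An illustrative call to AdaBC() above, passing per-sample weights; the toy data and the simple upweighting scheme are assumptions for demonstration only.

import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
tr, te, ytr, yte = train_test_split(X, y, random_state=0)
# Upweight the second half of the training set, just to show the hook.
w = np.ones(len(ytr))
w[len(ytr) // 2:] = 2.0
model = AdaBC(tr, te, ytr, yte, weights=w, n=100, lr=0.5)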
def test_adaboost_classifier(train_test_sets):
    """AdaBoost Classifier with Decision Tree stumps."""
    X_train, X_test, y_train, y_test = train_test_sets
    clf = AdaBoostClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_train)
    print("ADABOOST CLASSIFIER RESULTS")
    print("\tTraining accuracy is ", metrics.accuracy_score(y_train, y_pred, normalize=True))
    y_pred = clf.predict(X_test)
    print_metrics(y_test, y_pred)
def perform_emsamble_model():
    # get data from csv file
    x, y_votes, y_comments, y_views, lat = read_train_data()
    # transform to numpy arrays for easier handling
    y_votes = np.array(y_votes)
    y_comments = np.array(y_comments)
    y_views = np.array(y_views)
    # get test data
    x_test, ids, lat = read_test_data()
    # Change the parameters of these objects to the values from grid search
    vec_votes = CountVectorizer(stop_words=None, strip_accents='unicode',
                                analyzer='word', ngram_range=(1, 2), min_df=2)
    vec_comments = CountVectorizer(stop_words=None, strip_accents='unicode',
                                   analyzer='word', ngram_range=(1, 2), min_df=2)
    vec_views = CountVectorizer(stop_words=None, strip_accents='unicode',
                                analyzer='word', ngram_range=(1, 2), min_df=2)
    # transform x and x_test into term matrices to feed the classifier
    x_votes = vec_votes.fit_transform(x)
    x_comments = vec_comments.fit_transform(x)
    x_views = vec_views.fit_transform(x)
    x_test_transformed_votes = vec_votes.transform(x_test)
    x_test_transformed_comments = vec_comments.transform(x_test)
    x_test_transformed_views = vec_views.transform(x_test)
    print("TFIDF Matrixes generated")
    print("LSA transforming")
    lsa_votes = TruncatedSVD(500)
    lsa_comments = TruncatedSVD(500)
    lsa_views = TruncatedSVD(500)
    x_votes = lsa_votes.fit_transform(x_votes)
    print("LSA Votes Done..")
    x_comments = lsa_comments.fit_transform(x_comments)
    print("LSA Comments Done..")
    x_views = lsa_views.fit_transform(x_views)
    print("LSA Views Done..")
    x_test_transformed_votes = lsa_votes.transform(x_test_transformed_votes)
    x_test_transformed_comments = lsa_comments.transform(x_test_transformed_comments)
    x_test_transformed_views = lsa_views.transform(x_test_transformed_views)
    print("LSA Finished..")
    ada_votes = AdaBoostClassifier(base_estimator=RandomForestClassifier())
    ada_comments = AdaBoostClassifier(base_estimator=RandomForestClassifier())
    ada_views = AdaBoostClassifier(base_estimator=RandomForestClassifier())
    ada_votes.fit(x_votes, y_votes)
    ada_comments.fit(x_comments, y_comments)
    ada_views.fit(x_views, y_views)
    print("Fitting done")
    # predict number of votes, comments, and views
    pred_votes = ada_votes.predict(x_test_transformed_votes)
    pred_comments = ada_comments.predict(x_test_transformed_comments)
    pred_views = ada_views.predict(x_test_transformed_views)
    # generate submission response csv file
    create_csv_response(len(x_test), ids, pred_views, pred_votes, pred_comments)
def AdaBoost(X, Y, XTest, YTest):
    print('-----------------------------------------------------')
    # param_grid = {'learning_rate': [0.1, 0.3, 0.6, 1, 3, 6, 10]}
    # tree_grid = GridSearchCV(AdaBoostClassifier(), param_grid)
    tree_grid = AdaBoostClassifier(n_estimators=100, learning_rate=2)
    tree_grid.fit(X, Y)
    # print("The best parameters are %s with a score of %0.2f"
    #       % (tree_grid.best_params_, tree_grid.best_score_))

    print("Computing training statistics")
    dtree_predict_time_training = time.time()
    Ypred_dtree_training = tree_grid.predict(X)
    dtree_predict_time_training = time.time() - dtree_predict_time_training
    dtree_accuracy_training = metrics.accuracy_score(Y, Ypred_dtree_training)
    dt_precision_training = metrics.precision_score(Y, Ypred_dtree_training, average='binary')
    dtree_recall_training = metrics.recall_score(Y, Ypred_dtree_training, average='binary')
    print("DT training prediction time: " + str(dtree_predict_time_training))
    print("DT training accuracy Score: " + str(dtree_accuracy_training))
    print("DT training precision Score: " + str(dt_precision_training))
    print("DT training recall Score: " + str(dtree_recall_training))

    print("Computing testing statistics")
    dtree_predict_time_test = time.time()
    Ypred_dtree_test = tree_grid.predict(XTest)
    dtree_predict_time_test = time.time() - dtree_predict_time_test
    dtree_accuracy_test = metrics.accuracy_score(YTest, Ypred_dtree_test)
    dt_precision_test = metrics.precision_score(YTest, Ypred_dtree_test, average='binary')
    dtree_recall_test = metrics.recall_score(YTest, Ypred_dtree_test, average='binary')
    print("DT test prediction time: " + str(dtree_predict_time_test))
    print("DT test accuracy Score: " + str(dtree_accuracy_test))
    print("DT test precision Score: " + str(dt_precision_test))
    print("DT test recall Score: " + str(dtree_recall_test))

    print("Creating ROC curve")
    y_true = YTest
    y_score = tree_grid.predict_proba(XTest)
    fprSVM, trpSVM, _ = metrics.roc_curve(y_true=y_true, y_score=y_score[:, 0], pos_label=0)
    plt.plot(fprSVM, trpSVM, 'c-', label='ADA')
def experiment_estimators_AdaBoostRandomForest():
    avgError = []
    x_learners = []
    rf = RandomForestClassifier(n_estimators=maxLearners, max_depth=maxDepth,
                                warm_start=False)
    for k_estimators in range(10, 150, 10):
        k = 10
        # StratifiedKFold's n_folds API was replaced by n_splits + split().
        skf = StratifiedKFold(n_splits=k)
        averageError = 0.0
        for train_index, test_index in skf.split(mfcc.T, labels):
            X_train, X_test = mfcc[:, train_index], mfcc[:, test_index]
            y_train, y_test = labels[train_index], labels[test_index]
            adb = AdaBoostClassifier(base_estimator=rf, n_estimators=k_estimators,
                                     learning_rate=0.01)
            adb.fit(X_train.T, y_train)
            y_pred = adb.predict(X_test.T)
            error = zero_one_loss(y_pred, y_test)
            print(error)
            averageError += (1. / k) * error
        print("Average error: %4.2f%s" % (100 * averageError, '%'))
        avgError.append(averageError)
        x_learners.append(k_estimators)
    # graph the errors now
    plt.plot(x_learners, avgError)
    plt.ylabel('Average Error (k=10)')
    plt.xlabel('Number of Estimators')
    plt.title('Error as a function of the number of estimators')
    plt.show()
def Bootstrap_method(self):
    # ShuffleSplit now lives in sklearn.model_selection
    from sklearn.model_selection import ShuffleSplit
    rs = ShuffleSplit(n_splits=10, test_size=0.25, random_state=0)
    clf = tree.DecisionTreeClassifier()
    for train_index, test_index in rs.split(self.FeatureSet):
        X_train = []
        X_test = []
        y_train = []
        y_test = []
        for trainid in train_index.tolist():
            X_train.append(self.FeatureSet[trainid])
            y_train.append(self.Label[trainid])
        for testid in test_index.tolist():
            X_test.append(self.FeatureSet[testid])
            y_test.append(self.Label[testid])
        # clf = clf.fit(X_train, y_train)
        # pre_labels = clf.predict(X_test)
        clf = AdaBoostClassifier(n_estimators=100)
        clf = clf.fit(X_train, y_train)
        pre_labels = clf.predict(X_test)
        # Model evaluation
        ACC = metrics.accuracy_score(y_test, pre_labels)
        MCC = metrics.matthews_corrcoef(y_test, pre_labels)
        SN = self.performance(y_test, pre_labels)
        print(ACC, SN)
def KFold_method(self):
    kf = KFold(n_splits=10)
    for train_index, test_index in kf.split(self.FeatureSet):
        X_train = []
        X_test = []
        y_train = []
        y_test = []
        for trainid in train_index.tolist():
            X_train.append(self.FeatureSet[trainid])
            y_train.append(self.Label[trainid])
        for testid in test_index.tolist():
            X_test.append(self.FeatureSet[testid])
            y_test.append(self.Label[testid])
        # clf = tree.DecisionTreeClassifier()
        # clf = clf.fit(X_train, y_train)
        # pre_labels = clf.predict(X_test)
        clf = AdaBoostClassifier(n_estimators=100)
        clf = clf.fit(X_train, y_train)
        pre_labels = clf.predict(X_test)
        # Model evaluation
        ACC = metrics.accuracy_score(y_test, pre_labels)
        MCC = metrics.matthews_corrcoef(y_test, pre_labels)
        SN = self.performance(y_test, pre_labels)
        print(ACC, SN)
def classify(x, y, cv, n_estimator=50):
    acc, prec, recall = [], [], []
    # Depth-1 "decision stump" base learner (long-removed constructor
    # arguments such as compute_importances and min_density are omitted).
    base_clf = DecisionTreeClassifier(
        criterion="entropy",
        max_depth=1,
        max_features=None,
        max_leaf_nodes=None,
        min_samples_leaf=1,
        min_samples_split=2,
        random_state=None,
        splitter="best",
    )
    global clf
    clf = AdaBoostClassifier(base_estimator=base_clf, n_estimators=n_estimator)
    for train, test in cv:
        x_train, x_test, y_train, y_test = x[train], x[test], y[train], y[test]
        clf = clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        acc.append(accuracy_score(y_test, y_pred))
        prec.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
    a = np.mean(acc)
    p = np.mean(prec)
    r = np.mean(recall)
    f = 2 * p * r / (p + r)  # F1 from the averaged precision and recall
    return a, p, r, f
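classify() expects cv to be an iterable of (train, test) index pairs. A minimal sketch of calling it, assuming a binary toy dataset and sklearn's KFold as the fold source (both are illustrative choices, not from the original):

import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import KFold

X, y = load_breast_cancer(return_X_y=True)
# Materialise the folds as (train_idx, test_idx) pairs for classify().
folds = list(KFold(n_splits=5, shuffle=True, random_state=0).split(X))
a, p, r, f = classify(X, y, folds, n_estimator=50)
print("acc=%.3f prec=%.3f rec=%.3f f1=%.3f" % (a, p, r, f))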
def runAdaReal(arr):  # arr = [depth, n_est, lrn_rate], scaled; filename is derived below
    global file_dir, nEvents, solutionFile, counter
    depth = int(arr[0] * 100)
    n_est = int(arr[1] * 100)
    lrn_rate = arr[2]
    print('iteration number ' + str(counter))
    counter += 1
    if depth <= 0 or n_est <= 0 or lrn_rate <= 0:
        print('return 100')
        return 100
    filename = 'adar_dep' + str(depth) + '_est' + str(n_est) + '_lrn' + str(lrn_rate)
    bdt_real = AdaBoostClassifier(
        tree.DecisionTreeClassifier(max_depth=depth),
        n_estimators=n_est,
        learning_rate=lrn_rate)
    print("AdaBoostReal training")
    bdt_real.fit(sigtr[train_input].values, sigtr['Label'].values)
    print("AdaBoostReal testing")
    bdt_real_pred = bdt_real.predict(sigtest[train_input].values)
    solnFile(filename, bdt_real_pred, sigtest['EventId'].values)
    print("AdaBoostReal finished")
    ams_score = ams.AMS_metric(solutionFile, file_dir + filename + '.out', nEvents)
    print(ams_score)
    logfile.write(filename + ': ' + str(ams_score) + '\n')
    return -1.0 * float(ams_score)
def runAdaBoost(arr):  # arr = [depth, n_est, lrn_rate], scaled for the scipy optimiser
    # ada = AdaBoostClassifier(n_estimators=100)
    global file_dir, nEvents, solutionFile, counter
    print('iteration number ' + str(counter))
    counter += 1
    depth = int(arr[0] * 100)
    n_est = int(arr[1] * 100)
    lrn_rate = arr[2]
    if depth <= 0 or n_est <= 0 or lrn_rate <= 0:
        return 100
    fname = 'ada_dep' + str(depth) + '_est' + str(n_est) + '_lrn' + str(lrn_rate)
    filename = fname
    ada = AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=depth),
                             algorithm="SAMME", n_estimators=n_est)  # n_jobs=4
    print("AdaBoost training")
    ada.fit(sigtr[train_input].values, sigtr['Label'].values)
    print("AdaBoost testing")
    ada_pred = ada.predict(sigtest[train_input].values)
    solnFile(filename, ada_pred, sigtest['EventId'].values)
    print("AdaBoost finished")
    ams_score = ams.AMS_metric(solutionFile, file_dir + fname + '.out', nEvents)
    print(ams_score)
    logfile.write(fname + ': ' + str(ams_score) + '\n')
    return -1.0 * float(ams_score)  # negated since we are minimising
def plot_adaboost():
    X, y = make_moons(noise=0.3, random_state=0)

    # Create an AdaBoosted decision tree and boost it manually one stage
    # at a time via the private _boost API so each round can be plotted.
    est = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                             algorithm="SAMME.R",
                             n_estimators=200)
    sample_weight = np.empty(X.shape[0], dtype=float)
    sample_weight[:] = 1. / X.shape[0]
    est._validate_estimator()
    est.estimators_ = []
    est.estimator_weights_ = np.zeros(4, dtype=float)
    est.estimator_errors_ = np.ones(4, dtype=float)

    plot_step = 0.02
    # Plot the decision boundaries
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    fig, axes = plt.subplots(1, 4, figsize=(14, 4), sharey=True)
    c = lambda a, b, d: [x / 254.0 for x in (a, b, d)]
    colors = [c(215, 25, 28),
              c(253, 174, 97),
              c(255, 255, 191),
              c(171, 217, 233),
              c(44, 123, 182)]

    for i, ax in enumerate(axes):
        sample_weight, estimator_weight, estimator_error = est._boost(i, X, y, sample_weight)
        est.estimator_weights_[i] = estimator_weight
        est.estimator_errors_[i] = estimator_error
        sample_weight /= np.sum(sample_weight)

        Z = est.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z,
                    cmap=matplotlib.colors.ListedColormap([colors[1], colors[-2]]),
                    alpha=1.0)
        ax.axis("tight")

        # Plot the training points, sized by their current boosting weight
        ax.scatter(X[:, 0], X[:, 1],
                   c=np.array([colors[0], colors[-1]])[y],
                   s=20 + (200 * sample_weight) ** 2,
                   cmap=plt.cm.Paired)
        ax.set_xlim(x_min, x_max)
        ax.set_ylim(y_min, y_max)
        ax.set_xlabel('$x_0$')
        if i == 0:
            ax.set_ylabel('$x_1$')

    plt.tight_layout()
    plt.show()
class AdaBoostcls(object):
    """Thin wrapper around sklearn's AdaBoostClassifier."""

    def __init__(self):
        self.adaboost_cls = AdaBoostClassifier()
        self.prediction = None
        self.train_x = None
        self.train_y = None

    def train_model(self, train_x, train_y):
        try:
            self.train_x = train_x
            self.train_y = train_y
            self.adaboost_cls.fit(train_x, train_y)
        except Exception:
            print(traceback.format_exc())

    def predict(self, test_x):
        try:
            self.test_x = test_x
            self.prediction = self.adaboost_cls.predict(test_x)
            return self.prediction
        except Exception:
            print(traceback.format_exc())

    def accuracy_score(self, test_y):
        try:
            # return r2_score(test_y, self.prediction)
            return self.adaboost_cls.score(self.test_x, test_y)
        except Exception:
            print(traceback.format_exc())
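A minimal usage sketch for the wrapper above; the toy dataset is an illustrative assumption.

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)

model = AdaBoostcls()
model.train_model(X_tr, y_tr)
preds = model.predict(X_te)          # also caches test_x for scoring
print(model.accuracy_score(y_te))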
def main():
    trainset = np.genfromtxt(open('train.csv', 'r'), delimiter=',')[1:]
    X = np.array([x[1:8] for x in trainset])
    y = np.array([x[8] for x in trainset])
    # print X, y

    import math
    # Replace missing values with a fill constant.
    for i, x in enumerate(X):
        for j, xx in enumerate(x):
            if math.isnan(xx):
                X[i][j] = 26.6

    testset = np.genfromtxt(open('test.csv', 'r'), delimiter=',')[1:]
    test = np.array([x[1:8] for x in testset])
    for i, x in enumerate(test):
        for j, xx in enumerate(x):
            if math.isnan(xx):
                test[i][j] = 26.6

    X, test = decomposition_pca(X, test)

    bdt = AdaBoostClassifier(
        base_estimator=KNeighborsClassifier(n_neighbors=20, algorithm='auto'),
        algorithm="SAMME",
        n_estimators=200)
    bdt.fit(X, y)

    print('PassengerId,Survived')
    for i, t in enumerate(test):
        # predict() expects a 2-D array, so wrap the single sample.
        print('%d,%d' % (i + 892, int(bdt.predict([t])[0])))
class DomainTypeClassifier(object):

    def __init__(self, radius, window_mode=False):
        self.classifier = AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=2),
            n_estimators=20,
            learning_rate=1,
            algorithm="SAMME")  # svm.SVC(kernel='rbf') was an alternative
        self.radius = radius
        self.window_mode = window_mode

    def train(self, dataset):
        k = self.radius if not self.window_mode else 2 * self.radius + 1
        rin, rout = dataset.getData(k, self.window_mode)
        print("fitting", len(rin))
        self.classifier.fit(np.asarray(rin, float), np.asarray(rout, float))

    def predict(self, ns):
        k = self.radius if not self.window_mode else 2 * self.radius + 1
        to_predict = []
        for i in range(len(ns)):
            if not self.window_mode:
                to_predict.append(encode(create_region(ns, i, k)))
            else:
                if i > len(ns) - k:
                    break
                to_predict.append(encode(ns[i:i + k]))
        # Return the most common predicted label over all regions.
        return int(Counter(self.classifier.predict(
            np.asarray(to_predict, float))).most_common(1)[0][0])
def ADA_Classifier(X_train, X_cv, X_test, Y_train, Y_cv, Y_test, Actual_DS):
    print("***************Starting AdaBoost Classifier***************")
    t0 = time()
    clf = AdaBoostClassifier(n_estimators=300)
    clf.fit(X_train, Y_train)
    preds = clf.predict(X_cv)
    score = clf.score(X_cv, Y_cv)

    print("AdaBoost Classifier - {0:.2f}%".format(100 * score))
    Summary = pd.crosstab(label_enc.inverse_transform(Y_cv),
                          label_enc.inverse_transform(preds),
                          rownames=['actual'], colnames=['preds'])
    Summary['pct'] = (Summary.divide(Summary.sum(axis=1), axis=1)).max(axis=1) * 100
    print(Summary)

    # Check with log loss function
    epsilon = 1e-15
    # ll_output = log_loss_func(Y_cv, preds, epsilon)
    preds2 = clf.predict_proba(X_cv)
    ll_output2 = log_loss(Y_cv, preds2, eps=1e-15, normalize=True)
    print(ll_output2)
    print("done in %0.3fs" % (time() - t0))

    preds3 = clf.predict_proba(X_test)
    # preds4 = clf.predict_proba(Actual_DS.ix[:, 'feat_1':])
    preds4 = clf.predict_proba(Actual_DS)
    print("***************Ending AdaBoost Classifier***************")
    return pd.DataFrame(preds2), pd.DataFrame(preds3), pd.DataFrame(preds4)
def prediction(feat, label):
    # train_test_split now lives in sklearn.model_selection
    x_train, x_test, y_train, y_test = train_test_split(
        feat, label, test_size=0.25, random_state=0)
    num_leaves = []
    accuracy_score = []
    auc_score = []
    # Plain decision trees were tried first:
    # for depth in range(1, 10):
    #     clf = tree.DecisionTreeClassifier(max_depth=depth)
    #     clf.fit(x_train, y_train)
    #     predictions = clf.predict(x_test)
    #     accuracy = clf.score(x_test, y_test)
    #     auc = metrics.roc_auc_score(y_test, predictions)
    #     num_leaves.append(depth)
    #     accuracy_score.append(accuracy)
    #     auc_score.append(auc)
    for depth in range(1, 10):
        clf = AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=depth),
                                 n_estimators=100)
        clf.fit(x_train, y_train)
        predictions = clf.predict(x_test)
        accuracy = clf.score(x_test, y_test)
        auc = metrics.roc_auc_score(y_test, predictions)
        num_leaves.append(depth)
        accuracy_score.append(accuracy)
        auc_score.append(auc)
    return num_leaves, accuracy_score, auc_score
def adaBoost(n, x, t, x_test, t_test):
    clf = AdaBoostClassifier(n_estimators=n)
    clf.fit(x, t)
    predictions = clf.predict(x_test)
    X = confusion_matrix(t_test, predictions)
    # Classification rate = (TP + TN) / total; return the error rate.
    classificationRate = (X[1, 1] + X[0, 0]) / sum(sum(X))
    return 1 - classificationRate
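An illustrative call to adaBoost() above, sweeping the number of boosting rounds on a toy binary dataset (the dataset choice is an assumption for demonstration):

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
x_tr, x_te, y_tr, y_te = train_test_split(X, y, random_state=0)
for n in (10, 50, 100):
    # Prints the test error rate for each estimator count.
    print(n, adaBoost(n, x_tr, y_tr, x_te, y_te))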
def ada(xtrain, ytrain, train_weight, tests, test_weight):
    # Initiate the training model
    clf = AdaBoostClassifier()
    mistakes = 0
    cost = 0
    # Fit the model
    clf.fit(xtrain, ytrain)
    vector_count = 0
    # Iterate over the tests
    for i in range(len(tests)):
        # Get the number of elements in each test
        vector_count += len(tests[i])
        test_count = 0
        # Iterate over each feature vector in the tests
        for vector in tests[i]:
            # Predict on the single sample (predict expects a 2-D array)
            prediction = clf.predict([vector])
            # Determine the cost
            cost += test_weight[i][test_count] * pen[i][prediction[0]]
            # Count the number of mistakes
            if pen[i][prediction[0]] > 0:
                # print("Incorrectly Predicted " + str(Segments.reverse_mapping[i]) +
                #       " as " + str(Segments.reverse_mapping[prediction[0]]))
                mistakes += 1
            test_count += 1
    print("Number of mistakes: " + str(mistakes) + " of " +
          str(vector_count) + ", " +
          str((1. - float(mistakes) / float(vector_count)) * 100) +
          "% accurate")
    return cost
class AdaBoost:

    def __init__(self, data, n_estimators=50, learning_rate=1.0):
        features, weights, labels = data
        self.clf = AdaBoostClassifier(n_estimators=n_estimators,
                                      learning_rate=learning_rate)
        self.predictions, self.trnaccuracy, self.tstaccuracy = None, None, None
        self.dataset = split_dataset(features, weights, labels)

    def train(self):
        """Train AdaBoost on the higgs dataset."""
        self.clf = self.clf.fit(self.dataset['training']['features'],
                                self.dataset['training']['labels'])

    def predict(self):
        """Predict labels using AdaBoost."""
        self.predictions = self.clf.predict(self.dataset['test']['features'])

    def evaluate(self):
        self.trnaccuracy = self.clf.score(self.dataset['training']['features'],
                                          self.dataset['training']['labels'],
                                          sample_weight=self.dataset['training']['weights'])
        self.tstaccuracy = self.clf.score(self.dataset['test']['features'],
                                          self.dataset['test']['labels'],
                                          sample_weight=self.dataset['test']['weights'])
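A hedged driver sketch for the wrapper above. split_dataset is assumed (as in the original) to return the nested training/test dict the class reads; load_higgs is a purely hypothetical loader standing in for whatever produces (features, weights, labels).

# load_higgs is hypothetical; substitute your own (features, weights, labels) source.
features, weights, labels = load_higgs()
model = AdaBoost((features, weights, labels), n_estimators=100, learning_rate=0.5)
model.train()
model.predict()
model.evaluate()
print(model.trnaccuracy, model.tstaccuracy)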
def some(X, Y, X_test, Y_test):
    ada = AdaBoostClassifier()
    print("Train Model ---")
    t1 = time()
    ada.fit(X, Y)
    t2 = time()
    print("Model Trained ----------", t2 - t1)
    test_errors = []
    cur = 1
    Y_test2 = []
    for k in Y_test:
        Y_test2.append(k[0])
    print("Testing: ")
    print(Y_test2)
    pred = ada.predict(X_test)
    print(pred)
    accu = 1. - accuracy_score(y_true=Y_test2, y_pred=pred)
    print(accu)
    print("STAGED _____________")
    # Error after each boosting stage
    for test_predict in ada.staged_predict(X_test):
        test_errors.append(1. - accuracy_score(test_predict, Y_test2))
    print("errors: ")
    print(test_errors)
def trainClassifier(dataDir, trialName, NUMFISH):
    ch = circularHOGExtractor(6, 4, 3)
    nFeats = ch.getNumFields() + 1
    trainData = np.array([])   # np.zeros((len(lst0)+len(lst0c)+len(lst1), nFeats))
    targetData = np.array([])  # np.hstack((np.zeros(len(lst0)+len(lst0c)), np.ones(len(lst1))))
    for tr in range(NUMFISH):
        directory = dataDir + '/process/' + trialName + '/FR_ID' + str(tr) + '/'
        files = [name for name in os.listdir(directory)]
        thisData = np.zeros((len(files), nFeats))
        thisTarget = tr * np.ones(len(files))
        i = 0
        for imName in files:
            sample = cv2.imread(directory + imName)
            thisIm = cv2.cvtColor(sample, cv2.COLOR_BGR2GRAY)
            thisData[i, :] = np.hstack((ch.extract(thisIm), np.mean(thisIm)))
            i = i + 1
        trainData = np.vstack((trainData, thisData)) if trainData.size else thisData
        targetData = np.hstack((targetData, thisTarget)) if targetData.size else thisTarget

    # clf = svm.SVC()
    clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                             algorithm="SAMME", n_estimators=50)
    clf.fit(trainData, targetData)
    pickle.dump(clf, open(dataDir + '/process/' + trialName + '/boost' + trialName + '.p', "wb"))
    y_pred = clf.predict(trainData)
    print("Number of mislabeled points out of a total %d points : %d"
          % (trainData.shape[0], (targetData != y_pred).sum()))
def main(): print("gradient boosting classifier!") X,Y,Xtest = importdata() print(Y.shape) param_grid={ "n_estimators":[10,100,200,2000,20000], "base_estimator__n_estimators":[10,20,50,100,200], "base_estimator__min_samples_split":[5,10,20,50] } ab=AdaBoostClassifier(RandomForestClassifier()) Gridsearch_impl(X,Y,ab,param_grid,5) # for i in range(10,11,5): # clf = DecisionTreeClassifier(min_samples_split=i) # rf = RandomForestClassifier(n_estimators = 100,random_state=0,min_samples_split=i) # ab = AdaBoostClassifier(rf,n_estimators = 10) #ab = GradientBoostingClassifier(n_estimators = 100) # score = cross_validation.cross_val_score(ab,X,Y,cv=3) # print(score) # print("average score %f"%np.mean(score)) # print("std %f"%np.std(score)) # ab.fit(X,Y) Ytest = ab.predict(Xtest) output(Ytest,'submit3.csv')
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification

clf = AdaBoostClassifier(n_estimators=100, random_state=0)
clf.fit(train_notes_set[['Variance', 'Skewness', 'Kurtosis', 'Entropy']],
        train_notes_set['Class'])

# In[75]:

clf.feature_importances_

# In[76]:

clf.predict(test_notes_set[['Variance', 'Skewness', 'Kurtosis', 'Entropy']])

# In[60]:

clf.score(test_notes_set[['Variance', 'Skewness', 'Kurtosis', 'Entropy']],
          test_notes_set['Class'])

# In[61]:

confusion_matrix(test_notes_set['Class'],
                 clf.predict(test_notes_set[['Variance', 'Skewness', 'Kurtosis', 'Entropy']]))

# In[62]:
    dummy1.append(get_lexicon_value(text[i], term_Loc[i]))

for i in dummy1:
    val = TextBlob(i).sentiment
    pol1.append(val[0])
    sub1.append(val[1])

df['Pol'] = np.array(pol1).reshape(-1, 1)
df['Sub'] = np.array(sub1).reshape(-1, 1)
# X = df['Pol'].reshape(-1, 1)
X1 = df[['Pol', 'Sub']]
Xnew = df[['Pol', 'Sub']]
bdt_discrete.fit(X, Y)
result = bdt_discrete.predict(Xnew)
print(result)

# In[56]:

# Write the results to a file (change the file name as needed).
f = open('output.txt', 'a')
id = df.ID
for i in range(len(df['ID'])):
    f.write(str(id[i]) + ";;" + str(result[i]) + '\n')
f.close()
        auxp.append(i)

print('Loss with a random label vector: ', hinge_loss(auxy, auxW))
print('Loss with the obtained values: ', hinge_loss(auxy, auxp))

# AdaBoost
input("Press a key to move on to the next model")
clf = AdaBoostClassifier(n_estimators=25)
clf.fit(trainx, trainy)
print('\nADABOOST results: ')
print('Accuracy on the training values: ', clf.score(trainx, trainy))
print('Accuracy on the test values: ', clf.score(testx, testy))
pred = clf.predict(testx)
auxp = []
for i in pred:
    if i == 0:
        auxp.append(-1)
    else:
        auxp.append(i)
print('Loss with a random label vector: ', hinge_loss(auxy, auxW))
print('Loss with the obtained values: ', hinge_loss(auxy, auxp))

# Random Forest
input("Press a key to move on to the next model")
rf = RandomForestClassifier(n_estimators=1,
# random forest
pred_scores = []
for i in range(2, 36):
    rfc = RandomForestClassifier(n_estimators=i, random_state=111)
    rfc.fit(x_train1, y_train1)
    pred = rfc.predict(x_test1)
    pred_scores.append((i, [accuracy_score(y_test1, pred)]))
# DataFrame.from_items was removed from pandas; from_dict is the equivalent.
df = pd.DataFrame.from_dict(dict(pred_scores), orient='index', columns=['Score'])
df[df['Score'] == df['Score'].max()]

# AdaBoosting
pred_scores = []
for i in range(25, 76):
    abc = AdaBoostClassifier(n_estimators=i, random_state=111)
    abc.fit(x_train1, y_train1)
    pred = abc.predict(x_test1)
    pred_scores.append((i, [accuracy_score(y_test1, pred)]))
df = pd.DataFrame.from_dict(dict(pred_scores), orient='index', columns=['Score'])
df[df['Score'] == df['Score'].max()]

# bagging
pred_scores = []
for i in range(2, 21):
    bc = BaggingClassifier(n_estimators=i, random_state=111)
    bc.fit(x_train1, y_train1)
    pred = bc.predict(x_test1)
    pred_scores.append((i, [accuracy_score(y_test1, pred)]))
df = pd.DataFrame.from_dict(dict(pred_scores), orient='index', columns=['Score'])
df[df['Score'] == df['Score'].max()]

# TfidfVectorizer, tune parameters for each algorithm
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=1,
                                                    stratify=y)

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

tree = DecisionTreeClassifier(criterion='entropy',
                              random_state=1,
                              max_depth=1)
ada = AdaBoostClassifier(base_estimator=tree,
                         n_estimators=500,
                         learning_rate=0.1,
                         random_state=1)

from sklearn.metrics import accuracy_score

# Decision tree (stump) accuracy
tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)
tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
print('Decision tree train/test accuracies %.3f/%.3f' % (tree_train, tree_test))

# AdaBoost accuracy
ada = ada.fit(X_train, y_train)
y_train_pred = ada.predict(X_train)
y_test_pred = ada.predict(X_test)
ada_train = accuracy_score(y_train, y_train_pred)
ada_test = accuracy_score(y_test, y_test_pred)
print('AdaBoost train/test accuracies %.3f/%.3f' % (ada_train, ada_test))
data = pd.read_csv(filename).dropna()
feature_name = data.columns[2:-1]
data = data.values
seed(0)
name = data[:, 0]
y = data[:, 1] == 'EUROPE'
# make class labels +-1
y = y.astype('int') * 2 - 1
X = data[:, 2:].astype('float')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

algo = LogisticRegression()
model = AdaBoostClassifier(base_estimator=algo, n_estimators=10)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(accuracy_score(y_pred, y_test))

model.estimator_weights_

# Recover AdaBoost's final sample weights from the fitted stages:
# log w_i = -sum_j alpha_j * y_i * h_j(x_i)
log_w_i = np.zeros(len(y_train), dtype='float')
for i in range(len(log_w_i)):
    item = np.reshape(X_train[i, :], (1, 18))
    for j in range(len(model.estimators_)):
        y_pred = model.estimators_[j].predict(item)
        alpha = model.estimator_weights_[j]
        log_w_i[i] -= alpha * y_train[i] * y_pred[0]
w_i = np.exp(log_w_i)
w_i = w_i / np.sum(w_i)
# Flag points whose weight exceeds mean + 3*std of the non-extreme weights.
not_outliers = w_i < np.mean(np.sort(w_i)[:-20]) + 3 * np.std(np.sort(w_i)[:-20])
n = 25
clf1 = ExtraTreesClassifier(n_estimators=n)
# print("Beginning model training.")
clf1.fit(X_train, y_train)
# print("Model training completed.")

# Use the trained classifier to make predictions on the test data
predictions_etree = clf1.predict(X_test)
# print("Predictions on testing data computed.")

# Print the accuracy (percentage of phishing websites correctly predicted)
accuracy = 100.0 * accuracy_score(y_test, predictions_etree)
print("The accuracy of your extra-trees classifier on testing data is: " + str(accuracy))
print()

print("===============AdaBoost===============\n")
clf2 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=n)
# print("Beginning model training.")
clf2.fit(X_train, y_train)
# print("Model training completed.")

# Use the trained classifier to make predictions on the test data
predictions = clf2.predict(X_test)
# print("Predictions on testing data computed.")

# Print the accuracy (percentage of phishing websites correctly predicted)
accuracy = 100.0 * accuracy_score(y_test, predictions)
print("The accuracy of your AdaBoost classifier on testing data is: " + str(accuracy))
print()
def main():
    # load pickles
    arxiv_11 = pickle.load(open("2011_big_pop.p", "rb"))
    arxiv_12 = pickle.load(open("2012_big_pop.p", "rb"))
    print("loaded pickles")

    # build doc set
    doc_set = arxiv_11['astro'] + arxiv_11['cond'] + \
        arxiv_11['cs'] + arxiv_11['hep'] + \
        arxiv_11['math'] + arxiv_11['physics'] + \
        arxiv_11['quant'] + arxiv_11['stat']
    label_set = [1]*len(arxiv_11['astro']) + [2]*len(arxiv_11['cond']) + \
        [3]*len(arxiv_11['cs']) + [4]*len(arxiv_11['hep']) + \
        [5]*len(arxiv_11['math']) + [6]*len(arxiv_11['physics']) + \
        [7]*len(arxiv_11['quant']) + [8]*len(arxiv_11['stat'])

    # list for tokenized documents in loop
    texts = tokenize(doc_set)

    # turn our tokenized documents into an id <-> term dictionary
    dictionary = corpora.Dictionary(texts)

    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]

    # generate LDA model
    num_topics = 450
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics,
                                               id2word=dictionary, passes=20)
    print("LDA built")
    # print(ldamodel.print_topics(num_topics=2, num_words=3))
    # look at the topic proportions of one document:
    # print(ldamodel[dictionary.doc2bow(texts[0])])

    # build topic proportion matrix
    topicPropArray = np.zeros((len(texts), num_topics))
    for i in range(len(texts)):
        text = texts[i]
        textProp = ldamodel[dictionary.doc2bow(text)]
        for pair in textProp:
            topicIdx = pair[0]
            weight = pair[1]
            topicPropArray[i, topicIdx] = weight
    print("matrix built")
    print("------------------")
    print("testing")

    # test on new data: the first 100 documents per class
    test_set = arxiv_12['astro'][0:100] + arxiv_12['cond'][0:100] + \
        arxiv_12['cs'][0:100] + arxiv_12['hep'][0:100] + \
        arxiv_12['math'][0:100] + arxiv_12['physics'][0:100] + \
        arxiv_12['quant'][0:100] + arxiv_12['stat'][0:100]
    print("test_set length : " + str(len(test_set)))
    test_label = [1]*100 + [2]*100 + [3]*100 + [4]*100 + [5]*100 + \
        [6]*100 + [7]*100 + [8]*100
    print("test_label length : " + str(len(test_label)))
    test_texts = tokenize(test_set)

    # build test features
    testPropArray = np.zeros((800, num_topics))
    for i in range(len(test_texts)):
        test = test_texts[i]
        testProp = ldamodel[dictionary.doc2bow(test)]
        for pair in testProp:
            topicIdx = pair[0]
            weight = pair[1]
            testPropArray[i, topicIdx] = weight

    # all testing
    X_train, X_test, y_train, y_test = topicPropArray, testPropArray, label_set, test_label
    print("training_array length: " + str(len(topicPropArray)))
    print("test_array length: " + str(len(testPropArray)))
    print("training_label length: " + str(len(label_set)))
    print("test_label length: " + str(len(test_label)))
    print('--------------------------------')

    # class labels are the integers 1..8
    class_labels = [1, 2, 3, 4, 5, 6, 7, 8]

    # knn3
    knn3 = KNeighborsClassifier(n_neighbors=3)
    knn3.fit(X_train, y_train)
    predictions = knn3.predict(X_test)
    cm = confusion_matrix(y_test, predictions, labels=class_labels)
    np.savetxt('knn3pred.csv', predictions.astype(int), fmt='%i', delimiter=",")
    np.savetxt('knn3cm.txt', cm.astype(int), fmt='%i', delimiter=",")
    print('knn3')
    print(zero_one_loss(predictions, y_test))
    print('--------------------------------')

    # knn5
    knn5 = KNeighborsClassifier(n_neighbors=5)
    knn5.fit(X_train, y_train)
    predictions = knn5.predict(X_test)
    cm = confusion_matrix(y_test, predictions, labels=class_labels)
    np.savetxt('knn5pred.csv', predictions.astype(int), fmt='%i', delimiter=",")
    np.savetxt('knn5cm.txt', cm.astype(int), fmt='%i', delimiter=",")
    print('knn5')
    print(zero_one_loss(predictions, y_test))
    print('--------------------------------')

    # svmlin
    svmlin = svm.SVC(kernel='linear')
    svmlin.fit(X_train, y_train)
    predictions = svmlin.predict(X_test)
    cm = confusion_matrix(y_test, predictions, labels=class_labels)
    np.savetxt('svmlinpred.csv', predictions.astype(int), fmt='%i', delimiter=",")
    np.savetxt('svmlincm.txt', cm.astype(int), fmt='%i', delimiter=",")
    print('svmlin')
    print(zero_one_loss(predictions, y_test))
    print('--------------------------------')

    # gnb
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    predictions = gnb.predict(X_test)
    cm = confusion_matrix(y_test, predictions, labels=class_labels)
    np.savetxt('gnbpred.csv', predictions.astype(int), fmt='%i', delimiter=",")
    np.savetxt('gnbcm.txt', cm.astype(int), fmt='%i', delimiter=",")
    print('gnb')
    print(zero_one_loss(predictions, y_test))
    print('--------------------------------')

    # rf50
    rf50 = RandomForestClassifier(n_estimators=50)
    rf50.fit(X_train, y_train)
    predictions = rf50.predict(X_test)
    cm = confusion_matrix(y_test, predictions, labels=class_labels)
    np.savetxt('rf50pred.csv', predictions.astype(int), fmt='%i', delimiter=",")
    np.savetxt('rf50cm.txt', cm.astype(int), fmt='%i', delimiter=",")
    print('rf50')
    print(zero_one_loss(predictions, y_test))
    print('--------------------------------')

    # dtree ada
    ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                             n_estimators=400,
                             learning_rate=1,
                             algorithm="SAMME",
                             random_state=None)
    ada.fit(X_train, y_train)
    predictions = ada.predict(X_test)
    cm = confusion_matrix(y_test, predictions, labels=class_labels)
    np.savetxt('adapred.csv', predictions.astype(int), fmt='%i', delimiter=",")
    np.savetxt('adacm.txt', cm.astype(int), fmt='%i', delimiter=",")
    print('ada')
    print(zero_one_loss(predictions, y_test))
    print('--------------------------------')
merge_data = np.concatenate([train_output, train_data], axis=1)
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2,
                                                min_samples_split=20,
                                                min_samples_leaf=5),
                         algorithm="SAMME",
                         n_estimators=200,
                         learning_rate=0.8)
bdt.fit(merge_data, train_lable)
# ---------------------------------
# -------------test----------------
test_data = []
test_lable = []
for i in range(len(test_list)):
    if i not in random_num:
        test_data.append(x_train[test_list[i]])
        test_lable.append(y_train[test_list[i]])
test_data = np.array(test_data)
test_lable = np.array(test_lable)
print('train_data:', train_data.shape)

# Encode the test data with the trained autoencoder's hidden layer.
encode_train_output = K.function([model.layers[0].input],
                                 [model.layers[2].output])
test_output = encode_train_output([test_data, 0])[0]
print('encode_output:', test_output.shape)
print(test_output)

merge_data = np.concatenate([test_output, test_data], axis=1)
pred = bdt.predict(merge_data)
print(pred)
Evaluate_Function.Evaluate_Fun(pred, test_lable, merge_data)
# mnb.fit(x_train, y_train)
knc.fit(x_train, y_train)
dtc.fit(x_train, y_train)
rfc.fit(x_train, y_train)
gbc.fit(x_train, y_train)
abc.fit(x_train, y_train)
svc.fit(x_train, y_train)
gnb.fit(x_train, y_train)
LR.fit(x_train, y_train)

# y_predict_mnb = mnb.predict(x_test)
y_predict_knc = knc.predict(x_test)
y_predict_dtc = dtc.predict(x_test)
y_predict_rfc = rfc.predict(x_test)
y_predict_gbc = gbc.predict(x_test)
y_predict_abc = abc.predict(x_test)
y_predict_svc = svc.predict(x_test)
y_predict_gnb = gnb.predict(x_test)
y_predict_lr = LR.predict(x_test)

from sklearn.metrics import classification_report

print('\n1:')
print("DTC confusion_matrix:\n", confusion_matrix(y_test, y_predict_dtc))
print("\nKNC confusion_matrix:\n", confusion_matrix(y_test, y_predict_knc))
print("\nRFC confusion_matrix:\n", confusion_matrix(y_test, y_predict_rfc))
print("\nGBC confusion_matrix:\n", confusion_matrix(y_test, y_predict_gbc))
print("\nAda confusion_matrix:\n", confusion_matrix(y_test, y_predict_abc))
print("\nSVC confusion_matrix:\n", confusion_matrix(y_test, y_predict_svc))
print("\nGauNB confusion_matrix:\n", confusion_matrix(y_test, y_predict_gnb))
print("\nLR confusion_matrix:\n", confusion_matrix(y_test, y_predict_lr))
# --------------
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Code starts here
ada_model = AdaBoostClassifier(random_state=0)
ada_model.fit(X_train, y_train)
y_pred = ada_model.predict(X_test)
ada_score = accuracy_score(y_test, y_pred)
ada_cm = confusion_matrix(y_test, y_pred)
ada_cr = classification_report(y_test, y_pred)
print("Accuracy score :", ada_score)
print("Confusion Matrix :", ada_cm)
print("Classification Report :", ada_cr)

# --------------
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Parameter list
parameters = {
    'learning_rate': [0.1, 0.15, 0.2, 0.25, 0.3],
    'max_depth': range(1, 3)
def fxn():
    # read in the data
    df = pd.read_csv('data.csv')
    # columns to drop
    df = df.drop(['id'], axis=1)
    df = df.sample(frac=1)  # shuffle the rows

    # drop rows containing '?' and keep at most 1000 clean rows
    index = []
    count = 0
    for val in range(len(df.iloc[:, 0])):
        flag = False
        for column in df:
            if df[column][val] == '?':
                flag = True
                break
        if flag:
            continue
        if count < 1000:
            index.append(val)
            count += 1
    df = df[df.index.isin(index)]

    # integer-encode all non-numeric columns
    obj_df = df.select_dtypes(include=['object']).copy()
    for column in obj_df:
        le = preprocessing.LabelEncoder()
        le.fit(df[column])
        df[column] = le.transform(df[column])

    # normalize all points to [0, 1]
    x = df.values
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    df = pd.DataFrame(x_scaled)

    # In[589]:

    # make the dataset 1100 points: a 1000-point semi-supervised set
    # plus a held-out validation set
    train, test = np.split(df.sample(frac=1), [int(.8 * len(df))])
    train = train.values.tolist()
    test = test.values.tolist()
    df_unsupervised = []
    label_nolabels = {}
    for point in train:
        # unlabelled training data (features only); remember the true label
        df_unsupervised.append(point[1:])
        label_nolabels[tuple(point[1:])] = [point[0]]

    # In[590]:

    # Other clusterers tried here: SpectralClustering, MeanShift,
    # AgglomerativeClustering, DBSCAN, GaussianMixture, Birch,
    # BayesianGaussianMixture.

    # Fifty independently initialised 2-cluster KMeans models,
    # used as an unsupervised "worker" ensemble.
    classifiers = [
        KMeans(n_clusters=2, init='random', n_init=10).fit(np.asarray(df_unsupervised))
        for _ in range(50)
    ]

    # In[591]:

    # make a csv in the form rowNumber, clfNumber, clf prediction on that row
    answers = []
    for point in range(len(df_unsupervised)):
        for clf in range(len(classifiers)):
            answers.append([point, clf,
                            classifiers[clf].predict([df_unsupervised[point]])])
    count = 0
    f = open("answer_file.csv", "w")
    f.write('question,worker,answer;\n')
    for answer in answers:
        count += 1
        f.write(str(answer[0]) + ',' + str(answer[1]) + ',' +
                str(int(answer[2])) + '\n')
    f.close()
    p = open("result_file.csv", "w")
    p.close()

    # In[592]:

    # run VI BP
    import subprocess
    subprocess.call([
        "python", "run.py", "methods/c_EM/method.py",
        "answer_file.csv", "result_file.csv", "decision-making"
    ])

    # In[593]:

    # extract the results to get the noisy labels
    filepath = "result_file.csv"
    noisy_labels = []
    with open(filepath) as fp:
        for line in fp:
            questionAnswer = line.split(',')
            noisy_labels.append(questionAnswer)

    # In[594]:

    # assign each noisy label to its row
    df_noise_x = []
    df_noise_y = []
    for question in noisy_labels:
        if question[0].rstrip() == 'question':
            continue
        df_noise_x += [df_unsupervised[int(question[0].rstrip())]]
        df_noise_y.append(int(question[1].rstrip()))
    count_vi = 0
    for el in range(len(df_noise_x)):
        if label_nolabels[tuple(df_noise_x[el])][0] != df_noise_y[el]:
            count_vi += 1
    print(count_vi, len(df_noise_x))

    # In[595]:

    df_noise_y2 = []
    for el in df_noise_y:
        df_noise_y2.append(int(el))
    df_noise = []
    for el in range(len(df_noise_x)):
        new = df_noise_x[el]
        new.append(df_noise_y2[el])
        df_noise.append(new)
    # shuffle the data
    random.shuffle(df_noise)
    df_noise_x = []
    df_noise_y = []
    for row in df_noise:
        df_noise_x.append(row[:-1])
        df_noise_y.append(row[-1:][0])

    # In[596]:

    # run sklearn's AdaBoost on the noisy data
    bdt2 = AdaBoostClassifier(DecisionTreeClassifier(),
                              algorithm="SAMME",
                              n_estimators=20)
    bdt2.fit(df_noise_x, df_noise_y)

    # In[597]:

    # AdaBoost-on-noisy-data error rate
    errors = []
    count1 = 0
    for point in test:
        est = bdt2.predict([point[:-1]])
        true = int(point[-1:][0])
        est = int(est[0])
        if est == true:
            errors.append([point[:-1], 0])
        else:
            count1 += 1
            errors.append([point[:-1], 1])
    # error rate, noisy -> baseline
    return count1 / len(test)
# Random Forest Classifier (bootstrap aggregated decision trees)
RFmodel = RandomForestClassifier(max_depth=4, random_state=0)
RFmodel.fit(X_train, y_train)

# print feature importances; the higher the number, the more important
print(RFmodel.feature_importances_)

# print Random Forest prediction accuracy score
RFpred = RFmodel.predict(X_test)
print(accuracy_score(y_test, RFpred) * 100)
cm = pd.DataFrame(confusion_matrix(y_test, RFpred))
print(cm)

# AdaBoost (Boosted Tree)
ABmodel = AdaBoostClassifier()
ABmodel.fit(X_train, y_train)
ABpred = ABmodel.predict(X_test)
print(accuracy_score(y_test, ABpred) * 100)

# Compare Decision Tree, Random Forest, AdaBoost
DTtest = accuracy_score(y_test, pred) * 100
RFtest = accuracy_score(y_test, RFpred) * 100
ABtest = accuracy_score(y_test, ABpred) * 100
print("Prediction Accuracy Scores:")
print("Decision Tree: ", DTtest)
print("Random Forest: ", RFtest)
print("AdaBoost: ", ABtest)
y = df.Class
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30)

tfidf_vect = TfidfVectorizer(strip_accents=None,
                             lowercase=False,
                             preprocessor=None,
                             tokenizer=tokenizer_porter,
                             use_idf=True,
                             norm='l2',
                             smooth_idf=True,
                             stop_words=spanish_stopwords)
tfidf_train = tfidf_vect.fit_transform(x_train)
tfidf_test = tfidf_vect.transform(x_test)
tfidf_df = pd.DataFrame(tfidf_train.A, columns=tfidf_vect.get_feature_names())

Adab = AdaBoostClassifier(DecisionTreeClassifier(max_depth=10),
                          n_estimators=5, random_state=1)
Adab.fit(tfidf_train, y_train)
y_pred3 = Adab.predict(tfidf_test)
ABscore = metrics.accuracy_score(y_test, y_pred3)
print("accuracy: %0.3f" % ABscore)

# note: despite the file name, this pickles the AdaBoost model, not a plain tree
DecTree = open('DecTree.sav', 'wb')
pickle.dump(Adab, DecTree)
DecTree.close()
# Accuracy: 0.777
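# Usage sketch (an assumed follow-up, not in the original): reload the
# pickled model and score new text with the same fitted vectorizer.
with open('DecTree.sav', 'rb') as f:
    loaded_clf = pickle.load(f)
print(loaded_clf.predict(tfidf_vect.transform(x_test[:5])))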
def loadDataSet(fileName):
    # reconstructed header: the snippet starts mid-function; numFeat is the
    # number of tab-separated columns in the file
    numFeat = len(open(fileName).readline().split('\t'))
    dataMat = []
    labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = []
        curLine = line.strip().split('\t')
        for i in range(numFeat - 1):
            lineArr.append(float(curLine[i]))
        dataMat.append(lineArr)
        labelMat.append(float(curLine[-1]))
    return dataMat, labelMat


if __name__ == '__main__':
    # raw strings keep the backslashes in the Windows paths from being
    # treated as escape sequences
    dataArr, classLabels = loadDataSet(
        r'D:\Project\Machinelearning\Logistic\horseColicTraining.txt')
    testArr, testLabelArr = loadDataSet(
        r'D:\Project\Machinelearning\Logistic\horseColicTest.txt')
    bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
                             algorithm="SAMME", n_estimators=10)
    bdt.fit(dataArr, classLabels)
    predictions = bdt.predict(dataArr)
    errArr = np.mat(np.ones((len(dataArr), 1)))
    print('Training set error rate: %.3f%%' %
          float(errArr[predictions != classLabels].sum() / len(dataArr) * 100))
    predictions = bdt.predict(testArr)
    errArr = np.mat(np.ones((len(testArr), 1)))
    print('Test set error rate: %.3f%%' %
          float(errArr[predictions != testLabelArr].sum() / len(testArr) * 100))
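    # Equivalent check via sklearn (a sketch, not in the original): the test
    # error rate is just 1 - accuracy.
    from sklearn.metrics import accuracy_score
    print('Test error rate via accuracy_score: %.3f%%' %
          ((1 - accuracy_score(testLabelArr, bdt.predict(testArr))) * 100))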
def train_bdt():
    print("Loading data...")
    if SMALL_DATA:
        signal, bkg2nu, bkg214Bi, bkg208Tl, bkgRn = import_data_small()
    else:
        signal, bkg2nu, bkg214Bi, bkg208Tl, bkgRn = import_data()

    # print("Sampling 10% of the data for training")
    # # Create smaller samples, 10% of the size
    # signal = np.asarray(random.sample(signal, int(len(signal) * 0.1)))
    # bkg2nu = np.asarray(random.sample(bkg2nu, int(len(bkg2nu) * 0.1)))
    # bkg214Bi = np.asarray(random.sample(bkg214Bi, int(len(bkg214Bi) * 0.1)))
    # bkg208Tl = np.asarray(random.sample(bkg208Tl, int(len(bkg208Tl) * 0.1)))
    # bkgRn = np.asarray(random.sample(bkgRn, int(len(bkgRn) * 0.1)))

    print("Creating arrays...")
    # X = features (i.e. the data)
    X = np.concatenate((signal, bkg2nu, bkg214Bi, bkg208Tl, bkgRn))
    # y = labels (i.e. what it is: signal / background)
    y = np.concatenate((np.ones(signal.shape[0]),
                        np.zeros(bkg2nu.shape[0]),
                        np.zeros(bkg214Bi.shape[0]),
                        np.zeros(bkg208Tl.shape[0]),
                        np.zeros(bkgRn.shape[0])))

    print("Splitting Data...")
    # Split the data
    X_dev, X_eval, y_dev, y_eval = train_test_split(X, y, test_size=0.33,
                                                    random_state=48)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                        random_state=42)

    # print("Oversampling...")
    # # Oversample to improve representation of backgrounds
    # ros = RandomOverSampler(random_state=0)
    # X_resampled, y_resampled = ros.fit_sample(X_train, y_train)
    # X_test_resampled, y_test_resampled = ros.fit_sample(X_test, y_test)
    # X_dev_resampled, y_dev_resampled = ros.fit_sample(X_dev, y_dev)
    # X_eval_resampled, y_eval_resampled = ros.fit_sample(X_eval, y_eval)
    # print(sorted(Counter(y_resampled).items()))

    print("Removing weights..")
    # Remove weights on backgrounds (will be passed in to the BDT later)
    # 30/09/19 - removed resampling
    X_train_weights = X_train[:, 6]
    X_train_new = np.delete(X_train, 6, axis=1)
    X_test_new = np.delete(X_test, 6, axis=1)
    X_dev_weights = X_dev[:, 6]
    X_dev_new = np.delete(X_dev, 6, axis=1)
    X_eval_new = np.delete(X_eval, 6, axis=1)

    print("Creating classifier for DT")
    # Create classifiers
    dt = DecisionTreeClassifier(max_depth=12, min_samples_split=0.5,
                                min_samples_leaf=400)
    print("Creating classifier for BDT")
    bdt = AdaBoostClassifier(dt, algorithm='SAMME', n_estimators=1200,
                             learning_rate=0.5)

    print("Fitting BDT...")
    # Train the classifier - pass in the weights extracted earlier
    fitted_tree = bdt.fit(X_train_new, y_train, sample_weight=X_train_weights)

    print("Predicting on training data...")
    # Use the fitted tree to predict on training data and new test data
    y_predicted_train = bdt.predict(X_train_new)
    print("Predicting on test data...")
    y_predicted_test = bdt.predict(X_test_new)

    print(classification_report(y_train, y_predicted_train,
                                target_names=["signal", "background"]))
    print("Area under ROC curve for training data: {0:.4f}".format(
        roc_auc_score(y_train, bdt.decision_function(X_train_new))))
    print(classification_report(y_test, y_predicted_test,
                                target_names=["signal", "background"]))
    print("Area under ROC curve for test data: {0:.4f}".format(
        roc_auc_score(y_test, bdt.decision_function(X_test_new))))

    plot_roc_curve(bdt, X_test_new, y_test)
    compare_train_test(bdt, X_train_new, y_train, X_test_new, y_test)

    print("Saving classifier...")
    save_path = BASE_PATH + 'ml_calculated_data/weight/'
    dump(bdt, save_path + 'bdt_classifier.joblib')
    dump(fitted_tree, save_path + 'bdt_fitted_tree.joblib')
    dump(X_train_new, save_path + 'bdt_X_train_new.joblib')
    dump(X_test_new, save_path + 'bdt_X_test_new.joblib')
    dump(X_dev_new, save_path + 'bdt_X_dev_new.joblib')
    dump(X_dev_weights, save_path + 'bdt_X_dev_weights.joblib')
    dump(X_eval_new, save_path + 'bdt_X_eval_new.joblib')
    dump(y_test, save_path + 'bdt_y_test.joblib')
    dump(y_train, save_path + 'bdt_y_train.joblib')
    dump(y_dev, save_path + 'bdt_y_dev.joblib')
    dump(y_eval, save_path + 'bdt_y_eval.joblib')
    print("Finished Training.")
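# Reload sketch (an assumed follow-up; mirrors the dump() calls above):
# from joblib import load
# bdt = load(BASE_PATH + 'ml_calculated_data/weight/bdt_classifier.joblib')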
# clf = KNeighborsClassifier(n_neighbors=5)

### Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
'''
Best model tested:
clf = AdaBoostClassifier(n_estimators=8, learning_rate=0.7, random_state=1)
features = ['expenses', 'exercised_stock_options', 'other', 'from_ratio']
P: 0.63 | R: 0.45
'''
# instantiate the best model described above so clf is defined before fitting
clf = AdaBoostClassifier(n_estimators=8, learning_rate=0.7, random_state=1)

# Example starting point. Try investigating other evaluation techniques!
# (sklearn.cross_validation became sklearn.model_selection in newer releases)
from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)

### Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
dump_classifier_and_data(clf, my_dataset, features_list)
def train_bdt_multiclass():
    print("Loading data...")
    if SMALL_DATA:
        signal, bkg2nu, bkg214Bi, bkg208Tl, bkgRn = import_data_small()
    else:
        signal, bkg2nu, bkg214Bi, bkg208Tl, bkgRn = import_data()

    print("Creating arrays...")
    # X = features (i.e. the data)
    X = np.concatenate((signal, bkg2nu, bkg214Bi, bkg208Tl, bkgRn))
    # y = labels, one class per signal/background type
    y = np.concatenate((np.ones(signal.shape[0]),
                        np.full(bkg2nu.shape[0], 2),
                        np.full(bkg214Bi.shape[0], 3),
                        np.full(bkg208Tl.shape[0], 4),
                        np.full(bkgRn.shape[0], 5)))

    print("Splitting Data...")
    # Split the data
    X_dev, X_eval, y_dev, y_eval = train_test_split(X, y, test_size=0.33,
                                                    random_state=48)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                        random_state=42)

    print("Creating classifier for DT")
    # Create classifiers
    dt = DecisionTreeClassifier(max_depth=12, min_samples_split=0.5,
                                min_samples_leaf=400)
    print("Creating classifier for BDT")
    bdt = AdaBoostClassifier(dt, algorithm='SAMME', n_estimators=1200,
                             learning_rate=0.5)

    print("Fitting BDT...")
    # Train the classifier - no weights here as this is a multiclass fit
    fitted_tree = bdt.fit(X_train, y_train)

    print("Predicting on training data...")
    # Use the fitted tree to predict on training data and new test data
    y_predicted_train = bdt.predict(X_train)
    print("Predicting on test data...")
    y_predicted_test = bdt.predict(X_test)

    print(classification_report(
        y_train, y_predicted_train,
        target_names=["signal", "2nu", "214Bi", "208Tl", "Radon"]))
    print("Area under ROC curve for training data: {0:.4f}".format(
        roc_auc_score(y_train, bdt.predict_proba(X_train),
                      average="weighted", multi_class="ovr")))
    print(classification_report(
        y_test, y_predicted_test,
        target_names=["signal", "2nu", "214Bi", "208Tl", "Radon"]))
    print("Area under ROC curve for test data: {0:.4f}".format(
        roc_auc_score(y_test, bdt.predict_proba(X_test),
                      average="weighted", multi_class="ovr")))

    plot_roc_curve(bdt, X_test, y_test)
    compare_train_test_multi(bdt, X_train, y_train, X_test, y_test)

    print("Saving classifier...")
    save_path = BASE_PATH + 'ml_calculated_data/multiClass/'
    dump(bdt, save_path + 'bdt_classifier.joblib')
    dump(fitted_tree, save_path + 'bdt_fitted_tree.joblib')
    dump(X_train, save_path + 'bdt_X_train.joblib')
    dump(X_test, save_path + 'bdt_X_test.joblib')
    dump(X_dev, save_path + 'bdt_X_dev.joblib')
    dump(X_eval, save_path + 'bdt_X_eval.joblib')
    dump(y_test, save_path + 'bdt_y_test.joblib')
    dump(y_train, save_path + 'bdt_y_train.joblib')
    dump(y_dev, save_path + 'bdt_y_dev.joblib')
    dump(y_eval, save_path + 'bdt_y_eval.joblib')
    print("Finished Training.")
# check the R^2 score
print("R2 score of the model is ", r2_score(y_test, y_pred4))
print("A negative value shows the model doesn't follow a linear trend")

# classification report
print("Classification report is given as ")
print(classification_report(y_test, y_pred4))

# F1-score
print("F1-score of the model is ", f1_score(y_test, y_pred4))

# using the AdaBoost classifier
from sklearn.ensemble import AdaBoostClassifier

classifier_ada = AdaBoostClassifier(n_estimators=1000, learning_rate=1,
                                    random_state=0)
classifier_ada.fit(X_train, y_train)
y_pred_ada = classifier_ada.predict(X_test)
print("accuracy of adaboost classifier is ",
      accuracy_score(y_test, y_pred_ada))

# hyperparameter tuning of the AdaBoost classifier
from sklearn.model_selection import GridSearchCV

parameters_ada = [{
    'n_estimators': [100, 200, 250, 1000, 10000],
    'learning_rate': [0.1, 0.2, 0.5, 0.8, 1, 2]
}]
grid_search_ada = GridSearchCV(estimator=classifier_ada,
                               param_grid=parameters_ada,
                               scoring='accuracy',
                               cv=10,
                               n_jobs=-1)
grid_search_ada.fit(X_train, y_train)
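# Follow-up sketch (standard GridSearchCV attributes): inspect the winning
# configuration after the fit above.
print("Best parameters: ", grid_search_ada.best_params_)
print("Best cross-validated accuracy: ", grid_search_ada.best_score_)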
# In[74]:

rfc = RandomForestClassifier(n_estimators=100, criterion='entropy')
# the original called grid.fit() here, but no grid search is defined in this
# cell; fit the random forest itself
rfc.fit(xtr, ytr)
print(confusion_matrix(yte, rfc.predict(xte)))

# In[69]:

# let's use AdaBoost

# In[71]:

from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(n_estimators=100, random_state=0)
clf.fit(xtr, ytr)
print(confusion_matrix(yte, clf.predict(xte)))

# In[72]:

# let's use gradient boosting

# In[73]:

from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier()
gbc.fit(xtr, ytr)
print(confusion_matrix(yte, gbc.predict(xte)))

# In[76]:

# svc
#                      param_grid={"n_estimators": range(500, 1501, 100)}, cv=3)
# # fit the training set
# model.fit(train_X, train_Y)
# print("The best parameters are: %s, with score %0.2f"
#       % (model.best_params_, model.best_score_))
model = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(
    splitter='random',
    max_features=90,
    max_depth=50,
    min_samples_split=6,
    min_samples_leaf=3),
    n_estimators=1200,
    learning_rate=0.005)

# fit the training set
model.fit(train_X, train_Y)

# predict on the training set
train_Y_hat = model.predict(train_X[idx])
print("Training set accuracy: ", accuracy_score(train_Y[idx], train_Y_hat))

# predict on the test set
test_Y_hat = model.predict(test_X)
print("Test set accuracy: ", accuracy_score(test_Y, test_Y_hat))
print("Total time: ", time() - t, "seconds")

# draw the ROC curve
n_class = len(np.unique(train_Y))
roc.drawROC(n_class, test_Y, test_Y_hat)

# load the CCPP data set to test AdaBoost's regression model
data = pd.read_excel("data/CCPP/Folds5x2_pp.xlsx")
# AT: temperature, V: exhaust vacuum, AP: ambient pressure,
# RH: relative humidity, PE: output power
# sample features X
X = data[['AT', 'V', 'AP', 'RH']]
# normalize the data
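# Hedged continuation: the comment above says CCPP is loaded to test the
# AdaBoost regression model; a minimal version of that test (the variable
# names below are assumptions, not from the original):
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

y_ccpp = data['PE']  # PE: output power, per the column note above
reg = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), n_estimators=100)
reg.fit(X, y_ccpp)
print("R^2 on the CCPP training data: ", reg.score(X, y_ccpp))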
accuracies['NB'] = totalScore
logLosses['NB'] = totalLogLoss

# predict category on testData
test = pd.read_csv(testLocation)

# predict clusters for test data
data2 = test.get(['X', 'Y'])
test_cluster_predict = est.predict(data2)
test['cluster_ids'] = test_cluster_predict
test_features, _ = dataMassaging(test)

# RF predict
classifier1 = RandomForestClassifier()
classifier1.set_params(min_samples_split=1000)
classifier1.fit(features, classes)
predictions1 = classifier1.predict(test_features)
visualizePrediction(predictions1)

# AdaBoost predict
classifier2 = AdaBoostClassifier(n_estimators=50)
classifier2.fit(features, classes)
predictions2 = classifier2.predict(test_features)
visualizePrediction(predictions2)

# NB predict
classifier3 = GaussianNB()
classifier3.fit(features, classes)
predictions3 = classifier3.predict(test_features)
visualizePrediction(predictions3)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_gaussian_quantiles

X1, y1 = make_gaussian_quantiles(cov=2.0, n_samples=500, n_features=2,
                                 n_classes=2, random_state=1)
X2, y2 = make_gaussian_quantiles(mean=(3, 3), cov=1.5, n_samples=400,
                                 n_features=2, n_classes=2, random_state=1)

# merge the two sets into one data set
X = np.concatenate((X1, X2))
y = np.concatenate((y1, -y2 + 1))
# plt.plot(X, y)
# plt.show()

# create the classifier
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2,
                                                min_samples_split=20,
                                                min_samples_leaf=5),
                         algorithm="SAMME",
                         n_estimators=200,
                         learning_rate=0.8)
bdt.fit(X, y)

# plot the decision boundary on a grid over the feature space
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                     np.arange(y_min, y_max, 0.02))
Z = bdt.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
plt.scatter(X[:, 0], X[:, 1], marker='o', c=y)
plt.show()
random_state=24)

# NOTE: change classifier here
clf = AdaBoostClassifier(n_estimators=500, algorithm='SAMME')

# training
st = time.time()
print("training started")
clf.fit(x_train, y_train)
print("training ended")
et = time.time()
tt = et - st
print("Training Time = " + str(tt) + "\n")

# predictions
pred = clf.predict(x_test)

# NOTE: change to decision_function or predict_proba depending on the classifier
y_score = clf.predict_proba(x_test)
# y_score = clf.decision_function(x_test)

#################################################################################

pp = PdfPages('results/EXP_Result.pdf')

# precision-recall plot
precision = dict()
recall = dict()
PR_area = dict()
PR_thresholds = dict()
average_precision = dict()
for i in range(n_classes):
    precision[i], recall[i], PR_thresholds[i] = precision_recall_curve(
        y_test[:, i], y_score[:, i])
def abc(self, dataset_array, label_array, data_teste):
    # fit an AdaBoost ensemble and return its predictions for the test data
    from sklearn.ensemble import AdaBoostClassifier
    clf = AdaBoostClassifier(n_estimators=300)
    clf.fit(dataset_array, label_array)
    return clf.predict(data_teste)
features = features.transpose()
arr2t = np.array(arr2)
arr2t = arr2t.transpose()
# print(features)
# print(arr2t)
# features.reshape
classLabelX = arr2t

clf = SVC(gamma='auto')
clf.fit(features, classLabelX)

clfAdaBoost = AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=1),
                                 algorithm="SAMME", n_estimators=200)
clfAdaBoost.fit(features, classLabelX)

pred = clf.predict(features)
predAb = clfAdaBoost.predict(features)
# print('Adaboost Predictions : ', predAb)

total = len(arr2)
# print("Total : ", total)
count = 0
for i in range(len(arr2)):
    if arr2[i] == predAb[i]:
        count += 1
print("Adaptive Boost accurate predictions : ", count, "/", total)
print("Adaptive Boost Accuracy : ", (count / total) * 100)
# print("--------------------------")
def boosting():
    # build and fit
    clf = DecisionTreeClassifier(criterion='entropy', max_depth=1)
    boost = AdaBoostClassifier(base_estimator=clf, n_estimators=500)
    boost.fit(X_train, y_train)

    # make class predictions for the testing set
    y_pred_class = boost.predict(X_test)

    print('########### Boosting ###############')
    accuracy_score = evalClassModel(boost, y_test, y_pred_class, True)

    # data for final graph
    methodDict['Boosting'] = accuracy_score * 100

boosting()

# 7. stacking
def stacking():
    # build and fit
    clf1 = KNeighborsClassifier(n_neighbors=1)
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()
    lr = LogisticRegression()
    stack = StackingClassifier(classifiers=[clf1, clf2, clf3],
                               meta_classifier=lr)
    stack.fit(X_train, y_train)

    # make class predictions for the testing set
    y_pred_class = stack.predict(X_test)

    print('########### Stacking ###############')
    accuracy_score = evalClassModel(stack, y_test, y_pred_class, True)

    # data for final graph
    methodDict['Stacking'] = accuracy_score * 100

stacking()

# 8. success-of-methods plot
def plotSuccess():
    s = pd.Series(methodDict)
    s = s.sort_values(ascending=False)
    plt.figure(figsize=(12, 8))
    # colors
    ax = s.plot(kind='bar')
    for p in ax.patches:
        ax.annotate(str(round(p.get_height(), 2)),
                    (p.get_x() * 1.005, p.get_height() * 1.005))
    plt.ylim([70.0, 90.0])
    plt.xlabel('Method')
    plt.ylabel('Percentage')
    plt.title('Success of methods')
    plt.show()

plotSuccess()
Bag = BaggingClassifier(base_estimator=model, n_estimators=100,
                        random_state=10).fit(X_train, Y_train)
BagPred = Bag.score(X_test, Y_test)
BagPrediction = Bag.predict(X_test)
print(metrics.confusion_matrix(Y_test, BagPrediction))
print('Bag Accuracy', BagPred)

'''
predictions = cross_validate(AdaBoost, X_train, Y_train, cv=10)
pred_per = np.mean(predictions['test_score'])
print(predictions)
print('The accuracy is: ', pred_per * 100, '%')
'''

AdaPred = AdaBoost.predict(X_test)
print(AdaPred)
print(metrics.confusion_matrix(Y_test, AdaPred))
prediction = AdaBoost.score(X_test, Y_test)
print('The boosting accuracy is: ', prediction * 100, '%')

'''
test_data = pd.read_csv('C:/Users/harsh/Desktop/KaggleComp/test.csv')
print(test_data.shape)
print(test_data.columns)
test_data = test_data.loc[:, features].values
# test_data = test_data[test_data.columns[1:86]]
print(test_data.shape)
test_predict = AdaBoost.predict(test_data)
test_predict = np.reshape(test_predict, (1715, 1))
'''
# gs_tfidf_lsvc = load("./outputs/Pipeline_tfidf_lsvc.pkl")
gs_cv_mlpc = load("./outputs/Pipeline_cv_mlpc.pkl")
gs_tfidf_mlpc = load("./outputs/Pipeline_tfidf_mlpc.pkl")
gs_cv_pac = load("./outputs/Pipeline_cv_pac.pkl")
gs_tfidf_pac = load("./outputs/Pipeline_tfidf_pac.pkl")

# Ensemble
# Reference: https://scikit-learn.org/stable/modules/ensemble.html

# AdaBoost
# note: base_estimator takes a single estimator, not a list; the original
# passed [gs_cv_knc, gs_tfidf_knn], which fails at fit time. Also note that
# AdaBoost requires its base estimator to accept sample_weight, which
# KNN-based pipelines generally do not.
clf = AdaBoostClassifier(base_estimator=gs_cv_knc,
                         n_estimators=100,
                         algorithm='SAMME',
                         random_state=15)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Stacking classifier
# sklearn's StackingClassifier takes (name, estimator) pairs, as hinted by
# the commented tuples below
clf = StackingClassifier(estimators=[('knc', gs_cv_knc),
                                     ('knn', gs_tfidf_knn)],
                         final_estimator=LogisticRegression(
                             class_weight='balanced',
                             multi_class='multinomial'))
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# ('knn', gs_tfidf_knn), ('knc', gs_cv_knc)
sclf = StackingCVClassifier(classifiers=[gs_cv_knc, gs_tfidf_knn],
                            meta_classifier=LogisticRegression(
                                class_weight='balanced',
                                multi_class='multinomial'),
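# Alternative sketch (not in the original): if the goal is simply to combine
# the two tuned pipelines, a voting ensemble fits better than AdaBoost, since
# it does not require sample_weight support from its members.
from sklearn.ensemble import VotingClassifier

vote = VotingClassifier(estimators=[('knc', gs_cv_knc), ('knn', gs_tfidf_knn)],
                        voting='hard')
vote.fit(X_train, y_train)
y_pred_vote = vote.predict(X_test)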
target = ["quality"]

# data = shuffle(data)
# data = data.reset_index()
data = data.sample(frac=1).reset_index(drop=True)

X = data[x].values
# ravel() flattens the (n, 1) label column so classifiers get a 1-D y
Y = data[target].values.ravel()

standard_sc = StandardScaler()
X = standard_sc.fit_transform(X)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1,
                                                    stratify=Y)

# gbc = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=20),
#                          n_estimators=200, learning_rate=1)
abc = AdaBoostClassifier(base_estimator=RandomForestClassifier(
    n_estimators=200, max_depth=20, min_samples_split=4),
    n_estimators=200,
    learning_rate=1)
abc.fit(X_train, Y_train)
predictions = abc.predict(X_test)

# sklearn metrics expect (y_true, y_pred); the original swapped the arguments
print("AccuracyScore: ", accuracy_score(Y_test, predictions))
print("recall_score: ", recall_score(Y_test, predictions, average='macro'))
print("precision_score: ", precision_score(Y_test, predictions, average='macro'))