def test_sample_weight():
    """Tests sample_weight parameter of VotingClassifier"""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = SVC(probability=True, random_state=123)
    # A weight of 1 for every sample must be equivalent to no weights at all.
    eclf1 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('svc', clf3)],
        voting='soft').fit(X, y, sample_weight=np.ones((len(y),)))
    eclf2 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('svc', clf3)],
        voting='soft').fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    # FIX: averaged probabilities are floats — compare up to rounding error
    # rather than bit-for-bit (exact equality is fragile across platforms;
    # matches how the other tests in this file compare predict_proba).
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))

    # A single-estimator soft-voting ensemble must behave exactly like the
    # underlying estimator fitted with the same (non-uniform) weights.
    sample_weight = np.random.RandomState(123).uniform(size=(len(y),))
    eclf3 = VotingClassifier(estimators=[('lr', clf1)], voting='soft')
    eclf3.fit(X, y, sample_weight)
    clf1.fit(X, y, sample_weight)
    assert_array_equal(eclf3.predict(X), clf1.predict(X))
    assert_array_almost_equal(eclf3.predict_proba(X), clf1.predict_proba(X))

    # KNeighborsClassifier.fit does not accept sample_weight: a clear error
    # naming the offending estimator must be raised.
    clf4 = KNeighborsClassifier()
    eclf3 = VotingClassifier(estimators=[
        ('lr', clf1), ('svc', clf3), ('knn', clf4)],
        voting='soft')
    msg = ('Underlying estimator \'knn\' does not support sample weights.')
    assert_raise_message(ValueError, msg, eclf3.fit, X, y, sample_weight)
def test_set_params():
    """set_params should be able to set estimators"""
    logreg = LogisticRegression(random_state=123, C=1.0)
    forest = RandomForestClassifier(random_state=123, max_depth=None)
    bayes = GaussianNB()

    ensemble_a = VotingClassifier([('lr', logreg), ('rf', forest)],
                                  voting='soft', weights=[1, 2])
    # Before fitting, named_estimators exposes the *unfitted* estimators.
    assert_true('lr' in ensemble_a.named_estimators)
    assert_true(ensemble_a.named_estimators.lr is ensemble_a.estimators[0][1])
    assert_true(ensemble_a.named_estimators.lr is
                ensemble_a.named_estimators['lr'])
    ensemble_a.fit(X, y)
    # After fitting, named_estimators_ exposes the *fitted* clones.
    assert_true('lr' in ensemble_a.named_estimators_)
    assert_true(ensemble_a.named_estimators_.lr is ensemble_a.estimators_[0])
    assert_true(ensemble_a.named_estimators_.lr is
                ensemble_a.named_estimators_['lr'])

    ensemble_b = VotingClassifier([('lr', logreg), ('nb', bayes)],
                                  voting='soft', weights=[1, 2])
    # Swapping 'nb' for the forest must make both ensembles equivalent.
    ensemble_b.set_params(nb=forest).fit(X, y)
    assert_false(hasattr(ensemble_b, 'nb'))

    assert_array_equal(ensemble_a.predict(X), ensemble_b.predict(X))
    assert_array_almost_equal(ensemble_a.predict_proba(X),
                              ensemble_b.predict_proba(X))
    assert_equal(ensemble_b.estimators[0][1].get_params(),
                 logreg.get_params())
    assert_equal(ensemble_b.estimators[1][1].get_params(),
                 forest.get_params())

    # Nested parameters are reachable via the '<name>__<param>' syntax.
    ensemble_a.set_params(lr__C=10.0)
    ensemble_b.set_params(nb__max_depth=5)
    assert_true(ensemble_a.estimators[0][1].get_params()['C'] == 10.0)
    assert_true(ensemble_b.estimators[1][1].get_params()['max_depth'] == 5)
    assert_equal(ensemble_a.get_params()["lr__C"],
                 ensemble_a.get_params()["lr"].get_params()['C'])
def acc_VotingClassifier():
    """10-fold cross-validated accuracy of a hard-voting ensemble.

    Returns the mean fold accuracy, the confusion-matrix-derived accuracy,
    and the ensemble's predictions on whichever validation set matches the
    current feature representation (MFCC / FFT / KPCA); the two unused
    validation outputs are returned as None.
    """
    kf = KFold(900, n_folds=10, shuffle=True)
    acc = 0.0
    temp = 1
    conf_mat = [[0 for i in range(10)] for j in range(10)]
    clf1 = GaussianNB()
    clf2 = RandomForestClassifier(n_estimators=20, max_features=None,
                                  class_weight="balanced_subsample")
    # (an unused SVC was previously constructed here; removed)
    clf4 = LogisticRegression()
    eclf = VotingClassifier(estimators=[('gnb', clf1), ('rf', clf2),
                                        ('lr', clf4)],
                            voting='hard', weights=[1, 3, 3])
    # BUG FIX: these were only assigned inside one branch of the
    # feature-size check below, so the final `return` raised NameError for
    # the two representations not in use.  Default them up front.
    valid_mfcc = valid_fft = valid_kpca = None
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        eclf = eclf.fit(X_train, y_train)
        y_predict = eclf.predict(X_test)
        acc_loop = getAccuracy(y_predict, y_test)
        conf_mat = buildConfusionMatrix(conf_mat, y_predict, y_test)
        print("*** Accuracy*** for "+str(temp)+"th time: "+str(acc_loop))
        acc += acc_loop
        temp += 1
    # Checking if the data set is transformed into MFCC(13) or FFT(1000)
    # or KPCA features(else)
    if (X.shape[1] == 13):
        print('In 13 features if')
        valid_mfcc = eclf.predict(validation_set_mfcc)
    elif (X.shape[1] == 1000):
        print('In 1000 features elif')
        valid_fft = eclf.predict(validation_set_fft)
    elif (X.shape[1] == 100):
        print('In KPCA features else')
        valid_kpca = eclf.predict(validation_set_kpca)
    acc = (acc/10.0)
    printConfusionMatrix(conf_mat)
    return acc, getAccuracyFromConfusion(conf_mat), valid_mfcc, valid_fft, valid_kpca
def classify():
    """Train a 6-model soft-voting ensemble and write test predictions to CSV.

    Loads train/test sets in svmlight format, fits the ensemble on the whole
    training set, prints a training-set classification report, then writes
    per-row test predictions to data/info_test2.csv (indexed by the unique,
    sorted row ids of data/log_test2.csv).
    """
    train_X,Y = load_svmlight_file('data/train_last')
    test_X,test_Y = load_svmlight_file('data/test_last')
    # svmlight loaders return sparse matrices; densify for the estimators.
    train_X = train_X.toarray()
    test_X = test_X.toarray()
    Y = [int(y) for y in Y]
    # print 'Y:',len(Y)
    # Row index for the output file comes from the test log, deduplicated.
    rows = pd.read_csv('data/log_test2.csv',index_col=0).sort_index().index.unique()
    train_n = train_X.shape[0]
    m = train_X.shape[1]
    test_n = test_X.shape[0]
    print train_n,m,#test_n
    # First, train all of the classifiers on the training set.
    print 'train classify...'
    clf1 = LinearDiscriminantAnalysis()
    clf2 = GaussianNB()
    clf3 = LogisticRegression()
    clf4 = RandomForestClassifier()
    clf5 = KNeighborsClassifier(n_neighbors=12)
    clf6 = AdaBoostClassifier()
    # x_train,x_test,y_train,y_test = train_test_split(train_X,Y,test_size=0.2)  # split the training set
    # print x_train.shape
    # print x_test.shape
    # clf.fit(train_X,Y)
    # Soft vote with LDA slightly up-weighted.
    clf = VotingClassifier(estimators=[('la',clf1),('nb',clf2),('lr',clf3),('rf',clf4),('nn',clf5),('ac',clf6)], voting='soft', weights=[1.5,1,1,1,1,1])
    # clf1.fit(x_train,y_train)
    # clf2.fit(x_train,y_train)
    # clf3.fit(x_train,y_train)
    # clf4.fit(x_train,y_train)
    clf.fit(train_X,Y)
    print 'end train classify'
    print 'start classify....'
    # print metrics.classification_report(Y,predict_Y)
    # clf2.fit(train_X,Y)
    # print 'clf2 fited...'
    # clf3.fit(train_X,Y)
    # print 'clf3 fited...'
    # clf4.fit(train_X,Y)
    # print 'clf4 fited...'
    # clf1.fit(train_X,Y)
    # print 'clf1 fited...'
    # First classification result: report on the training set itself.
    predict_Y = clf.predict(train_X)
    # predict_Y = clf.predict(train_X)
    print 'classify result:'
    print metrics.classification_report(Y,predict_Y)
    predict_Y = clf.predict(test_X)
    # print predict_Y,len(predict_Y)
    print 'end classify...'
    # predict_Y = clf.predict(X[cnt_train:])  # comment out while training; enable to emit test output and drop the report above
    # predict_Y = clf.predict(test_X)         # comment out while training; enable to emit test output and drop the report above
    DataFrame(predict_Y,index=rows).to_csv('data/info_test2.csv', header=False)
def test_predict_for_hard_voting():
    # Hard voting must cope with an estimator whose predictions are
    # non-integer (float) class labels.
    faulty = FaultySVC(random_state=123)
    bayes = GaussianNB()
    svc = SVC(probability=True, random_state=123)

    ensemble = VotingClassifier(
        estimators=[('fsvc', faulty), ('gnb', bayes), ('svc', svc)],
        weights=[1, 2, 3], voting='hard')
    ensemble.fit(X, y)
    # Smoke test: prediction must not raise.
    ensemble.predict(X)
def test_set_estimator_none(): """VotingClassifier set_params should be able to set estimators as None""" # Test predict clf1 = LogisticRegression(random_state=123) clf2 = RandomForestClassifier(random_state=123) clf3 = GaussianNB() eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('nb', clf3)], voting='hard', weights=[1, 0, 0.5]).fit(X, y) eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('nb', clf3)], voting='hard', weights=[1, 1, 0.5]) eclf2.set_params(rf=None).fit(X, y) assert_array_equal(eclf1.predict(X), eclf2.predict(X)) assert_true(dict(eclf2.estimators)["rf"] is None) assert_true(len(eclf2.estimators_) == 2) assert_true(all([not isinstance(est, RandomForestClassifier) for est in eclf2.estimators_])) assert_true(eclf2.get_params()["rf"] is None) eclf1.set_params(voting='soft').fit(X, y) eclf2.set_params(voting='soft').fit(X, y) assert_array_equal(eclf1.predict(X), eclf2.predict(X)) assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X)) msg = ('All estimators are None. At least one is required' ' to be a classifier!') assert_raise_message( ValueError, msg, eclf2.set_params(lr=None, rf=None, nb=None).fit, X, y) # Test soft voting transform X1 = np.array([[1], [2]]) y1 = np.array([1, 2]) eclf1 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)], voting='soft', weights=[0, 0.5], flatten_transform=False).fit(X1, y1) eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)], voting='soft', weights=[1, 0.5], flatten_transform=False) eclf2.set_params(rf=None).fit(X1, y1) assert_array_almost_equal(eclf1.transform(X1), np.array([[[0.7, 0.3], [0.3, 0.7]], [[1., 0.], [0., 1.]]])) assert_array_almost_equal(eclf2.transform(X1), np.array([[[1., 0.], [0., 1.]]])) eclf1.set_params(voting='hard') eclf2.set_params(voting='hard') assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]])) assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
def test_parallel_predict():
    """Check parallel backend of VotingClassifier on toy dataset."""
    logreg = LogisticRegression(random_state=123)
    forest = RandomForestClassifier(random_state=123)
    bayes = GaussianNB()
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])

    members = [("lr", logreg), ("rf", forest), ("gnb", bayes)]
    serial = VotingClassifier(estimators=members, voting="soft",
                              n_jobs=1).fit(X, y)
    parallel = VotingClassifier(estimators=members, voting="soft",
                                n_jobs=2).fit(X, y)

    # Fitting with more than one job must not change the result in any way.
    assert_array_equal(serial.predict(X), parallel.predict(X))
    assert_array_equal(serial.predict_proba(X), parallel.predict_proba(X))
def voting_class(X, training_target, Y):
    """Fit a soft-voting ensemble on the first six columns of X, then score Y.

    Parameters
    ----------
    X : training feature matrix (only columns 0..5 are used).
    training_target : training labels.
    Y : feature matrix to predict (only columns 0..5 are used).

    Returns
    -------
    (predictions, probabilities) for ``Y[:, 0:6]``.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import VotingClassifier
    clf1 = LogisticRegression(random_state=1)
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()
    eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                        ('gnb', clf3)],
                            voting='soft')
    eclf.fit(X[:, 0:6], training_target)
    proba = eclf.predict_proba(Y[:, 0:6])
    # BUG FIX: predict() was previously called with no samples at all, which
    # raises a TypeError; predict the same slice of Y that was scored above
    # and return both results instead of discarding them.
    preds = eclf.predict(Y[:, 0:6])
    return preds, proba
def voting_fit(X, y, RESULT_TEST_PATH,RESULT_PATH):
    """Fit a soft-voting ensemble of pre-tuned models and write a
    (PassengerId, Survived) submission CSV to RESULT_PATH.

    X, y            : training features/labels.
    RESULT_TEST_PATH: CSV of test features to predict.
    RESULT_PATH     : output CSV path.
    """
    # Each fit_* helper returns an already hyper-tuned estimator.
    ada_best = fit_adaboost(X, y)
    extratree_best = fit_extratree(X, y)
    rf_best = fit_rf(X, y)
    gbdt_best = fit_xgboost(X, y)
    svc_best = fit_svc(X, y)  # NOTE(review): tuned but not included in the ensemble below — confirm intent
    lr_best = fit_lr(X, y)
    votingC = VotingClassifier(estimators=[('rfc', rf_best),
                                           ('extc', extratree_best),('lr',lr_best),
                                           ('adac', ada_best), ('gbc', gbdt_best)],
                               voting='soft', n_jobs=4)
    votingC.fit(X, y)
    test_df = pd.read_csv(RESULT_TEST_PATH)
    test = np.array(test_df)
    #test_Survived = pd.Series(votingC.predict(test), name="Survived")
    result = votingC.predict(test)
    # Append the predictions as a new last column, then keep only the two
    # submission columns with PassengerId forced to integer.
    test_df.insert(test_df.columns.size, 'Survived', result)
    test_df = test_df[['PassengerId', 'Survived']]
    test_df['PassengerId'] = test_df['PassengerId'].apply(np.int64)
    test_df.to_csv(RESULT_PATH, index=False)
    print("finish!")
def predict(self,X_test):
    ''' predict the class for each sample '''
    # With use_append=True, stage-one predictions are stacked next to the raw
    # features; with use_append=False, they *replace* the features entirely.
    if self.use_append == True:
        self.__X_test = X_test
    elif self.use_append == False:
        temp = []
    # first stage
    for clf in self.stage_one_clfs:
        # clf is a (name, estimator) pair; predictions reshaped to a column.
        y_pred = clf[1].predict(X_test)
        y_pred = np.reshape(y_pred,(len(y_pred),1))
        if self.use_append == True:
            self.__X_test = np.hstack((self.__X_test,y_pred))
        elif self.use_append == False:
            temp.append(y_pred)
    if self.use_append == False:
        # (n_clfs, n_samples, 1) stacked list -> transpose -> slice [0]
        # yields the (n_samples, n_clfs) matrix of stage-one predictions.
        self.__X_test = np.array(temp).T[0]
    # second stage
    majority_voting = VotingClassifier(estimators=self.stage_two_clfs,
                                       voting="hard", weights=self.weights)
    # NOTE(review): this VotingClassifier is constructed but never fit before
    # predict() is called — stock sklearn raises NotFittedError here unless a
    # customized/patched ensemble that honors pre-fitted stage-two estimators
    # is in play; confirm against the training code.
    y_out = majority_voting.predict(self.__X_test)
    return y_out
def main(directory, tools_directory, non_tools_dir): global path path = sys.path[0] start = time.time() if directory is None or not os.path.isdir(directory): print "Please input directory containing pdf publications to classify" sys.exit(1) x_train, y_train = fetch_from_file() x_test, test_files = get_test_set(directory) # Just for testing, update machine learning part later x_train, x_test = normalize_scale(x_train, x_test) classifier = VotingClassifier( [("first", classifier_list[0]), ("second", classifier_list[1]), ("second", classifier_list[2])] ) classifier.fit(x_train, y_train) y_pred = classifier.predict(x_test) if os.path.isdir(tools_directory): shutil.rmtree(tools_directory) os.makedirs(tools_directory) if os.path.isdir(non_tools_dir): shutil.rmtree(non_tools_dir) os.makedirs(non_tools_dir) for num, pub in zip(y_pred, test_files): if num: shutil.copy2(directory + pub, tools_directory + pub) else: shutil.copy2(directory + pub, non_tools_dir + pub) print "Classification: Seconds taken: " + str(time.time() - start)
def test_sample_weight():
    """Tests sample_weight parameter of VotingClassifier"""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = SVC(probability=True, random_state=123)
    # Uniform weights of 1 must be equivalent to passing no weights at all.
    eclf1 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('svc', clf3)],
        voting='soft').fit(X, y, sample_weight=np.ones((len(y),)))
    eclf2 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('svc', clf3)],
        voting='soft').fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))

    # A single-estimator ensemble must match the bare estimator fitted with
    # the same (non-uniform) weights.
    sample_weight = np.random.RandomState(123).uniform(size=(len(y),))
    eclf3 = VotingClassifier(estimators=[('lr', clf1)], voting='soft')
    eclf3.fit(X, y, sample_weight)
    clf1.fit(X, y, sample_weight)
    assert_array_equal(eclf3.predict(X), clf1.predict(X))
    assert_array_almost_equal(eclf3.predict_proba(X), clf1.predict_proba(X))

    # check that an error is raised and indicative if sample_weight is not
    # supported.
    clf4 = KNeighborsClassifier()
    eclf3 = VotingClassifier(estimators=[
        ('lr', clf1), ('svc', clf3), ('knn', clf4)],
        voting='soft')
    msg = ('Underlying estimator KNeighborsClassifier does not support '
           'sample weights.')
    with pytest.raises(ValueError, match=msg):
        eclf3.fit(X, y, sample_weight)

    # check that _parallel_fit_estimator will raise the right error
    # it should raise the original error if this is not linked to sample_weight
    class ClassifierErrorFit(BaseEstimator, ClassifierMixin):
        # Minimal stub whose fit always fails for a reason unrelated to
        # sample_weight; the original TypeError must surface unchanged.
        def fit(self, X, y, sample_weight):
            raise TypeError('Error unrelated to sample_weight.')
    clf = ClassifierErrorFit()
    with pytest.raises(TypeError, match='Error unrelated to sample_weight'):
        clf.fit(X, y, sample_weight=sample_weight)
def main(path,filename):
    """Train a voting ensemble on several feature batches and pickle it.

    Loads the class labels plus every feature batch listed below for the
    given file, fits a VotingClassifier over a pool of forest / boosting /
    gradient / SVM estimators, and dumps the fitted model to clf_grande.p.

    BUG FIX: everything after the first unconditional `return` in the
    original (cross-validation experiments, confusion-matrix printing) was
    unreachable dead code and has been removed, together with the unused
    `batchsT` list and its commented-out variants.
    """
    # Feature-batch names whose contents are concatenated as model inputs.
    batchs = ['histogramaByN','histogramaColor','patronesCirculaesByN_2_5',
              'patrones2x2ByN','patrones3x3ByN','patronesCirculaesByN_2_9',
              'patronesCirculaesByN_3_9','patronesCirculaesByN_5_9',
              'patronesCirculaesByN_3_5','patronesCirculaesByN_6_12',
              'patronesCirculaesByN_8_12']
    X = []
    y = []
    load_batch(y,path,'clases',filename)
    y = [j for i in y for j in i]  # flatten the nested label lists
    for batch in batchs:
        load_batch(X,path,batch,filename)
    # Estimator pool: one forest, one boosting model, 15 gradient variants
    # and 4 SVM variants.
    est = [RandomForest(),Boosting()]
    for i in xrange(0,15):
        est.append(Gradient(i))
    for i in xrange(0,4):
        est.append(SVM(i))
    clf = VotingClassifier(estimators=est)
    clf.fit(X,y)
    # BUG FIX: pickle.dump(clf, open(...)) leaked the file handle; use a
    # context manager so the pickle is flushed and closed deterministically.
    with open("clf_grande.p", "wb") as model_file:
        pickle.dump(clf, model_file)
    return
def classifier(self, scoring, cv, eval_using):
    """Tune base and meta classifiers per scoring metric, then hard-vote.

    For every metric in `scoring`: grid-tunes a PassiveAggressive and an SGD
    classifier, wraps the tuned SGD model in tuned Bagging and AdaBoost
    meta-estimators, hard-votes the three, predicts the held-out split and
    prints the evaluation reports requested in `eval_using`.
    """
    adaclf = AdaBoostClassifier(algorithm='SAMME')
    # Standardise features.  NOTE(review): train and test splits are scaled
    # with *independently fitted* scalers here rather than transforming the
    # test split with train statistics — confirm this is intentional.
    xtr = StandardScaler().fit_transform(self.xtr)
    xte = StandardScaler().fit_transform(self.xte)
    # iterate over each grid score for param tuner
    for score in scoring:
        print('Tuning parameters of inital classifiers...')
        passive_params = param_tuner(PassiveAggressiveClassifier(),
                                     score=score, cv=cv, xtr=xtr,
                                     ytr=self.ytr)
        passclf = PassiveAggressiveClassifier().set_params(**passive_params)
        sgd_params = param_tuner(SGDClassifier(), score=score, cv=cv,
                                 xtr=xtr, ytr=self.ytr)
        sgdclf = SGDClassifier().set_params(**sgd_params)
        # cant use resampling/bagging with passive aggressive classifier
        # will raise ValueError: The number of class labels must be > 1
        # since resampling may results in training sets with 1 class.
        print('\n'+'Tuning meta-classifiers with tuned classifier/s...')
        bagsgd_params = param_tuner(BaggingClassifier(sgdclf),
                                    score=score, cv=cv, xtr=xtr,
                                    ytr=self.ytr)
        bg_sgdclf = BaggingClassifier(sgdclf).set_params(**bagsgd_params)
        adasgd_params = param_tuner(adaclf.set_params(base_estimator=sgdclf),
                                    score=score, cv=cv, xtr=xtr,
                                    ytr=self.ytr)
        ada_sgdclf = adaclf.set_params(**adasgd_params)
        print('Voting on meta-classifiers/classifiers then predicting...')
        vote = VotingClassifier(estimators=[('BagSGD', bg_sgdclf),
                                            ('adaboostSGD', ada_sgdclf),
                                            ('Passive', passclf)],
                                voting='hard').fit(xtr, self.ytr)
        # Time only the prediction step.
        start = time.time()
        y_true, y_pred = self.yte, vote.predict(xte)
        print('\n' + '-'*5, 'FINAL PREDICTION RESULTS','-'*5 +'\n',
              '{0:.4f}'.format(time.time()-start)+'--prediction time(secs)')
        clf_evaluation = report(*eval_using, y_true=y_true, y_pred=y_pred)
        for reports in clf_evaluation:
            print('---',reports)
            print(clf_evaluation[reports])
def do_ml(ticker):
    """Train a voting ensemble for one ticker and report its test accuracy."""
    X, y, df = extract_featuresets(ticker)
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=0.25)

    #clf = neighbors.KNeighborsClassifier()
    voters = [('lsvc', svm.LinearSVC()),
              ('knn', neighbors.KNeighborsClassifier()),
              ('rfor', RandomForestClassifier())]
    clf = VotingClassifier(voters)
    clf.fit(X_train, y_train)

    confidence = clf.score(X_test, y_test)
    print('Accuracy', confidence)
    predictions = clf.predict(X_test)
    print('Predicted spread:', Counter(predictions))

    return confidence
def combine_voting_NB_classifier(X_train, X_test, y_train, y_test,X_train_meta, X_test_meta, y_train_meta, y_test_meta): from sklearn.naive_bayes import BernoulliNB from sklearn.neighbors import NearestCentroid from sklearn.ensemble import VotingClassifier clf_1 = BernoulliNB(alpha = 0.10000000000000001).fit(X_train_meta, y_train_meta) from sklearn.svm import SVC clf_2 = SVC(C=100, gamma=0.1).fit(X_train_meta, y_train_meta) clf_3 = NearestCentroid().fit(X_train_meta, y_train_meta) eclf = VotingClassifier(estimators=[('nb1', clf_1),('nb2', clf_3)], voting='hard') eclf = eclf.fit(X_train_meta, y_train_meta) y_voting_predicted = eclf.predict(X_test_meta) np.savetxt('oto_wyniki.csv',y_voting_predicted, delimiter=',') print "\n Here is the classification report for Voting classifier:" print metrics.classification_report(y_test_meta, y_voting_predicted)
def mutipleClf(label_clfset,data,features,votingType='soft',weight=None,testData=None,testFeatures=None):
    """Fit a VotingClassifier over `label_clfset` and optionally score it.

    When voting is 'soft' and no weights are supplied, each estimator's
    weight is its own 10-fold CV mean accuracy on (data, features).
    Returns (fitted ensemble, accuracy on testData or 0.0).
    """
    # BUG FIX: the default was the mutable `weight=[]`, which this function
    # mutates via append — the CV-derived weights leaked across calls.  Use
    # None as the sentinel; an explicitly passed empty list keeps the old
    # "compute the weights" behaviour.
    flag = False
    if weight is None or weight == []:
        weight = []
        flag = True
    print("======================================\n")
    print("Start at: "+time.strftime("%H:%M:%S")+"\n")
    if votingType=='soft':
        for label_clf in label_clfset:
            # use ten fold score, set the cv to 10
            scores = cross_validation.cross_val_score(label_clf[1], data, features, cv=10)
            if flag:
                weight.append(scores.mean())
        eclf = VotingClassifier(estimators=label_clfset, voting=votingType, weights=weight)
    else:
        eclf = VotingClassifier(estimators=label_clfset, voting=votingType)
    result = eclf.fit(data,features)
    accuracy = 0.0
    # BUG FIX: `testData != None` triggers element-wise comparison on numpy
    # arrays; identity comparison is the correct presence check.
    if testData is not None:
        testResult = eclf.predict(testData)
        accuracy = getAccuracy(testResult,testFeatures)
    print("End at: "+time.strftime("%H:%M:%S")+"\n")
    print("======================================\n")
    return result,accuracy
# Fit the SVM member (probability=True so it can also be soft-voted below).
# NOTE(review): the name `svm` shadows whatever previously bound that name
# (e.g. an imported sklearn.svm module) — confirm nothing later needs it.
svm = SVC(C=10, gamma=0.01, probability=True, random_state=3)
svm.fit(x_train, y_train)
svm_pred = svm.predict(x_test)

# Individual model accuracies for comparison with the ensembles.
print(" [accuarcy]")
print("tree : ", accuracy_score(y_test, dtree_pred))
print("random forest : ", accuracy_score(y_test, rf_pred))
print("knn : ", accuracy_score(y_test, knn_pred))
print("svm : ", accuracy_score(y_test, svm_pred))

# hard voting (majority vote; the SVM counts double)
voting_clf = VotingClassifier(estimators=[('rforest', rf),
                                          ('knn', knn),
                                          ('svm', svm)],
                              weights=[1, 1, 2], voting='hard').fit(x_train, y_train)
hard_voting_predicted = voting_clf.predict(x_test)
print(" ")
print(" [ensemble] ")
print(" hard voting accuracy : ", accuracy_score(y_test, hard_voting_predicted))

# soft voting (weighted average of predicted probabilities)
voting_clf = VotingClassifier(estimators=[('rforest', rf),
                                          ('knn', knn),
                                          ('svm', svm)],
                              weights=[1, 1, 2], voting='soft').fit(x_train, y_train)
soft_voting_predicted = voting_clf.predict(x_test)
print(" soft voting accuracy : ", accuracy_score(y_test, soft_voting_predicted))

# visualize the accuracy comparison
        ("clf", best_classifier),
    ]
)
# Soft-vote the GloVe pipeline together with the linear-SVM pipeline.
vot_clf = VotingClassifier(estimators=[("glove", glove_clf), ("linear", svm_clf)], voting="soft")
vot_clf.fit(train_data.Abstract, train_data.Stance)
#########################
#     Predict data      #
#########################
print ("Predicting labels")
print ("Time used: {}".format((time.time() - start_time) / 60.0))
predictions = vot_clf.predict(unlabelled_data.Abstract)
print predictions
#########################
#  Print distribution   #
#########################
# Tally the predicted stance distribution.
# NOTE: the body of the final `else:` branch continues beyond this excerpt.
against_c = 0
favor_c = 0
none_c = 0
for pred in predictions:
    if pred == "AGAINST":
        against_c += 1
    elif pred == "FAVOR":
        favor_c += 1
    else:
# Ensemble members: a large XGBoost model plus an AdaBoost over a big
# RandomForest; combined below with the externally defined rf/et models.
xg = XGBClassifier(n_estimators=60000, learning_rate=0.1, colsample_bytree=0.51007979, max_depth=7, min_child_weight=2)
adarf_sub = RandomForestClassifier(n_estimators=30000, max_features=65, criterion='entropy', min_samples_leaf=2, n_jobs=-1)
ada_sub = AdaBoostClassifier(base_estimator=adarf_sub, n_estimators=13, learning_rate=0.8)
# Hard vote; the AdaBoost member gets double weight.
vote = VotingClassifier([('rf', rf), ('et', et), ('xg', xg), ('ada', ada_sub)], voting='hard', weights=[1,1,1,2])
start_time = time.time()
vote.fit(dataX, dataY)
print("--- %.2f mins ---" % ((time.time() - start_time)/60))
os.system('say "Master, your program has finished"')
# Predict data and write to file.
vote_predict = vote.predict(test_data)
# Submission format: 1-based row id, integer prediction.
f = open("Ensemble3.csv", "w")
f.write("Id,Prediction\n")
for x in range(len(vote_predict)):
    f.write(str(x+1) + "," + str(int(vote_predict[x])) + "\n")
f.close()
os.system('say "Master, your file has been created."')
datetime.datetime.now()
#0.76014, CV: 0.712590355255
# Baseline: cross-validate a standalone random forest for comparison.
start_time = time.time()
rf = RandomForestClassifier(n_estimators=1000, max_features=65, criterion='entropy', min_samples_leaf=2, n_jobs=-1)
rf_scores = cross_val_score(rf, dataX_scaled, dataY, cv=10, n_jobs=-1)
training_y = training_data['CategoryNumber'] # create a testing and validation set from the training_data train_x, test_x, train_y, test_y = cross_validation.train_test_split(training_x, training_y, test_size=0.1) estimators = 40 clf1 = BaggingClassifier(n_estimators=estimators) #30.5 clf2 = ExtraTreesClassifier(n_estimators=estimators) #30.1 clf3 = RandomForestClassifier(n_estimators=estimators) #31.1 #clf = clf3 clf = VotingClassifier(estimators=[('b', clf1), ('e', clf2), ('r', clf3)], voting='soft' ) clf.fit(train_x, train_y) predicted = clf.predict(test_x) np.savetxt("temp_predictions.txt", predicted ) print("Correct", sum(predicted == test_y)) print("Total", len(test_y)) print("Accuracy", sum(predicted == test_y) / len(test_y) * 100.) print("Uniques", len(np.unique(predicted))) # sanity check
# Tf–idf term weighting tfidf_trans = TfidfTransformer() tfidf_train = tfidf_trans.fit_transform(dtm_train) tfidf_test = tfidf_trans.fit_transform(dtm_test) # Training classifiers clf1 = RandomForestClassifier() clf2 = AdaBoostClassifier() clf3 = xgb.XGBClassifier() clf4 = KNeighborsClassifier() clf5 = DecisionTreeClassifier() eclf = VotingClassifier(estimators=[('rf', clf1), ('ab', clf2), ('gb', clf3), ('ls', clf4), ('dt', clf5)], voting='soft', weights=[1, 0.5, 1.5, 1, 1]) eclf.fit(dtm_train, cuisine_label) predict_result = eclf.predict(dtm_test) testdf['cuisine'] = le.inverse_transform(predict_result) predict_dict = dict(zip(testdf['id'], testdf['cuisine']) ) with open('predict_result_ensemble.csv', 'w') as csvfile: fieldnames = ['id', 'cuisine'] writer = csv.DictWriter(csvfile, fieldnames=fieldnames) writer.writeheader() for key, value in predict_dict.iteritems(): writer.writerow({'id': key, 'cuisine': value}) print 'finished' #for clf, label in zip([clf1, clf2, clf3, eclf], ['Random Forest', 'Adaboost', 'Xgboost']): # scores = cross_validation.cross_val_score(clf, tfidf_train, cuisine_label, cv=2, scoring='accuracy')
print('Model Trainng completed !!')
print('\n')
printProgressBar(100, l, prefix='Progress:', suffix='Complete', length=50)
time.sleep(1)
print('\n')
print('Running Voting classifier ... Please Wait ...')
print('\n')
printProgressBar(0, l, prefix='Progress:', suffix='Complete', length=50)
#voting classifier to determine the best classifier
# Soft-vote the five previously trained base models on the count features.
eclf1 = VotingClassifier(estimators=[('lg', clf1), ('rf', clf2), ('gnb', clf3), ('knc', clf4), ('dtc', clf5)], voting='soft')
eclf1 = eclf1.fit(counts_train, y_train)
pred = (eclf1.predict(counts_test))
printProgressBar(85, l, prefix='Progress:', suffix='Complete', length=50)
print('\n')
print('Classification completed !!')
print('\n')
printProgressBar(100, l, prefix='Progress:', suffix='Complete', length=50)
time.sleep(1)
print('\n')
print('Calculating Prediction Accuracy and writing output file ...')
print('\n')
printProgressBar(0, l, prefix='Progress:', suffix='Complete', length=50)
#Saving all the predicted data in csv file name predictions.csv
# Decode the label ids back to their original string labels.
L1 = le.inverse_transform(pred)
pred_df = pd.DataFrame(L1)
# make the prediction y_pred_bag = bagging_clf.predict(X_test) # calculate the accuracy accuracy = accuracy_score(y_test, y_pred_bag) print(accuracy) # -------------- # import packages from sklearn.ensemble import VotingClassifier from sklearn.naive_bayes import GaussianNB # code starts here nv = GaussianNB() # fit the classifier on X_train,y_train nv.fit(X_train, y_train) voting_clf_soft = VotingClassifier([('lr', lr), ('rf', rf), ('nv', nv)], voting='soft') voting_clf_soft.fit(X_train, y_train) # make the prediction y_pred_soft = voting_clf_soft.predict(X_test) # calculate the accuracy accuracy_soft = accuracy_score(y_test, y_pred_soft) print(accuracy_soft) # code ends here
import numpy as np
# One KNN per neighbour count 1..10; each is soft-voted with a shared NB.
clf = [KNeighborsClassifier(n_neighbors=i) for i in range(1, 11)]
Multi = GaussianNB()
errort = []  # test-set 0/1 loss for each ensemble
error = []   # train-set 0/1 loss for each ensemble
#print(clf[0:2])
for j in range(0, 10):
    print(j)
    # NB gets weight 1, the j-th KNN weight 8 in the soft vote.
    enf = VotingClassifier([('nb', Multi), ('knn', clf[j])], voting='soft', weights=[1, 8])
    #enf= VotingClassifier([('%d'% j, c) for c in clf][0:j],voting='hard')
    enf.fit(data.trainvector, data.trainlabels)
    errort.append(zero_one_loss(data.testlabels, enf.predict(data.testvector)))
    error.append(zero_one_loss(data.trainlabels, enf.predict(data.trainvector)))
print(error, errort)
# Persist one loss value per line for later plotting.
with open(
        "/home/amrita95/Desktop/Machine learning with networks/assignments/ensemble2test.txt",
        'w') as file:
    for e in errort:
        file.write("%f\n" % e)
with open(
        "/home/amrita95/Desktop/Machine learning with networks/assignments/ensemble2train.txt",
        'w') as file:
    for e in error:
        file.write("%f\n" % e)
from sklearn.metrics import accuracy_score accuracy_count = accuracy_score(y_test_count, predictions_count) print('Count Vectorized Words Accuracy:', accuracy_count) #Ensembling acc= 0.9569 clf1 = LogisticRegression(random_state=10) clf2 = RandomForestClassifier(n_estimators=1000, max_features=17, criterion='entropy', random_state=0) clf3 = GradientBoostingClassifier(n_estimators=1000, random_state=10) eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gb', clf3)], voting='hard') #Ensembling predictions eclf.fit(X_train_count, y_train_count) predictions_count = eclf.predict(X_test_count) # cross validation with kfold = 10 from sklearn.cross_validation import cross_val_score accuracies = cross_val_score(estimator=eclf, X=X_train_count, y=y_train_count, cv=10) print('Ensemble Mean Accuracy', accuracies.max()) ### Function to create confusion matrix ### import itertools def plot_confusion_matrix(cm, classes,
# Soft-voting classifier: evaluate via ROC on positive-class probabilities.
voting.fit(X_train, y_train)
y_pred_proba = voting.predict_proba(X_test)
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba[:, 1], pos_label=1)
auc_score = str(round(auc(fpr, tpr), 5))
print('AUC score: {}'.format(auc_score))
plot_roc_curve(fpr, tpr)
'''
Hard voting classifier
'''
voting_model = VotingClassifier(voting='hard', estimators=[
    ('xgb', xgb_model),
    ('logit', lr_model),
    ('svm', svc_model),
])
voting_model.fit(X_train, y_train)
# BUG FIX: this section fitted `voting_model` but then evaluated the earlier
# soft-voting `voting` object, and fed a stale/undefined `y_pred` into
# roc_curve.  Predict with the hard-voting model and score those labels
# (hard voting has no predict_proba, so the ROC uses the 0/1 predictions).
y_pred = voting_model.predict(X_test)
fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=1)
auc_score = str(round(auc(fpr, tpr), 5))
print('AUC score: {}'.format(auc_score))
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
# Toy two-moons dataset with substantial label noise.
dataset=make_moons(n_samples=5000,noise=0.5)
X=dataset[0]
y=dataset[1]
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)
log_clf=LogisticRegression()
rnd_clf=RandomForestClassifier()
svm_clf=SVC(probability=True)
# Majority (hard) vote over the three heterogeneous classifiers.
voting_clf=VotingClassifier(estimators=[('lr',log_clf),('rnd',rnd_clf),('svm',svm_clf)],voting='hard')
voting_clf.fit(X_train,y_train)
#%%
from sklearn.metrics import accuracy_score
y_pred=voting_clf.predict(X_test)
accuracy_score(y_test,y_pred)
#%%
# Compare each individual member's accuracy against the ensemble above.
for clf in [log_clf,rnd_clf,svm_clf]:
    clf.fit(X_train,y_train)
    y_pred=clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test,y_pred))
#%%
# The code that follows trains an ensemble of 500 decision-tree classifiers,
# each trained on 100 training instances sampled *with* replacement from the
# dataset (this is the Bagging case; to try Pasting instead, set
# bootstrap=False).  The n_jobs parameter tells sklearn how many CPU cores
# to use for training and prediction (-1 means use all idle cores):
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
def optimise_train_model(X, XX, YY, error_selector, test_size=0.3,
                         print_conf_mx=True, plot_final_conf_mx=True,
                         plot_all_conf_mx=True, savefigs=True,
                         pickle_model=False):
    """Train several classifiers, pick the best by a training-set metric, and
    evaluate the winner on a held-out test set.

    Candidates: Naive Bayes, K-nearest neighbours, a grid-searched SVM, a
    random forest, and a hard-voting ensemble of all four.  The candidate
    whose training-set metric named by ``error_selector`` is highest is
    selected and then scored on the test split.  All metrics are printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Labelled dataset; only ``X.columns`` is read here, to name the
        feature importances when a random forest wins.
    XX, YY : array-like
        Features and labels, split into train/test sets inside.
    error_selector : str
        One of 'accuracy', 'recall', 'F1', 'precision',
        'average_all_metric'.
    test_size : float
        Fraction of data held out for final evaluation.
    print_conf_mx, plot_final_conf_mx, plot_all_conf_mx, savefigs,
    pickle_model : bool
        Output/side-effect switches; figures go to ``savefig_path`` and the
        model is pickled as 'UAV_classifier.pkl'.

    Returns
    -------
    tuple
        (fitted classifier, test-set confusion matrix, normalised confusion
        matrix with zeroed diagonal).

    Raises
    ------
    ValueError
        If ``error_selector`` is not a recognised metric name.  (The
        original silently fell through and later crashed with an unbound
        ``clf``.)
    """

    def _train_metrics(model):
        # Training-set metrics for an already-fitted classifier, plus its
        # confusion matrix.  'average_all_metric' reproduces the original
        # (accuracy + recall + F1) / 3 aggregate.
        y_hat = model.predict(X_train)
        m = {
            'accuracy': model.score(X_train, Y_train),
            'recall': recall_score(Y_train, y_hat, average='weighted'),
            'F1': f1_score(Y_train, y_hat, average='weighted'),
            'precision': precision_score(Y_train, y_hat, average='weighted'),
        }
        m['average_all_metric'] = (m['accuracy'] + m['recall'] + m['F1']) / 3
        return m, confusion_matrix(Y_train, y_hat)

    def _draw_conf_mx(mx, title, classes, cmap=None):
        # Render one confusion matrix on the current axes.
        plt.imshow(mx, cmap=cmap)
        plt.title(title)
        ticks = np.arange(len(classes))
        plt.xticks(ticks, classes, rotation=45)
        plt.yticks(ticks, classes, rotation=45)

    X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
        XX, YY, test_size=test_size)

    # 1. Naive Bayes.
    clf_NB = GaussianNB().fit(X_train, Y_train)

    # 2. K-nearest neighbours.
    clf_KNN = neighbors.KNeighborsClassifier().fit(X_train, Y_train)

    # 3. SVM with kernel/gamma/C chosen by 3-fold grid-search CV.
    gammas = [1e-1, 1e-2, 1e-3, 1e-4]
    Cs = [0.1, 1, 10, 100, 1000, 10000]
    tuned_parameters = [{'kernel': [k], 'gamma': gammas, 'C': Cs}
                        for k in ('linear', 'rbf', 'poly', 'sigmoid')]
    search = GridSearchCV(svm.SVC(C=1), tuned_parameters, cv=3)
    search.fit(X_train, Y_train)
    print()
    print("Best parameters set found on development set:")
    print(search.best_params_)
    print()  # line break
    best = search.best_estimator_.get_params()
    kernel, C, gamma = best['kernel'], best['C'], best['gamma']
    # Refit with probability=True so the chosen SVM exposes predict_proba.
    clf_svm = svm.SVC(kernel=kernel, C=C, gamma=gamma,
                      probability=True).fit(X_train, Y_train)

    # 4. Random forest.
    clf_RF = RandomForestClassifier(n_estimators=1000, max_leaf_nodes=16,
                                    n_jobs=-1).fit(X_train, Y_train)

    # 5. Hard-voting ensemble of all four classifiers.
    ensemble_clf = VotingClassifier(
        estimators=[('NB', clf_NB), ('KNN', clf_KNN),
                    ('svm', clf_svm), ('RF', clf_RF)],
        voting='hard').fit(X_train, Y_train)

    candidates = {
        'Naive Bayes': clf_NB,
        'KNN': clf_KNN,
        'SVM': clf_svm,
        'Random Forest': clf_RF,
        'Ensemble': ensemble_clf,
    }
    train_scores = {}
    conf_mxs = {}
    for name, model in candidates.items():
        train_scores[name], conf_mxs[name] = _train_metrics(model)

    # Summary of training-set performance (labels normalised vs. the
    # original's hand-written, inconsistently named print statements).
    print()
    print('*** MODEL TEST SUMMARY ***')
    for name in ('KNN', 'Naive Bayes', 'SVM', 'Random Forest', 'Ensemble'):
        m = train_scores[name]
        print(name, 'accuracy = ', m['accuracy'], 'F1 Score = ', m['F1'],
              'Recall = ', m['recall'], 'Precision = ', m['precision'])

    # PLOT CONFUSION MATRICES (training set, all candidates).
    if plot_all_conf_mx:
        fig = plt.figure(figsize=(15, 15))
        classes = clf_svm.classes_  # same label set for every candidate
        panels = [
            ('Naive Bayes', 'NB Model Confusion Matrix', None),
            ('KNN', 'KNN Model Confusion Matrix', plt.cm.gray),
            ('SVM', 'SVM Confusion Matrix', plt.cm.gray),
            ('Random Forest', 'Random Forest Confusion Matrix', plt.cm.gray),
            ('Ensemble', 'voting Ensemble Confusion Matrix', plt.cm.gray),
        ]
        for pos, (name, title, cmap) in enumerate(panels, start=321):
            fig.add_subplot(pos)
            _draw_conf_mx(conf_mxs[name], title, classes, cmap)
        plt.tight_layout()
        if savefigs:
            plt.savefig(str(savefig_path + 'confusion_matrices.jpg'))
        plt.show()

    print()  # line break

    valid_selectors = ('accuracy', 'recall', 'F1', 'precision',
                       'average_all_metric')
    if error_selector not in valid_selectors:
        raise ValueError(
            'error_selector must be one of {}'.format(valid_selectors))

    # Pick the candidate with the highest training-set value of the chosen
    # metric.  BUGFIX: the original's elif-chains referenced the misspelled
    # `neighbours.KNeighboursClassifier()` (NameError), an undefined
    # `clf_ensemble`, and compared the wrong variables in places; an argmax
    # over the metric table keeps the intent while fixing those.  Ties are
    # now resolved in favour of the first candidate instead of crashing.
    chosen = max(candidates, key=lambda name: train_scores[name][error_selector])
    clf = candidates[chosen]
    print(chosen, 'model chosen')
    if chosen == 'SVM':
        print('SVM Params: C = ', C, ' gamma = ', gamma, ' kernel = ', kernel)

    # Evaluate the selected model on the held-out test set.
    Y_test_predicted = clf.predict(X_test)
    final_conf_mx = confusion_matrix(Y_test, Y_test_predicted)

    # Normalised confusion matrix with the diagonal zeroed, to highlight the
    # error structure only.
    row_sums = final_conf_mx.sum(axis=1, keepdims=True)
    norm_conf_mx = final_conf_mx / row_sums
    np.fill_diagonal(norm_conf_mx, 0)

    if plot_final_conf_mx:
        fig = plt.figure(figsize=(10, 10))
        fig.add_subplot(211)
        _draw_conf_mx(final_conf_mx, 'Final Confusion Matrix', clf.classes_)
        fig.add_subplot(212)
        _draw_conf_mx(norm_conf_mx, 'Normalised Confusion Matrix',
                      clf.classes_, plt.cm.gray)
        plt.tight_layout()
        if savefigs:
            plt.savefig(
                str(savefig_path + "final_model_confusion_matrices.jpg"))
        plt.show()

    # Performance measures for the final model on the test set.
    final_recall = recall_score(Y_test, Y_test_predicted, average="weighted")
    final_f1 = f1_score(Y_test, Y_test_predicted, average="weighted")
    final_accuracy = clf.score(X_test, Y_test)
    final_precision = precision_score(Y_test, Y_test_predicted,
                                      average='weighted')
    final_average_metric = (final_recall + final_accuracy + final_f1) / 3

    if print_conf_mx:
        print('Final Confusion Matrix')
        print(final_conf_mx)
        print()
        print('Normalised Confusion Matrix')
        print(norm_conf_mx)

    # Feature importances exist only for tree ensembles; BUGFIX: the original
    # read clf.feature_importances_ unconditionally and crashed whenever a
    # non-forest model was chosen.
    if hasattr(clf, 'feature_importances_'):
        print()
        print('Feature Importances')
        print('(relative importance of each feature (wavelength) for prediction)')
        print()
        for name, score in zip(X.columns, clf.feature_importances_):
            print(name, score)

    print()  # line break
    print('*** FINAL MODEL SUMMARY ***')
    print('Final Model Accuracy = ', final_accuracy)
    print('Final Model Recall = ', final_recall)
    print('Final Model F1 = ', final_f1)
    print('Final Model Precision = ', final_precision)
    print('Final Model Average metric = ', final_average_metric)

    if pickle_model:
        # Pickle the classifier for archiving or reuse; reload elsewhere with
        # clf = joblib.load('UAV_classifier.pkl').
        joblibfile = 'UAV_classifier.pkl'
        joblib.dump(clf, joblibfile)

    return clf, final_conf_mx, norm_conf_mx
# PERF: the original transformed and predicted one row at a time (2500 calls
# to sc.transform and to each model's predict).  Building the 2500x10 feature
# matrix once and predicting in a single batch yields the same per-row
# predictions with two model calls instead of 5000.
features = sc.transform(
    np.array([[X_final[i][k] for k in range(10)] for i in range(2500)]))

# predict using only adaboost
ada_preds = ada_best.predict(features)
sev = [(ids[i], ada_preds[i]) for i in range(2500)]
answers3 = [pair[1] for pair in sev]

# predict using ensemble
ensemble_preds = votingC.predict(features)
sev = [(ids[i], ensemble_preds[i]) for i in range(2500)]
answers4 = [pair[1] for pair in sev]
def the_voting(N, X_ALL, X_ALL_val, y_ALL, y_ALL_val, pipe_list, DS, CLF):
    """Build and evaluate a hard-voting ensemble of the first N pipelines.

    The original duplicated the entire body for N == 3 and N == 2; the two
    branches differed only in how many (name, pipeline) pairs they used, so
    they are merged here (any N other than 3 is treated as 2, as before).

    Parameters
    ----------
    N : int
        Number of member pipelines (3, or anything else for 2).
    X_ALL, y_ALL : array-like
        Training features/labels (labels are ravel()'d).
    X_ALL_val, y_ALL_val : array-like
        Validation features/labels.
    pipe_list : list
        Fitted/fittable pipelines; the first N are ensembled.
    DS, CLF : list of str
        Dataset and classifier names used to label each member.

    Returns
    -------
    tuple
        (eclf, validation score, per-model CV accuracies, labels,
        validation accuracy, confusion matrix, classification report).
    """
    y_ALL = y_ALL.ravel()
    y_ALL_val = y_ALL_val.ravel()

    n_members = 3 if N == 3 else 2
    # Estimator names are 'CLF+DS'; the cross-validation labels below are
    # 'DS+CLF' -- that asymmetry is preserved from the original.
    estimators = [(CLF[i] + '+' + DS[i], pipe_list[i])
                  for i in range(n_members)]
    eclf = VotingClassifier(estimators=estimators, voting='hard', n_jobs=-1)

    print(' Sanity check [Ensemble]')
    print(' -> ALL Together')
    eclf.fit(X_ALL, y_ALL)
    print(eclf.score(X_ALL_val, y_ALL_val))

    y_true, y_pred = y_ALL_val, eclf.predict(X_ALL_val)
    CM = confusion_matrix(y_true, y_pred)
    report = classification_report(y_true, y_pred)
    print(report)
    print(CM)
    print(' --> Validation Accuracy:')
    e_val_score = accuracy_score(y_true, y_pred)
    print(' ' + "{:.2%}".format(accuracy_score(y_true, y_pred)))

    # 5-fold CV accuracy of every member and of the ensemble itself
    # (this loop follows the sklearn VotingClassifier example).
    acc = []
    labels = []
    cv_models = [pipe_list[i] for i in range(n_members)] + [eclf]
    cv_labels = [DS[i] + '+' + CLF[i] for i in range(n_members)] + ['Ensemble']
    for clf__, label in zip(cv_models, cv_labels):
        scores = cross_val_score(clf__, X_ALL, y_ALL,
                                 scoring='accuracy', cv=5)
        print("Accuracy: %0.2f (+/- %0.2f) [%s]" %
              (scores.mean(), scores.std(), label))
        acc.append(scores.mean())
        labels.append(label)

    return eclf, eclf.score(
        X_ALL_val, y_ALL_val), acc, labels, e_val_score, CM, report
Logistic regression hyperparameters: C, penalty, multi_class. SVC hyperparameters: C, kernel, gamma.

6. Voting Classifier uses an ensemble technique to combine multiple predictors (same data, multiple estimators).
from sklearn.ensemble import VotingClassifier
lr = LogisticRegression(random_state=SEED)
knn = KNN(n_neighbors=27)
dt = DecisionTreeClassifier(min_samples_leaf=0.13, random_state=SEED)
# Define the list of classifiers
classifiers = [('Logistic Regression', lr), ('K Nearest Neighbours', knn), ('Classification Tree', dt)]
vc = VotingClassifier(estimators=classifiers)  # pass the list of tuples that holds the individual estimators
vc.fit(X_train, y_train)
y_pred = vc.predict(X_test)

7. BaggingClassifier (same estimator, multiple datasets created using bootstrap aggregation).
dt = DecisionTreeClassifier(random_state=1)
bc = BaggingClassifier(base_estimator=dt, n_estimators=50, random_state=1)
bc.fit(X_train, y_train)
y_pred = bc.predict(X_test)
acc_test = accuracy_score(y_test, y_pred)
print('Test set accuracy of bc: {:.2f}'.format(acc_test))

8. You can estimate the performance of the ensemble model using the out-of-bag (OOB) instances: with bootstrapping, on average about 63% of the training samples are drawn for any given bootstrap sample, and the remaining ~37% constitute the OOB instances. The model is trained on the bootstrapped samples, evaluated on its OOB instances, and the OOB scores are then averaged.
# NOTE(review): this chunk begins mid-statement -- the `voting='hard')` below
# closes a VotingClassifier(...) call whose opening lies outside this excerpt.
voting='hard').set_params(n_jobs=10).fit(train_data[train_data.columns[2:]], train_data['cancer_type_id'])

# Collect the individually trained classifiers.
# NOTE(review): clf3 appears twice and clf4/clf6 are absent, while the loop
# below iterates i in 1..6 -- this list looks like a typo for [clf1..clf6]?
# Confirm against the clf definitions above this excerpt.
clfs = [clf1, clf2, clf3, clf3, clf5]

# For each base classifier i: store validation-set class probabilities,
# test-set label predictions and accuracy, and test-set probabilities, all in
# dynamically named variables.
# NOTE(review): writing through locals() is only reliable at module scope
# (inside a function its effect is undefined); a dict keyed by i would be far
# more robust.
for i in [1, 2, 3, 4, 5, 6]:
    locals()["predict{}_valid_proba".format(i)] = locals()["clf{}".format(i)].predict_proba(valid_data[train_data.columns[2:]])
    locals()["classifier{}_valid".format(i)] = pd.DataFrame(locals()["predict{}_valid_proba".format(i)], index = valid_data.index)
    locals()["classifier{}_valid".format(i)].insert(0 , "true_type", valid_data['cancer_type_id'])
    locals()["predict{}_test".format(i)] = locals()["clf{}".format(i)].predict(test_data[train_data.columns[2:]])
    locals()["accuracy{}_test".format(i)] = (locals()["predict{}_test".format(i)] == test_data['cancer_type_id']).mean()
    locals()["predict_proba{}".format(i)] = locals()["clf{}".format(i)].predict_proba(test_data[train_data.columns[2:]])
    locals()["classifier{}_test".format(i)] = pd.DataFrame(locals()["predict_proba{}".format(i)], index = test_data.index)
    locals()["classifier{}_test".format(i)].insert(0 , "true_type", test_data['cancer_type_id'])

# Seventh model evaluated separately (clf7 is defined outside this excerpt).
predict7_test = clf7.predict(test_data[train_data.columns[2:]])
accuracy7_test = (predict7_test == test_data['cancer_type_id']).mean()

## the weight of performance weighted voting
# One-hot encode the true labels so per-class weights can later be derived
# for performance-weighted voting.
from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer(sparse_output = False)
valid_one_hot = encoder.fit_transform(valid_data['cancer_type_id'])
valid_one_hot = pd.DataFrame(valid_one_hot, index = valid_data.index)
test_one_hot = encoder.fit_transform(test_data['cancer_type_id'])
test_one_hot = pd.DataFrame(test_one_hot, index = test_data.index)
def main(logger=None):
    '''
    Main routine to call the entire process flow: load the breast-cancer
    dataset, plot it, then walk through a numbered series of scikit-learn
    demos (split, dummy baseline, scaling, feature selection, polynomial
    features, kernel PCA, grid search, pipelines, a voting ensemble,
    encoders, and text vectorisation), logging results along the way.

    NOTE(review): the default logger=None is immediately dereferenced with
    logger.info(...), so calling main() without a logger raises
    AttributeError -- either make the argument required or create a default
    logger here.
    '''
    # Load_Dataset --- Process starts
    logger.info(f'')  # blank separator lines (the f-prefix is redundant)
    logger.info(f'{"-"*20} Load dataset starts here {"-"*20}')
    logger.info(f'')
    # TODO: DONE; Load Cancer dataset;
    cancer_data_dict = datasets.load_breast_cancer()
    # convert2pandas_df is a project helper (defined outside this excerpt)
    # that builds a DataFrame with a named target column from the bunch.
    cancer_data_pd = convert2pandas_df(
        x_array=cancer_data_dict['data'],
        y=[
            cancer_data_dict['target_names'][i]
            for i in cancer_data_dict['target']
        ],
        # feature_names=iris_dict['feature_names'],
        feature_names=list(cancer_data_dict['feature_names']),
        target_name='Target')
    # logger.info(f'{cancer_data_pd.head()}');
    # Scatter plot of two error features coloured by diagnosis.
    sns.lmplot(x="area error",
               y="compactness error",
               data=cancer_data_pd,
               fit_reg=False,
               hue='Target',
               legend=False,
               palette=dict(malignant="#BF0C2B",
                            benign="#02173E"))  # , versicolor="#F5900E"));
    plt.legend(loc='lower right')
    # chart_save_image is a project helper that sizes/saves the figure.
    chart_save_image(plt=plt,
                     f_size=(8, 8),
                     left=0.125,
                     right=0.9,
                     bottom=0.125,
                     top=0.9,
                     wspace=0.0,
                     hspace=0.0,
                     fileName='./Cancer_Data_Plot.png')
    # Pairplot over the "mean" features only, upper triangle hidden.
    selected_columns = [
        'Target', 'mean radius', 'mean texture', 'mean perimeter',
        'mean area', 'mean smoothness', 'mean compactness', 'mean concavity',
        'mean concave points', 'mean symmetry'
    ]
    g = sns.pairplot(cancer_data_pd[selected_columns],
                     hue="Target",
                     diag_kind="kde",
                     palette=dict(malignant="#BF0C2B", benign="#02173E"),
                     diag_kws=dict(shade=True))
    for i, j in zip(*np.triu_indices_from(g.axes, 1)):
        g.axes[i, j].set_visible(False)
    chart_save_image(plt=plt,
                     f_size=(16, 16),
                     left=0.05,
                     right=0.97,
                     bottom=0.05,
                     top=0.97,
                     wspace=0.02,
                     hspace=0.02,
                     fileName='./Cancer_Data_PairPlot.png')
    logger.info(f'')
    logger.info(f'{"-"*20} Load dataset ends here {"-"*20}')
    logger.info(f'')
    # Load_Dataset --- Process ends
    # __Placeholder__ --- Process Starts
    # TODO: DONE; 001; Train test split; stratified;
    # NOTE(review): the collapsed source shows two stratify= keywords; the
    # first is reconstructed here as a commented-out line (a duplicate
    # keyword argument would be a SyntaxError) -- confirm against the
    # original file.
    X_train, X_test, y_train, y_test = train_test_split(
        cancer_data_pd[cancer_data_dict.feature_names],
        # cancer_data_pd['Target'],
        cancer_data_dict['target'],  # Has to be binary for scorer F1 and Percision;
        test_size=0.20,  # stratify=cancer_data_pd['Target'],
        stratify=cancer_data_dict['target'],
        random_state=111,
        shuffle=True)
    logger.info(f'X_train.shape : {X_train.shape}')
    logger.info(f'X_test.shape : {X_test.shape}')
    logger.info(f'Y_train.shape : {y_train.shape}')
    logger.info(f'Y_test.shape : {y_test.shape}')
    # TODO: DONE; 002; Dummy Classifier ;
    # dummy_classifier = DummyClassifier(strategy="stratified");
    dummy_classifier = DummyClassifier(strategy="most_frequent")
    # TODO: DONE; 003; Cross_over_score and predict and Metrics (make_scorer)
    # cost_accuracy is a project scoring function (defined outside this
    # excerpt) wrapped into a scorer.
    accuracy_scorer = make_scorer(cost_accuracy, greater_is_better=True)
    # NOTE(review): KFold(random_state=...) without shuffle=True is rejected
    # by modern scikit-learn (random_state has no effect unless shuffling).
    kfold = model_selection.KFold(n_splits=10, random_state=111)
    # results = model_selection.cross_val_score(dummy_classifier, X_train, y_train, cv=kfold, scoring='accuracy');
    # logger.info(f'{results} {np.mean(results)} {np.var(results)} {np.std(results)}');
    results = model_selection.cross_val_score(dummy_classifier,
                                              X_train,
                                              y_train,
                                              cv=kfold,
                                              scoring=accuracy_scorer)
    logger.info(
        f'{results} {np.mean(results)} {np.var(results)} {np.std(results)}')
    # Baseline accuracy that later sections compare against.
    DummyClassifier_mean = np.mean(results)
    # TODO: DONE; 004; Standardization ;
    # std_scaler = preprocessing.StandardScaler(); # Contains the negative values
    std_scaler = preprocessing.MinMaxScaler()  # Range between 0 to 1; No negative terms;
    std_scaler = std_scaler.fit(X_train)
    scaled_X_train = pd.DataFrame(std_scaler.transform(X_train),
                                  columns=X_train.columns)
    logger.info(f'{X_train["mean radius"].describe()}')
    logger.info(f'{scaled_X_train["mean radius"].describe()}')
    # TODO: DONE; 005; SelectKBest; Feature selection ;
    # selectKbest_est = SelectKBest(chi2, k=4); f_classif
    selectKbest_est = SelectKBest(f_classif, k=8)
    selectKbest_X_train = selectKbest_est.fit_transform(X_train, y_train)
    logger.info(f'{selectKbest_est.get_params(deep=True)}')
    logger.info(f'{selectKbest_est.get_support(indices=False)}')
    logger.info(f'{selectKbest_est.get_support(indices=True)}')
    logger.info(
        f'{X_train.columns[selectKbest_est.get_support(indices=True)]}')
    # TODO: DONE; 006; Polynomial Features ;
    poly = preprocessing.PolynomialFeatures(degree=2,
                                            include_bias=False,
                                            interaction_only=False)
    X_train_poly = poly.fit_transform(X_train)
    # NOTE(review): get_feature_names() was removed in scikit-learn 1.2
    # (get_feature_names_out is the replacement) -- this line pins the code
    # to older versions.
    X_train_p2 = pd.DataFrame(X_train_poly,
                              columns=poly.get_feature_names(X_train.columns))
    lr = linear_model.LogisticRegression(fit_intercept=False,
                                         random_state=111)
    results = model_selection.cross_val_score(lr,
                                              X_train_p2,
                                              y_train,
                                              cv=kfold,
                                              scoring=accuracy_scorer)  # , verbose=True);
    imp_percentage = round(
        (np.mean(results) - DummyClassifier_mean) / DummyClassifier_mean, 4)
    logger.info(f'DummyClassifier accuracy : {DummyClassifier_mean}')
    logger.info(f'LogisticRegression accuracy : {np.mean(results)}')
    logger.info(
        f'The improvement over the DummyClassifier is : {imp_percentage}')
    # TODO: DONE; 007; Kernel PCA ;
    # kernel_param = ('rbf', 0.25);
    kernel_param = ('rbf', 1)  # (kernel name, gamma)
    kpca = KernelPCA(n_components=4,
                     kernel=kernel_param[0],
                     gamma=kernel_param[1],
                     fit_inverse_transform=True,
                     random_state=111)  # n_jobs=-1,
    kpca.fit(scaled_X_train)  # The data has to be scaled;
    kpca_X_train = kpca.transform(scaled_X_train)
    lr = linear_model.LogisticRegression(fit_intercept=False,
                                         random_state=111)
    results = model_selection.cross_val_score(lr,
                                              kpca_X_train,
                                              y_train,
                                              cv=kfold,
                                              scoring=accuracy_scorer)  # , verbose=True);
    imp_percentage = round(
        (np.mean(results) - DummyClassifier_mean) / DummyClassifier_mean, 4)
    logger.info(f'DummyClassifier accuracy : {DummyClassifier_mean}')
    logger.info(f'LogisticRegression accuracy : {np.mean(results)}')
    logger.info(
        f'The improvement over the DummyClassifier is : {imp_percentage}')
    # TODO: DONE; 008; Grid-Search ;
    # tuned_parameters = [{
    #     'n_estimators' : [1, 10, 100, 500, 1000, 2000],
    #     'max_depth' : [10, 20],
    #     'max_features' : [0.80, 0.40],
    #     'random_state' : [111]
    # }];
    tuned_parameters = [{
        'n_estimators': [1, 10],
        'max_depth': [10, 20],
        'max_features': [0.80, 0.40],
        'random_state': [111]
    }]
    clf = GridSearchCV(RandomForestClassifier(),
                       tuned_parameters,
                       cv=5,
                       scoring=accuracy_scorer)
    clf.fit(X_train, y_train)
    logger.info(
        f'Best parameters set found on development set: {clf.best_score_} {clf.best_params_}'
    )
    logger.info('')
    logger.info('Grid scores on development set:')
    logger.info('')
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        logger.info(f'{round(mean,3)} (+/-{round(std*2,2)}) for {params}')
    logger.info('')
    logger.info('Detailed classification report:')
    logger.info('')
    logger.info('The model is trained on the full development set.')
    logger.info('The scores are computed on the full evaluation set.')
    logger.info('')
    y_true, y_pred = y_test, clf.predict(X_test)
    logger.info(f'{metrics.classification_report(y_true, y_pred)}')
    logger.info('')
    imp_percentage = round(
        (clf.best_score_ - DummyClassifier_mean) / DummyClassifier_mean, 4)
    logger.info(f'DummyClassifier accuracy : {DummyClassifier_mean}')
    logger.info(
        f'GridSearchCV RandomForestClassifier accuracy : {clf.best_score_}')
    logger.info(
        f'The improvement over the DummyClassifier is : {imp_percentage}')
    # logger.info(f'{clf.best_estimator_}');
    # TODO: DONE; 009; Customer Transformer for the pipeline ;
    # reference : https://ramhiser.com/post/2018-04-16-building-scikit-learn-pipeline-with-pandas-dataframe/
    # http://philipmgoddard.com/modeling/sklearn_pipelines
    # ColumnTypeFilter is a project transformer (defined outside this
    # excerpt) that keeps columns of the given dtype.
    ctf = ColumnTypeFilter(np.number)
    ctf.fit_transform(X_train).head()
    # TODO: YTS; 010; Pipeline ;
    custom_pipeline = make_pipeline(
        FeatureUnion(
            transformer_list=[('StdScl',
                               make_pipeline(ColumnTypeFilter(np.number),
                                             preprocessing.StandardScaler())),
                              ('MMScl',
                               make_pipeline(ColumnTypeFilter(np.number),
                                             preprocessing.MinMaxScaler()))]))
    custom_pipeline.fit(X_train)
    X_test_transformed = custom_pipeline.transform(X_test)
    logger.info(
        f'{X_test.shape} {type(X_test_transformed)} {X_test_transformed.shape}'
    )
    # TODO: DONE; 011; Ensemble (VotingClassifier) and BaseClone;
    # NOTE(review): `lr` here is the LogisticRegression from section 007;
    # VotingClassifier clones and refits it, so its unfitted state is fine.
    ensemble_clf = VotingClassifier(
        estimators=[
            ('dummy', dummy_classifier),
            ('logistic', lr),
            # ('supportvector', SVC(probability=True)),
            ('randomforest', RandomForestClassifier())
        ],
        voting='soft')
    ensemble_clf.fit(X_train, y_train)
    ensemble_clf_accuracy_ = cost_accuracy(y_test,
                                           ensemble_clf.predict(X_test))
    imp_percentage = round(
        (ensemble_clf_accuracy_ - DummyClassifier_mean) / DummyClassifier_mean,
        4)
    logger.info(f'DummyClassifier accuracy : {DummyClassifier_mean}')
    # NOTE(review): this label is copy-pasted from section 008 -- the value
    # reported is the VotingClassifier's accuracy, not GridSearchCV's.
    logger.info(
        f'GridSearchCV RandomForestClassifier accuracy : {ensemble_clf_accuracy_}'
    )
    logger.info(
        f'The improvement over the DummyClassifier is : {imp_percentage}')
    # TODO: DONE; 012; One-hot encoder; Label Encoder; Binary Encoder;
    baby_names = ['Ava', 'Lily', 'Noah', 'Jacob', 'Mia', 'Sophia']
    X_train_list = [np.random.choice(baby_names) for i in range(40)]
    X_test_list = [np.random.choice(baby_names) for i in range(6)]
    bb_labelencoder = preprocessing.LabelEncoder()
    bb_labelencoder.fit(X_train_list)
    bb_encoded = bb_labelencoder.transform(X_test_list)
    # NOTE(review): OneHotEncoder's `sparse` kwarg was renamed to
    # `sparse_output` in scikit-learn 1.2.
    bb_onehotencoder = preprocessing.OneHotEncoder(sparse=False)
    bb_encoded = bb_encoded.reshape(len(bb_encoded), 1)
    bb_onehot = bb_onehotencoder.fit_transform(bb_encoded)
    for i, v in enumerate(X_test_list):
        logger.info(
            f'Actual : {v} \t | LabelEncoded : {bb_encoded[i][0]} \t | OneHot : {bb_onehot[i]}'
        )
    # TODO: DONE; 013; Feature Extraction from image and text;
    corpus = [
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        'Is this the first document?',
    ]
    vectorizer = CountVectorizer()
    # NOTE(review): rebinding X here shadows any earlier X in this scope.
    X = vectorizer.fit_transform(corpus)
    cntvector_out = pd.DataFrame(X.toarray(),
                                 columns=vectorizer.get_feature_names())
    for i, v in enumerate(corpus):
        logger.info(f'Input text : {v}')
        # NOTE(review): this logs the input text again, not the vector --
        # probably intended to log the cntvector_out row printed below.
        logger.info(f'Output counter vector : {v}')
        logger.info(f'{cntvector_out.iloc[i]}')
# -------- Predicting with Random Forest
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_jobs=-1, random_state=0).fit(X_train, y_train)
y_train_forest = forest.predict(X_train)
y_pred_forest = forest.predict(X_test)
print('Random Forest Train Score:', np.mean(y_train == y_train_forest))
print('Random Forest Test Score:', np.mean(y_test == y_pred_forest))

# -------- Predicting with Logistic Regression
from sklearn.linear_model import LogisticRegression

# C controls the regularization strength: the smaller the value, the
# stronger the regularization.
lr = LogisticRegression(C=0.1).fit(X_train, y_train)
y_train_lr = lr.predict(X_train)
y_pred_lr = lr.predict(X_test)
print('Logistic Regression Train Score:', np.mean(y_train == y_train_lr))
print('Logistic Regression Test Score:', np.mean(y_test == y_pred_lr))

# -------- Predicting with Ensemble Voting based on the above classifiers
from sklearn.ensemble import VotingClassifier

# weights=None averages the predicted probabilities uniformly; something
# like [2, 5, 2, 1] would bias the vote towards gbrt.
eclf = VotingClassifier(estimators=[('xgboost', xgb),
                                    ('gbrt', gbrt),
                                    ('forest', forest),
                                    ('logistic regression', lr)],
                        voting='soft',
                        weights=None)
eclf = eclf.fit(X_train, y_train)
y_train_ensemble = eclf.predict(X_train)
y_pred_ensemble = eclf.predict(X_test)
print('Ensemble Voting Train Score:', np.mean(y_train == y_train_ensemble))
print('Ensemble Voting Test Score:', np.mean(y_test == y_pred_ensemble))
""" Better performance with a Voting Classifier Finally, you'll evaluate the performance of a voting classifier that takes the outputs of the models defined in the list classifiers and assigns labels by majority voting. X_train, X_test,y_train, y_test, the list classifiers defined in a previous exercise, as well as the function accuracy_score from sklearn.metrics are available in your workspace. INSTRUCTION ----------- Import VotingClassifier from sklearn.ensemble. Instantiate a VotingClassifier by setting the parameter estimators to classifiers and assign it to vc. Fit vc to the training set. Evaluate vc's test set accuracy using the test set predictions y_pred. """ # Import VotingClassifier from sklearn.ensemble from sklearn.ensemble import VotingClassifier # Instantiate a VotingClassifier vc vc = VotingClassifier(estimators=classifiers) # Fit vc to the training set vc.fit(X_train, y_train) # Evaluate the test set predictions y_pred = vc.predict(X_test) # Calculate accuracy score accuracy = accuracy_score(y_test, y_pred) print('Voting Classifier: {:.3f}'.format(accuracy))
from sklearn.model_selection import train_test_split

# 90/10 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=101)

from sklearn.naive_bayes import MultinomialNB as MNB
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import VotingClassifier as VC

# Three heterogeneous base learners combined by hard (majority) voting.
mnb = MNB(alpha=10)
lr = LR(random_state=101)
rfc = RFC(n_estimators=80, criterion="entropy", random_state=42, n_jobs=-1)
clf = VC(estimators=[('mnb', mnb), ('lr', lr), ('rfc', rfc)], voting='hard')
clf.fit(X_train,y_train)
predict = clf.predict(X_test)

from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test, predict))
print('\n')
print(classification_report(y_test, predict))


def predictor(s):
    """Vectorize raw text with the (externally defined) vectorizer, classify it, and print the labels."""
    s = vectorizer.transform(s)
    pre = clf.predict(s)
    print(pre)


predictor(['I\'m on the Mexican, whoa oh oh, radio.'])
# print('Fact x: \n', qtable_X, '\n') # print('Fact y: \n', qtable_decisioned, '\n') clf1 = LogisticRegression(multi_class='multinomial', random_state=1) clf2 = RandomForestClassifier(n_estimators=50, random_state=1) clf3 = GaussianNB() model1 = clf1.fit(qtable_X, qtable_decisioned_X) model2 = clf2.fit(qtable_X, qtable_decisioned_X) model3 = clf3.fit(qtable_X, qtable_decisioned_X) eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard') eclf1 = eclf1.fit(qtable_X, qtable_decisioned_Y) y_pred1 = eclf1.predict(qtable_Y) print(eclf1.predict(qtable_X)) np.array_equal(eclf1.named_estimators_.lr.predict(qtable_X), eclf1.named_estimators_['lr'].predict(qtable_X)) eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft') eclf2 = eclf2.fit(qtable_X, qtable_decisioned_Y) y_pred2 = eclf2.predict(qtable_Y) print(eclf2.predict(qtable_X)) eclf3 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft',
clf.predict(test_[cols]) preds = clf.predict_proba(test_[cols]) #print(confusion_matrix(test['class'], clf.predict(test[cols]))) print (pd.crosstab(test_['TripType'], clf.predict(test_[cols]), rownames=["Actual"], colnames=["Predicted"])) print (classification_report(test_['TripType'], clf.predict(test_[cols]))) score=accuracy_score(test_['TripType'],clf.predict(test_[cols])) table.append([score]) print (table) eclf = VotingClassifier(estimators = [('BaggingKNN', BaggingClassifier(KNeighborsClassifier(20))), ('RandomForest', RandomForestClassifier(10)), ('BaggingCART', BaggingClassifier(DecisionTreeClassifier()))], voting='soft', weights=[7,1,1]) eclf.fit(train[cols], train["TripType"]) #use the classifier to predict predicted=eclf.predict(test[cols]) #print (accuracy_score(predicted,test['TripType'])) #print(classification_report(predicted,test['TripType'])) ''' OvR = OneVsRestClassifier(BaggingClassifier((LogisticRegression()))).fit(train_[cols], train_["TripType"]) predicted = OvR.predict(test_[cols]) print accuracy_score(predicted,test_['TripType']) rf = RandomForestClassifier() rf.fit(train_[cols], train_["TripType"]) predicted = rf.predict(test_[cols]) print accuracy_score(predicted,test_['TripType']) ada = AdaBoostClassifier() ada.fit(train_[cols], train_["TripType"])
# Gradient-boosting member of the ensemble ('deviance' == log-loss).
m3 = GradientBoostingClassifier(loss='deviance', learning_rate=0.3, n_estimators=200,
                                max_depth=7, min_samples_leaf=10, random_state=7,
                                max_features=None, verbose=1)
# Soft-voting ensemble: SVM / random forest / gradient boosting, weighted 4:5:1.
# m1 and m2 are defined elsewhere in the file.
model = VotingClassifier(weights=[4, 5, 1], voting='soft',
                         estimators=[('SVM', m1), ('Rnd Forest', m2), ('Grad Boost', m3)])
# Train on the first 30k rows only; runModel is an external helper
# (presumably fits and optionally tunes the model -- confirm at its definition).
model = runModel(model=model, trainX=X_train[0:30000], trainY=y_train[0:30000],
                 optimize=False, parameters=None, scoring='roc_auc')
print "Applying Model ..."
start = time()
y_pred = model.predict(X_test)
print("Model took %.2f seconds to predict vals" % (time() - start))
### Evaluation
print "Scoring Classifier..."
start = time()
score = model.score(X_test, y_test)
recall = metrics.recall_score(y_test, y_pred, average='binary')
auc = metrics.roc_auc_score(y_test, y_pred, average='macro')
confusion = metrics.confusion_matrix(y_test, y_pred, labels=[0, 1])
print "Score: \t \t Recall: \t AUC:\n", score, recall, auc
print("Model took %.2f seconds to score" % (time() - start))
def AllModels (file, in_columns, out_columns):
    """Train several classifiers on a CSV file and print their validation accuracies.

    file        -- path to a comma-separated data file (first two rows skipped)
    in_columns  -- column indices used as features
    out_columns -- column indices used as the target
    Returns the validation accuracy of the final hard-voting ensemble.
    """
    data = numpy.genfromtxt(file ,delimiter="," , autostrip = True )
    # Drop the two header rows.
    data = data[2:]
    # numpy.asarray(numpy.random.shuffle(data[:2400]))
    array = data
    # Trim 50 rows from each end, then impute missing values with column means.
    X = array[50:-50,in_columns]
    # print X
    X = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0).fit_transform(X)
    Y = array[50:-50,out_columns]
    #print X
    Y = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0).fit_transform(Y)
    # print Y
    validation_size = 0.2
    #scoring = 'accuracy'
    # X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state = 0)
    # Deterministic split: first 2400 rows train, remainder validate.
    X_train, X_validation, Y_train, Y_validation = X[0:2400], X[2400:], Y[0:2400], Y[2400:]
    # print X_train.pvalues_()
    # --- Quick accuracy survey of six off-the-shelf classifiers. ---
    lr = LogisticRegression()
    lr.fit(X_train, Y_train)
    predictions = lr.predict (X_validation)
    print 'LR : ' + str(accuracy_score(Y_validation, predictions))
    lda = LinearDiscriminantAnalysis()
    lda.fit(X_train, Y_train)
    predictions = lda.predict (X_validation)
    print 'LDA: ' +str(accuracy_score(Y_validation, predictions))
    knn = KNeighborsClassifier()
    knn.fit(X_train, Y_train)
    predictions = knn.predict (X_validation)
    print 'KNN: '+str(accuracy_score(Y_validation, predictions))
    rf = DecisionTreeClassifier()
    rf.fit(X_train, Y_train)
    predictions = rf.predict (X_validation)
    print 'DT : ' +str(accuracy_score(Y_validation, predictions))
    nb = GaussianNB()
    nb.fit(X_train, Y_train)
    predictions = nb.predict (X_validation)
    print 'NB : '+str(accuracy_score(Y_validation, predictions))
    svm = SVC()
    svm.fit(X_train, Y_train)
    predictions = svm.predict (X_validation)
    print 'SVM: '+str(accuracy_score(Y_validation, predictions))
    print '--------------------'
    # Larger tree ensembles, scored directly.
    rf=RandomForestClassifier(n_estimators=300, criterion='gini', max_depth=None,
                              min_samples_split=2, min_samples_leaf=1,
                              min_weight_fraction_leaf=0.0, max_features='auto',
                              max_leaf_nodes=None, min_impurity_decrease=0.0,
                              min_impurity_split=None, bootstrap=True, oob_score=False,
                              n_jobs=1, random_state=None, verbose=0,
                              warm_start=False, class_weight=None)
    rf.fit(X_train, Y_train)
    print 'rf: '+str(rf.score(X_validation,Y_validation))
    et=ExtraTreesClassifier(n_estimators=300, criterion='gini', max_depth=None,
                            min_samples_split=2, min_samples_leaf=1,
                            min_weight_fraction_leaf=0.0, max_features='auto',
                            max_leaf_nodes=None, min_impurity_decrease=0.0,
                            min_impurity_split=None, bootstrap=False, oob_score=False,
                            n_jobs=1, random_state=None, verbose=0,
                            warm_start=False, class_weight=None)
    et.fit(X_train, Y_train)
    print 'et: '+ str(et.score(X_validation,Y_validation))
    #cnf_matrix = confusion_matrix(Y_validation, y_pred)
    #print cnf_matrix
    # NOTE: 'rf' is rebound from a fitted model to a list of extra-trees
    # variants (max_features = 6, 12, 18, 24) used as voting members.
    rf = []
    for i in range(1,5):
        rf.append(ExtraTreesClassifier(n_estimators=300, criterion='gini', max_depth=None,
                                       min_samples_split=2, min_samples_leaf=1,
                                       min_weight_fraction_leaf=0.0, max_features=i*6,
                                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                                       min_impurity_split=None, bootstrap=True,
                                       oob_score=False, n_jobs=1, random_state=None,
                                       verbose=0, warm_start=False, class_weight=None))
    #cnf_matrix = confusion_matrix(Y_validation, y_pred)
    #print cnf_matrix
    # Name each member by its list index, then add two LDA entries.
    l = []
    for i in range(len(rf)):
        l.append((str(i),rf[i]))
    lda = LinearDiscriminantAnalysis()
    # l.append(('a',lda))
    # l.append(('b',lda))
    l.append(('c',lda))
    l.append(('d',lda))
    ecl = VotingClassifier(estimators = l, voting = 'hard')
    # ecl = AdaBoostClassifier(base_estimator = rf[0])
    ecl.fit(X_train, Y_train)
    y_pred = ecl.predict(X_validation)
    ret = accuracy_score(Y_validation, y_pred)
    print ret
    # 7-class confusion matrix over the label range [-3, 3].
    cnf_matrix = confusion_matrix(Y_validation, y_pred,labels=[-3,-2,-1,0,1,2,3])
    #print cnf_matrix
    # s1 = total sample count; s = count within the negative-only and
    # positive-only diagonal blocks (row/col 3, label 0, is excluded).
    s1 = 0.0
    for i in cnf_matrix:
        s1 = s1 + sum(i)
    print '---------------'
    s = 0.0
    for i in cnf_matrix[0:3,0:3]:
        s = s+sum(i)
    for i in cnf_matrix[4:7,4:7]:
        s = s+sum(i)
    print s/s1
    return ret
# #clf = RandomForestClassifier() clf1 = RandomForestClassifier(max_depth=5, n_estimators=20, max_features=1) #clf2 = SVC(kernel="linear", C=0.025) #Kernel: linear, poly, rbf #clf2 = GradientBoostingClassifier(n_estimators=100) #clf3 = linear_model.LogisticRegression(C=1e5) clf2 = RandomForestClassifier(random_state=1) clf3 = RandomForestClassifier(random_state=2) print 'Testing... ' eclf1 = VotingClassifier(estimators=[('rf', clf1), ('svm', clf2), ('gb', clf3)], voting='hard') eclf1 = eclf1.fit(X_data, Y_data) predictions = eclf1.predict(x_data) #print y_data #print predictions #clf.fit(X_train, y_train) print classification_report(y_data, predictions) #predictions = clf.predict(X_test) #print y_test #print predictions #scores = cross_validation.cross_val_score(clf, x_data, targets_data, cv=5) #print "" #print "CV Scores....: ", scores #print "CV Mean score: ", sum(scores)/(len(scores)*1.0)
# ### 6.2 Ensemble modeling
# #### 6.2.1 Combining models
#
# The five tuned base models are blended with a VotingClassifier; soft voting
# is used so each estimator's class probabilities contribute to the vote.

# In[75]:

votingC = VotingClassifier(
    estimators=[('rfc', RFC_best), ('extc', ExtC_best), ('svc', SVMC_best),
                ('adac', ada_best), ('gbc', GBC_best)],
    voting='soft', n_jobs=4)
votingC = votingC.fit(X_train, Y_train)

# ### 6.3 Prediction
# #### 6.3.1 Predict and Submit results

# In[76]:

# Write the submission file: one Survived prediction per passenger id.
test_Survived = pd.Series(votingC.predict(test), name="Survived")
results = pd.concat([IDtest, test_Survived], axis=1)
results.to_csv("ensemble_python_voting.csv", index=False)

# If you found this notebook helpful or you just liked it , some upvotes would be very much appreciated - That will keep me motivated :)
# In[ ]:

# Soft-voting blend of the three best-tuned estimators.
votingC = VotingClassifier(
    estimators=[('svc', SVMC_best), ('rfc', RFC_best), ('lrc', LRC_best)],
    voting='soft', n_jobs=4)
votingC = votingC.fit(X_train, Y_train)

# ### 6. Prediction and Submission

# In[ ]:

'''
test_Survived = pd.Series(votingC.predict(test), name="Survived")
results = pd.concat([IDtest,test_Survived],axis=1)
results.to_csv("Titanic_test_set_prediction.csv",index=False)
'''

# In[ ]:

# Build and save the submission dataframe.
test_Survived = votingC.predict(test).astype(int)
submission = pd.DataFrame({"PassengerId": IDtest, "Survived": test_Survived})
submission.to_csv('Titanic_test_prediction_V9.csv', index=False)

# In[ ]:

# Training-set accuracy (an optimistic estimate of generalization).
accuracy_score(Y_train, votingC.predict(X_train))
# Look up the per-target selected C and its CV score; C1 is an external table.
row = C1.query(query)
C = float(row['select_C'])
clf.set_params(clf__C=C)
w = float(row['select_mean'])
weights.append(w)
# set weight to mean of CV scores for selected C
vot_clf.set_params(weights=weights)
vot_clf.fit(target_train_data.Tweet, true_stances)
# predict on test data
index = test_data.Target == target
test_tweets = test_data.loc[index, 'Tweet']
test_data.loc[index, 'Stance'] = vot_clf.predict(test_tweets)
# predict on training data too to gauge overfitting
index = train_data.Target == target
train_tweets = train_data.loc[index, 'Tweet']
pred_stances = vot_clf.predict(train_tweets)
print classification_report(true_stances, pred_stances, digits=4)
# Official SemEval-style metric: macro average of F(FAVOR) and F(AGAINST) only.
macro_f = fbeta_score(true_stances, pred_stances, 1.0,
                      labels=['AGAINST', 'FAVOR'], average='macro')
print 'macro-average of F-score(FAVOR) and F-score(AGAINST): {:.4f}\n'.format(
    macro_f)
# Logistic regression (disabled)
# lr = LogisticRegression()
# lr.fit(tfidf_train, target_train)
# lr_pred = lr.predict(tfidf_test)

# Random forest (disabled)
# rf = RandomForestClassifier(random_state=1)
# rf.fit(tfidf_train, target_train)
# rf_pred=rf.predict(tfidf_test)

# Combined classifiers
# Hard majority vote over naive Bayes, decision tree and SVM.
vote = VotingClassifier(estimators=[('nb', bayes), ('dt', tre), ('svm', svc)], voting='hard')
vote.fit(tfidf_train, target_train)
vote_pred = vote.predict(tfidf_test)

# AdaBoost (disabled)
# ab = AdaBoostClassifier()
# ab.fit(tfidf_train, target_train)
# ab_pred = ab.predict(tfidf_test)

# Bagging (disabled)
# bag = BaggingClassifier()
# bag.fit(tfidf_train, target_train)
# bag_pred = bag.predict(tfidf_test)

# Gradient boosting -- abandoned after a MemoryError.
# gb = GradientBoostingClassifier()
# gb.fit(tfidf_train, target_train)
# gb_pred = gb.predict(tfidf_test.toarray())

# Unsupervised methods follow.
# Random forest member (stumps: max_depth=1, out-of-bag scoring enabled).
clf2 = RandomForestClassifier(n_estimators=50, max_depth=1, min_samples_split=4,
                              min_samples_leaf=54, oob_score=True)
clf2.fit(X_train, y_train)
# Test-set accuracy of the forest alone:
# print(clf2.score(X_test, y_test))
# print(confusion_matrix(y_test, clf2.predict(X_test)))
print("*" * 100)

# Decision-tree member.
tre = DecisionTreeClassifier(criterion='gini', splitter='best')
tre.fit(X_train, y_train)
# Test-set accuracy of the tree alone:
# print(tre.score(X_test, y_test))
# print(confusion_matrix(y_test, tre.predict(X_test)))

# Hard-voting ensemble of the tree, the forest and the external SVM cls1.
eclf = VotingClassifier(estimators=[('svcnl', tre), ('rf', clf2), ('svc', cls1)], voting='hard')
eclf.fit(X_train, y_train)

# Test-set accuracy for each member and for the ensemble.
print("线性svm ", cls1.score(X_test, y_test))
print("非线性svm ", tre.score(X_test, y_test))
print("随机森林 ", clf2.score(X_test, y_test))
print("集成学习", eclf.score(X_test, y_test))
print(confusion_matrix(y_test, eclf.predict(X_test)))
def test_set_estimator_drop():
    # VotingClassifier set_params should be able to set estimators as drop
    # Test predict
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(n_estimators=10, random_state=123)
    clf3 = GaussianNB()
    # eclf1 never uses 'rf' (weight 0); eclf2 will drop 'rf' entirely --
    # with hard voting their predictions must then match.
    eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('nb', clf3)],
                             voting='hard', weights=[1, 0, 0.5]).fit(X, y)
    eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('nb', clf3)],
                             voting='hard', weights=[1, 1, 0.5])
    # pytest.warns(None) asserts that no warning at all is raised.
    with pytest.warns(None) as record:
        with warnings.catch_warnings():
            # scipy 1.3.0 uses tostring which is deprecated in numpy
            warnings.filterwarnings("ignore", "tostring", DeprecationWarning)
            eclf2.set_params(rf='drop').fit(X, y)
    assert not record
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    # The dropped estimator stays visible in params but not in the fitted list.
    assert dict(eclf2.estimators)["rf"] == 'drop'
    assert len(eclf2.estimators_) == 2
    assert all(
        isinstance(est, (LogisticRegression, GaussianNB))
        for est in eclf2.estimators_)
    assert eclf2.get_params()["rf"] == 'drop'
    # Same check with soft voting.
    eclf1.set_params(voting='soft').fit(X, y)
    with pytest.warns(None) as record:
        with warnings.catch_warnings():
            # scipy 1.3.0 uses tostring which is deprecated in numpy
            warnings.filterwarnings("ignore", "tostring", DeprecationWarning)
            eclf2.set_params(voting='soft').fit(X, y)
    assert not record
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    # Dropping every estimator must raise.
    msg = 'All estimators are dropped. At least one is required'
    with pytest.warns(None) as record:
        with pytest.raises(ValueError, match=msg):
            eclf2.set_params(lr='drop', rf='drop', nb='drop').fit(X, y)
    assert not record
    # Test soft voting transform
    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    eclf1 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft', weights=[0, 0.5],
                             flatten_transform=False).fit(X1, y1)
    eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft', weights=[1, 0.5],
                             flatten_transform=False)
    with pytest.warns(None) as record:
        with warnings.catch_warnings():
            # scipy 1.3.0 uses tostring which is deprecated in numpy
            warnings.filterwarnings("ignore", "tostring", DeprecationWarning)
            eclf2.set_params(rf='drop').fit(X1, y1)
    assert not record
    # flatten_transform=False keeps one (n_samples, n_classes) slab per member.
    assert_array_almost_equal(
        eclf1.transform(X1),
        np.array([[[0.7, 0.3], [0.3, 0.7]],
                  [[1., 0.], [0., 1.]]]))
    assert_array_almost_equal(eclf2.transform(X1),
                              np.array([[[1., 0.],
                                         [0., 1.]]]))
    # With hard voting, transform returns the members' label predictions.
    eclf1.set_params(voting='hard')
    eclf2.set_params(voting='hard')
    assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
    assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
def analisar_features(train_text, n_gram=1, pos=False, tags=False, dep=False,
                      stem=False, remove_stop_words=False, remove_punct=False,
                      ent=False, alpha=False, lex=False, file_path='log.txt'):
    """Evaluate a feature configuration with a 10-fold CV of a voting ensemble.

    Preprocesses train_text with the given feature flags, trains a hard-voting
    classifier (LR / RF / MultinomialNB / SVC) per fold, prints per-fold
    reports, and appends mean +/- 2*std precision/recall/F1 to file_path.
    Relies on module-level names: Preprocessor, train_target, categories.
    """
    print('Features utilizadas: \n')
    print('NGRAM: ' + str(n_gram) + '\n')
    print('tags: ' + str(tags) + '\n')
    print('pos: ' + str(pos) + '\n')
    print('dep: ' + str(dep) + '\n')
    print('stem: ' + str(stem) + '\n')
    print('ent: ' + str(ent) + '\n')
    print('alpha: ' + str(alpha) + '\n')
    print('Remove stopwords: ' + str(remove_stop_words) + '\n')
    print('Remove ponctuation: ' + str(remove_punct) + '\n\n')
    print('Processando texto...')
    # Feature extraction -- the flags are forwarded to the external Preprocessor.
    processor = Preprocessor()
    train_text = processor.process_dataset(train_text, n_gram=n_gram, stem=stem,
                                           tags=tags,
                                           remove_stop_words=remove_stop_words,
                                           remove_punct=remove_punct, pos=pos,
                                           dep=dep, alpha=alpha,
                                           vectorizer='count', lex=lex)
    ## Training the ensemble ##
    print('Treinando modelo...')
    clf1 = LogisticRegression(solver='lbfgs', multi_class='multinomial',
                              random_state=1)
    clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
    # clf3 = SGDClassifier(loss='hinge', penalty='l2',
    #                      alpha=1e-3, random_state=42,
    #                      max_iter=7, tol=None)
    clf3 = MultinomialNB()
    clf4 = SVC(C=100, gamma=5e-05, kernel='rbf')
    # Hard majority vote over the four classifiers.
    text_clf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                            ('mnb', clf3), ('svm', clf4)],
                                voting='hard')
    # Append the configuration header to the log file.
    file = open(file_path, 'a')
    file.write('Features utilizadas: \n')
    file.write('NGRAM: ' + str(n_gram) + '\n')
    file.write('pos: ' + str(pos) + '\n')
    file.write('dep: ' + str(dep) + '\n')
    file.write('tags: ' + str(tags) + '\n')
    file.write('stem: ' + str(stem) + '\n')
    file.write('ent: ' + str(ent) + '\n')
    file.write('alpha: ' + str(alpha) + '\n')
    file.write('Remove stopwords: ' + str(remove_stop_words) + '\n')
    file.write('Remove ponctuation: ' + str(remove_punct) + '\n\n')
    # 10-fold cross-validation; metrics collected per fold.
    kf = KFold(n_splits=10)
    f1 = []
    precision = []
    recall = []
    for train_index, test_index in kf.split(train_text):
        # print('Kfold train_index: ', train_index, '\ntest_index: ', test_index)
        X_train, X_test = extract_indexes(train_text, train_index), extract_indexes(
            train_text, test_index)
        y_train, y_test = extract_indexes(train_target, train_index), extract_indexes(
            train_target, test_index)
        print(' train target ', extract_indexes(train_target, train_index))
        print(' test target ', extract_indexes(train_target, test_index))
        text_clf.fit(X_train, y_train)
        y_pred = text_clf.predict(X_test)
        print(confusion_matrix(y_test, y_pred))
        print(
            metrics.classification_report(y_test, y_pred,
                                          target_names=categories))
        file.write(
            metrics.classification_report(y_test, y_pred,
                                          target_names=categories))
        precision.append(metrics.precision_score(y_test, y_pred))
        recall.append(metrics.recall_score(y_test, y_pred))
        f1.append(metrics.f1_score(y_test, y_pred))
    # Aggregate fold metrics; +/- 2*std approximates a 95% interval.
    f1 = np.array(f1)
    precision = np.array(precision)
    recall = np.array(recall)
    f1_mean = f1.mean()
    precision_mean = precision.mean()
    recall_mean = recall.mean()
    f1_std = f1.std()
    precision_std = precision.std()
    recall_std = recall.std()
    print('Escrevendo arquivo de log\n')
    file.write('Recall Macro: ' + str(recall_mean) + ' (+/-) ' +
               str(recall_std * 2) + '\n')
    file.write('Precision Macro: ' + str(precision_mean) + ' (+/-) ' +
               str(precision_std * 2) + '\n')
    file.write('F1 Macro: ' + str(f1_mean) + ' (+/-) ' + str(f1_std * 2) + '\n')
    file.write('\n\n#############################################\n\n')
    file.close()
def SVM():
    """Train (or reuse) a soft-voting classifier on the newest CSV in the data dir.

    Loads the most recently created CSV, splits 80/20, loads a previously saved
    model if one exists (backing it up) or builds a fresh RF/KNN/SVC voting
    ensemble, cross-validates, fits, and persists the model when test accuracy
    exceeds 0.96.
    """
    #s = os.listdir(pathAttributes.data)
    file_path_data = os.path.join(pathAttributes.data, "*.csv")
    list_of_file = glob.glob(file_path_data)
    # Pick the newest file by creation time.
    latest_data = max(
        list_of_file, key=os.path.getctime
    )  #-1 is idname.txt because number always bigger than charact
    print(latest_data)
    file_path = os.path.join(pathAttributes.data, latest_data)
    df = pd.read_csv(file_path, header=None)
    # Sentinel for missing values.
    df.replace('?', -99999, inplace=True)
    # Column 0 is the label; columns 1..128 are features.
    X = np.array(df[df.columns[1:129]])
    X.reshape(-1, 1)
    y = np.array(df[df.columns[0]])
    #print(X)
    #scaler = StandardScaler()
    #X = scaler.fit_transform(X.astype(np.float64))
    """ to-do-list 1.scaled inputs 2.cross_validation 3.grid search """
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=0.2, random_state=42)
    try:
        # Reuse the saved model if present.
        clt = joblib.load(pathAttributes.SVM_model)
        #backup the current correct model
        su.copy(pathAttributes.SVM_model, pathAttributes.backup)
    except:
        #clt = SGDClassifier(loss="hinge",penalty="l2", random_state=42, warm_start=True)
        # Fresh soft-voting ensemble: random forest + SVC + KNN.
        rnd_clt = RandomForestClassifier(n_estimators=867, max_leaf_nodes=6)
        svc_clt = svm.SVC(kernel="linear", C=1.6, probability=True)
        knn_clt = KNeighborsClassifier()
        clt = VotingClassifier(estimators=[('rc', rnd_clt), ('kc', knn_clt),
                                           ('sc', svc_clt)],
                               voting='soft')
        #clt = svm.LinearSVC(penalty='l2', loss="hinge", C=1.6)
        #randomizedsearchCV for randomforestclassifier and knn
        """ param = { 'n_estimators':randint(low=1,high=1000), 'max_leaf_nodes':[6,None], } rnd_search = RandomizedSearchCV(clt, param_distributions=param, cv=3, scoring='accuracy') rnd_search.fit(X_train,y_train) print(rnd_search.best_params_,rnd_search.best_score_) print(rnd_search.cv_results_) """
        """ param = [ {'n_neighbors':[1,3,5,7,9]}, ] rnd_search = GridSearchCV(clt, param, cv=3, scoring='accuracy') rnd_search.fit(X_train,y_train) print(rnd_search.best_params_,rnd_search.best_score_) print(rnd_search.cv_results_) """
    # 3-fold CV predictions and scores for a quick sanity check.
    a = cross_val_predict(clt, X_train, y_train, cv=3)
    b = cross_val_score(clt, X_train, y_train, cv=3)
    print(confusion_matrix(y_train, a), b)
    total = len(df.index)
    print(total)
    chunk_size = 1000
    epochs = 1000
    """ classes = [] for i in clt.classes_: classes.append(i) for i in np.unique(y): flag = False for j in classes: if j == i: flag = True if not flag: classes.append(i) chunks = int(total/chunk_size)+1 for epoch in range(epochs): for chunk in range(chunks): starter = chunk * chunk_size if starter+chunk_size > total: clt.partial_fit(X_train[starter:total+1],y_train[starter:total+1],classes=classes) #clt.fit(X_train[starter:total+1],y_train[starter:total+1]) else: clt.partial_fit(X.train[starter:starter+chunk_size],y_train[starter:starter+chunk_size],classes=np.unique(y)) #clt.fit(X_train[starter:total+1],y_train[starter:chunk_size]) """
    clt.fit(X_train, y_train)
    # NOTE(review): X_validation is computed but never used below.
    indecs = np.random.permutation(int(total / 3))
    X_validation = X_train[indecs]
    y_predict = clt.predict(X_test)
    print(confusion_matrix(y_test, y_predict))
    # Persist the model only when it clears the accuracy bar.
    if clt.score(X_test, y_test) > 0.96:
        joblib.dump(clt, pathAttributes.SVM_model)
# Candidate text-classification pipelines: each couples a vectorizer
# (and, for LR, a tf-idf transform) with a classifier.
MNB_pipeline = Pipeline([('vect', CountVectorizer(ngram_range = (1, 2))),
                         ('clf', MultinomialNB(alpha = 1.0, fit_prior = True)),
                         ])
KNN_pipeline = Pipeline([('vect', CountVectorizer()),
                         ('clf', KNeighborsClassifier(n_neighbors = 20)),
                         ])
SGD_pipeline = Pipeline([('vect', CountVectorizer()),
                         ('clf', linear_model.SGDClassifier(loss='log')),
                         ])
LR_pipeline = Pipeline([('vect', CountVectorizer()),
                        ('tfidf', TfidfTransformer(norm = 'l2', use_idf = True,
                                                   smooth_idf = True,
                                                   sublinear_tf = True)),
                        ('clf', LogisticRegression(warm_start = True,
                                                   random_state = 1)),
                        ])

# Weighted soft-voting ensemble; the KNN pipeline is deliberately left out.
eclf = VotingClassifier(estimators=[('MNB', MNB_pipeline),
                                    ('SGD',SGD_pipeline),
                                    ('LR', LR_pipeline)],
                        voting = 'soft', weights = [3,2,3])  # ('KNN', KNN_pipeline),
eclf.fit(rev_train,labels_train)

#use soft voting to predict (majority voting)
pred=eclf.predict(rev_test)

# One predicted label per output line.
fileWriter.writelines(str(x)+'\n' for x in pred)
fileWriter.close()
class Brain(object):
    """A thin wrapper around a scikit-learn VotingClassifier.

    Holds the ensemble ("lobe"), trains it from a labeled dataframe, and
    supports prediction, scoring, and pickle-based persistence via joblib.
    """

    def __init__(self, lobes=False):
        """Build (or load) the voting ensemble.

        lobes -- either a dict mapping names to classifiers for the
        VotingClassifier, or a pickle filename/path to load a saved brain.
        Defaults to a RandomForestClassifier + DecisionTreeClassifier pair.
        """
        # A string argument is treated as a saved-brain path; on failure we
        # fall through to building the default ensemble.
        if isString(lobes):
            try:
                self.load(lobes.split('.pickle')[0])
            except Exception as e:
                logger.exception(e)
                lobes = False
        if not lobes:
            lobes = {'rf': RandomForestClassifier(n_estimators=7,
                                                  random_state=666),
                     'dt': DecisionTreeClassifier()
                     }
        # Hard majority vote across all configured lobes, trained in parallel.
        self.lobe = VotingClassifier(
            estimators=[(lobe, lobes[lobe]) for lobe in lobes],
            voting='hard',
            n_jobs=-1)
        self._trained = False

    def train(self, df, shuffle=True, preprocess=False, *args, **kwargs):
        """Fit the lobe on a dataframe of features plus a 'label' column.

        shuffle    -- randomize row order before fitting
        preprocess -- standardize features with sklearn.preprocessing.scale
        """
        if self._trained:
            logger.warning('Overwriting an already trained brain!')
            self._trained = False
        # shuffle data for good luck
        if shuffle:
            df = shuffleDataFrame(df)
        # scale train data and fit lobe
        x = df.drop('label', axis=1).values
        y = df['label'].values
        del df
        if preprocess:
            x = preprocessing.scale(x)
        logger.info('Training with %d samples', len(x))
        self.lobe.fit(x, y)
        self._trained = True

    def predict(self, df):
        """Return the ensemble's predictions for a feature dataframe."""
        return self.lobe.predict(prepDataframe(df).values)

    def score(self, df, test='predict'):
        """Return accuracy of column `test` against the 'label' column of df."""
        df = prepDataframe(df)
        return accuracy_score(df[test].values, df['label'].values)

    def save(self, location="brain"):
        """Pickle the trained lobe to <location>.pickle (no-op if untrained)."""
        if self._trained:
            joblib.dump(self.lobe, location + ".pickle")
            logger.info('Brain %s saved', location + '.pickle')
        else:
            # logger.error returns None, so this just logs and exits.
            return logger.error('Brain is not trained yet! Nothing to save...')

    def load(self, location="brain"):
        """Load a pickled lobe from <location>.pickle and mark it trained."""
        logger.info('Loading saved brain %s', location + '.pickle')
        self.lobe = joblib.load(location + ".pickle")
        self._trained = True
# Base learners for the ensemble.
rfClf = RandomForestClassifier(n_estimators=500, random_state=0)  # 500 trees.
svmClf = SVC(probability=True, random_state=0)  # probability calculation
logClf = LogisticRegression(random_state=0)
nbclf = GaussianNB()

# Probability-averaging (soft) vote across the four base learners.
clf2 = VotingClassifier(
    estimators=[('rf', rfClf), ('svm', svmClf), ('log', logClf), ('nb', nbclf)],
    voting='soft')

# train the ensemble classifier
clf2.fit(X_train, y_train)

from sklearn.metrics import precision_score, accuracy_score

# Training-split metrics.
x_actual, x_pred = y_train, clf2.predict(X_train)
precision_score_VC_train = precision_score(x_actual, x_pred)
accuracy_score_VC_train = accuracy_score(x_actual, x_pred)
print('The precision score of Voting classifier on TRAIN is : ',
      round(precision_score_VC_train * 100, 2), '%')
print('The accuracy score of Voting classifier on TRAIN is : ',
      round(accuracy_score_VC_train * 100, 2), '%')

from sklearn.metrics import precision_score, accuracy_score

# Held-out test-split metrics.
y_actual, y_pred = y_test, clf2.predict(X_test)
precision_score_VC_test = precision_score(y_actual, y_pred)
accuracy_score_VC_test = accuracy_score(y_actual, y_pred)
print('The precision score of Voting classifier on Test is : ',
      round(precision_score_VC_test * 100, 2), '%')
print('The accuracy score of Voting classifier on Test is : ',
      round(accuracy_score_VC_test * 100, 2), '%')
Ypred = np.zeros(Y.shape, dtype='object') print 'Classification using Ensemble' for train_index, test_index in sss: print "Iter", itr, X_train, X_test = X[train_index], X[test_index] y_train, y_test = Y[train_index], Y[test_index] clf1 = KNeighborsClassifierknn = KNeighborsClassifier(n_neighbors=5, weights= 'distance', metric='manhattan') clf2 = RandomForestClassifier(n_estimators=300, max_depth=30, bootstrap=False, class_weight="balanced", min_samples_split = 10) #clf3 = tree.DecisionTreeClassifier(max_depth=10, splitter='best', min_samples_split=81) clf = VotingClassifier(estimators=[('knn', clf1), ('rf', clf2)], voting='soft') clf = clf.fit(X_train, y_train) Ypred[test_index] = clf.predict(X_test) result = clf.predict(X_train) tr_acc = float(np.sum(y_train==result))/float(y_train.shape[0]) accuracy = float(np.sum(y_test==Ypred[test_index]))/float(y_test.shape[0]) print " => Train Accuracy = %.4f, Accuracy = %.4f" % (tr_acc, accuracy) itr += 1 accuracy = float(np.sum(Y==Ypred))/float(Y.shape[0]) print "=== Total accuracy = ", accuracy, ' ===' print '' print clf cm = confusion_matrix(Y, Ypred, labels=classes) print cm print clf
# Three tuned base learners, each fitted on the training split.
svm = SVC(probability=True, C=3)
svm.fit(X_train, y_train)

knn = KNeighborsClassifier(metric='minkowski', n_neighbors=70, p=1,
                           weights='distance')
knn.fit(X_train, y_train)

grad = GradientBoostingClassifier(learning_rate=0.1, max_depth=2,
                                  max_features=2, n_estimators=400,
                                  subsample=1.0)
grad.fit(X_train, y_train)

# Plot the gradient-boosting feature importances.
fi = pd.DataFrame({'feature': X.columns.values,
                   'importance': grad.feature_importances_})
plotting.p12(fi)

# Soft-voting ensemble over the three fitted models.
estimators = [('knn', knn), ('svm', svm), ('grad', grad)]
eclf = VotingClassifier(estimators=estimators, voting='soft')
eclf.fit(X_train, y_train)
pred_eclf = eclf.predict(X_test)

# Earlier grid-search parameter spaces, kept for reference:
# params = [{'svm__C': range(1,50)
#            },
#           {'grad__n_estimators': [400, 500, 700, 1000],
#            'grad__max_depth': range(1,6),
#            'grad__subsample': [0.2, 0.6, 1.0],
#            'grad__learning_rate': [0.1, 0.5, 1.0],
#            'grad__max_features': [1, 2, 4, 'auto', 'log2', None]
#            },
#           {'knn__n_neighbors': [3, 5, 10, 20, 40],
#            'knn__p': range(1,7)
#            }]
# params = {'n_estimators': [400, 500, 700, 1000],
#           'max_depth': range(1,6),
#           'subsample': [0.2, 0.6, 1.0],
#           'learning_rate': [0.1, 0.5, 1.0],
print("xgb cross validation f1-score = ", cross_val_score(xgb, X_train, y_train, cv=5, scoring="f1_micro").mean() ) #xgboost with full train set (all features) print("mlp cross validation f1-score = ", cross_val_score(mlp, X_train, y_train, cv=5, scoring="f1_micro").mean()) #initialize ensembles estimators = [] estimators.append(('mlp', mlp)) estimators.append(('rf', rf)) estimators.append(('xgb', xgb)) #voting ensemlbe ensemble = VotingClassifier(estimators, voting='soft', weights=[1, 1, 1]) ensemble.fit(X_train, y_train) pred = ensemble.predict(X_test) print('fscore:{0:.3f}'.format(f1_score(y_test, pred, average='micro'))) #meta classifier ensemble stack = StackingCVClassifier(classifiers=[mlp, xgb, rf], cv=2, meta_classifier=lr, use_probas=True) stack.fit(X_train.values, y_train.values) pred2 = stack.predict(X_test.values) print('fscore:{0:.3f}'.format(f1_score(y_test, pred2, average='micro'))) from sklearn.metrics import confusion_matrix confusion_lr = confusion_matrix(y_test, pred) print(confusion_lr)
#for each outcome file, get training and test data to build a model and predict outcomes for o in outcome_list: info,outcome=loadData('Outcomes' + '/' + o +'.txt') #split data into training and test datasets train, test, labels_train, labels_test = train_test_split(info, outcome, test_size=0.33) counter = CountVectorizer() counter.fit(train) #count the number of times each term appears in a document and transform each doc into a count vector counts_train = counter.transform(train)#transform the training data counts_test = counter.transform(test)#transform the testing data #build a classifier on the training data using LR and NB clf1 = LogisticRegression() clf2 = MultinomialNB() #build a voting classifer - give logistic regression twice as much weight eclf = VotingClassifier(estimators=[('lr', clf1), ('mnb', clf2)], voting='soft', weights = [2,1]) #train all classifier on the same datasets eclf.fit(counts_train,labels_train) #use hard voting to predict (majority voting) predicted=eclf.predict(counts_test) #print the accuracy print 'Accuracy of', o, 'prediction: ', accuracy_score(predicted,labels_test)
# Naive Bayes: fit on dense IMDB vectors, score its test predictions,
# and report the earlier UCI cross-validation mean (UCI_scores from above).
bn_pred = bnb.fit(Imdb_train_vectors.toarray(),
                  Imdb_train_labels).predict(Imdb_test_vectors.toarray())
print "Naive Bayes F1 Score on IMDB dataset: ", f1_score(Imdb_test_labels, bn_pred, average='macro')
print "Naive Bayes F1 Score on UCI dataset: ", UCI_scores.mean()
print "\n"
#print "Classification Report for Naive Bayes on IMDB dataset:\n", classification_report(Imdb_test_labels, y_pred)
###################### Voting Classifier ######################
# Equal-weight hard vote over NB, logistic regression, linear SVC and MLP.
voting_clf = VotingClassifier(estimators=[('nb', bnb), ('lg1', logistic_reg),
                                          ('svc', classifier_liblinear),
                                          ('mlp', ML_perceptron)],
                              voting='hard', weights=[1, 1, 1, 1])
# voting_clf = VotingClassifier(estimators=[('nb', bnb), ('lg1', logistic_reg), ('svc', classifier_liblinear)],
#                               voting='hard', weights=[1,1,1])
# 10-fold macro-F1 on the UCI set; then fit/evaluate on IMDB.
UCI_scores = cross_val_score(voting_clf, UCI_train_vectors, UCI_train_labels,
                             cv=10, scoring='f1_macro')
voting_clf.fit(Imdb_train_vectors, Imdb_train_labels)
voting_clf_pred = voting_clf.predict(Imdb_test_vectors)
print "Voting Classifier F1 Score on IMDB dataset: ", f1_score(
    Imdb_test_labels, voting_clf_pred, average='macro')
print "Voting Classifier F1 Score on UCI dataset: ", UCI_scores.mean()
# NOTE(review): the commented report below passes the estimator itself
# instead of voting_clf_pred -- fix before re-enabling.
#print "Classification Report for Voting Classifier on IMDB dataset:\n", classification_report(Imdb_test_labels, voting_clf)
def myclassify_practice_set(numfiers,xtrain,ytrain,xtltrain,xtltest,xtest,ytarget=None,testing=False,grids='ABCDEFGHI'):
    """Fit up to ``numfiers`` classifiers on (xtrain, ytrain) and predict xtest.

    Parameters
    ----------
    numfiers : int
        Number of classifier columns to produce.  Only the first 20 can be
        filled (one per configured classifier); any extra columns remain
        uninitialized ``np.empty`` values but are still post-processed, as
        in the original implementation.
    xtrain, ytrain :
        Training feature matrix and labels.
    xtltrain :
        Recording indices for the training rows (accepted but unused here —
        the original NOTE said it might not be needed).
    xtltest :
        Recording indices for xtest rows; forwarded to the windowing helpers.
    xtest :
        Test feature matrix; rows with NaN/Inf are removed first.
    ytarget : optional
        True labels for xtest, when available.
    testing : bool
        When True, the ``temp*`` helper variants (which take ``grids``) are
        used for window-mode finding and string conversion.
    grids : str
        Grid labels forwarded to the ``temp*`` helpers.

    Returns
    -------
    (predictionStringMat, targetStringMat, finalPredMat)
        Per-classifier prediction strings, per-classifier target strings,
        and the flattened integer predictions of every classifier.
    """
    # Strip NaN/Inf rows from the test set (and the matching entries of
    # xtltest / ytarget) before predicting.
    xtest, xtltest, ytarget = removeNanAndInf(xtest, xtltest, ytarget)

    # BUG FIX: record whether a target was supplied *before* np.ravel wraps
    # it in an ndarray.  The original "if ytarget != None" compared an array
    # to None elementwise, which raises "truth value of an array is
    # ambiguous" whenever more than one real label is present.
    have_target = ytarget is not None

    ytrain = np.ravel(ytrain)
    ytarget = np.ravel(ytarget)

    def _voting_ensemble():
        # Ensemble vote over five very different base classifiers.
        return VotingClassifier(estimators=[('svc', SVC()),
                                            ('rfc', RFC(bootstrap=False)),
                                            ('etc', ETC()),
                                            ('knn', neighbors.KNeighborsClassifier()),
                                            ('qda', quadda())])

    # One lazy factory per classifier, in the exact order of the original
    # if-chain, so only the first ``numfiers`` models are built and fitted.
    factories = [
        _voting_ensemble,
        lambda: BaggingClassifier(ETC(), bootstrap=False, bootstrap_features=False),
        lambda: ETC(),
        lambda: BaggingClassifier(ETC()),
        lambda: SVC(),
        lambda: quadda(),                   # quadratic discriminant analysis
        lambda: DTC(),
        lambda: neighbors.KNeighborsClassifier(),
        lambda: linda(),                    # linear discriminant analysis
        lambda: RFC(),
        lambda: BaggingClassifier(RFC(), bootstrap=False, bootstrap_features=False),
        lambda: BaggingClassifier(SVC(), bootstrap=False, bootstrap_features=False),
        lambda: RFC(bootstrap=False),
        lambda: GBC(),
        lambda: neighbors.KNeighborsClassifier(n_neighbors=10),
        lambda: neighbors.KNeighborsClassifier(n_neighbors=3),
        lambda: neighbors.KNeighborsClassifier(algorithm='ball_tree'),
        lambda: neighbors.KNeighborsClassifier(algorithm='kd_tree'),
        lambda: NearestCentroid(),
        lambda: ABC(),
    ]

    # Column i of predictionMat holds classifier i's raw per-row predictions.
    predictionMat = np.empty((xtest.shape[0], numfiers))
    for col, make_classifier in enumerate(factories[:numfiers]):
        model = make_classifier()
        model.fit(xtrain, ytrain)
        predictionMat[:, col] = model.predict(xtest)

    predictionStringMat = []
    finalPredMat = []
    targetStringMat = []
    targets1 = []
    predictions1 = []

    # Collapse each classifier's per-row predictions into per-window modes,
    # then into a prediction string; do the same for the target labels.
    for colCount in range(predictionMat.shape[1]):
        tempCol = predictionMat[:, colCount]
        if testing:
            modeCol = temppredWindowVecModeFinder(tempCol, xtltest, 4, grids, isPrint=0)
        else:
            modeCol = predWindowVecModeFinder(tempCol, xtltest, 4, isPrint=0)
        ytarg = predWindowVecModeFinder(ytarget, xtltest, 1, isPrint=0)
        if testing:
            modeStr = temppredVec2Str(modeCol, grids)
        else:
            modeStr = predVec2Str(modeCol)
        modeStrans = predVec2Str(ytarg)

        predictionStringMat.append(modeStr)
        predictions1.append(modeCol)
        finalPredMat += [int(v) for v in modeCol]  # works on Py2 and Py3
        targetStringMat.append(modeStrans)
        targets1.append(ytarg)

    if not testing and have_target:
        # Confusion matrix of the first classifier's windowed predictions;
        # kept for parity with the original, where it was only ever printed
        # (the prints are gone, but the call may validate inputs).
        confusionme = confusion_matrix(targets1[0], predictions1[0])

    return predictionStringMat, targetStringMat, finalPredMat
# Keep the tuned extra-trees model and report its grid-search result.
etc_best = etc_gs.best_estimator_
print('etc: ', etc_gs.best_params_, etc_gs.best_score_)

# Soft-voting ensemble over every tuned base model.
from sklearn.ensemble import VotingClassifier

base_models = [
    ('rf', rf_best),
    ('knc', knc_best),
    ('log_reg', log_reg),
    ('etc', etc_best),
    ('gbc', gbc_best),
    ('SVC', svc_best),
    ('ADC', adc_best),
    ('xgb', xgb_best),
]
ensemble = VotingClassifier(base_models, voting='soft')

# Fit on the training split and report accuracy on the test split.
ensemble.fit(X_train, y_train)
print(ensemble.score(X_test, y_test))

# Predict on the cleaned test features and write the submission file.
test_features = test_cleaned.loc[:, 'CabinA':].values
prediction = pd.DataFrame(ensemble.predict(test_features))
submission = pd.DataFrame()
submission['Survived'] = prediction[0]
submission['PassengerId'] = test['PassengerId']
submission.to_csv('submission.csv', index=False)