def test_string_labels_refit_false():
    """Check hard and soft voting on string labels with pre-fitted estimators.

    The three base estimators are fitted on string targets before being
    passed to ensembles built with ``refit=False``; both voting modes are
    expected to score 0.97 (rounded to two decimals) on the training data.
    """
    np.random.seed(123)
    base_estimators = [
        LogisticRegression(solver='liblinear', multi_class='ovr'),
        RandomForestClassifier(n_estimators=10),
        GaussianNB(),
    ]

    # Re-encode the targets as strings: one letter per block of 50 rows.
    y_str = y.copy().astype(str)
    for start, letter in zip((0, 50, 100), 'abc'):
        y_str[start:start + 50] = letter

    # Fit in the original order so the seeded random stream is consumed
    # identically.
    for estimator in base_estimators:
        estimator.fit(X, y_str)

    for voting_mode in ('hard', 'soft'):
        eclf = EnsembleVoteClassifier(clfs=list(base_estimators),
                                      voting=voting_mode,
                                      refit=False)
        eclf.fit(X, y_str)
        assert round(eclf.score(X, y_str), 2) == 0.97
def votingEnsembleTest2ndLayer_Test(top_ensembles_dict, test_country_data):
    """Evaluate a second-layer voting ensemble for every Mentality Cycle.

    For each key ``BC`` of ``top_ensembles_dict`` the second element of
    every entry in ``top_ensembles_dict[BC]`` is used as a classifier of an
    equally weighted ``EnsembleVoteClassifier`` (``refit=False``). The
    ensemble is evaluated on ``test_country_data[BC]["X"]`` / ``["Y"]`` and
    its accuracy is printed, followed by the accuracy aggregated over all
    evaluated cycles. Nothing is returned.

    The aggregate denominator is the number of observations of the cycles
    that were actually evaluated, rather than of the hard-coded keys 1, 2
    and 3.
    """
    hit_count = 0
    total_obvs = 0
    for BC in top_ensembles_dict:
        classifiers = [sub_list[1] for sub_list in top_ensembles_dict[BC]]
        _weights = np.asarray([1] * len(classifiers))
        vclf_layer2 = EnsembleVoteClassifier(clfs=classifiers,
                                             weights=_weights,
                                             refit=False)
        Y = test_country_data[BC]["Y"]
        X = test_country_data[BC]["X"]
        vclf_layer2.fit(X, Y)
        y_estimate = vclf_layer2.predict(X)
        hits = Y == y_estimate
        print(
            "Mentality Cycle {} 2nd Layer Voting Classifier Ensemble has accuracy: {}"
            .format(BC, np.mean(hits)))
        hit_count = hit_count + np.sum(hits)
        total_obvs = total_obvs + Y.shape[0]

    # calc overall performance of the voting classifiers across all regions
    if total_obvs:
        # float() keeps this a true division under Python 2 as well.
        overall_hit_rate = float(hit_count) / total_obvs
    else:
        # Nothing was evaluated, so there is no meaningful accuracy.
        overall_hit_rate = float('nan')
    print("Aggregated accuracy of 2nd Layer Voting Classifiers is: {}".format(
        overall_hit_rate))
def test_no_weight_support_with_no_weight():
    """Fit a hard-voting ensemble that includes KNN without sample weights.

    The test passes as long as ``fit`` completes without raising.
    """
    members = [
        LogisticRegression(),
        RandomForestClassifier(),
        GaussianNB(),
        KNeighborsClassifier(),
    ]
    ensemble = EnsembleVoteClassifier(clfs=members, voting='hard')
    ensemble.fit(X, y)
def test_no_weight_support_with_no_weight():
    """Fitting without ``sample_weight`` must work even with a KNN member.

    Estimator settings are pinned (liblinear/ovr, 10 trees) as in the
    original; success is simply ``fit`` returning without an exception.
    """
    logistic = LogisticRegression(solver='liblinear', multi_class='ovr')
    forest = RandomForestClassifier(n_estimators=10)
    naive_bayes = GaussianNB()
    neighbours = KNeighborsClassifier()

    voters = [logistic, forest, naive_bayes, neighbours]
    EnsembleVoteClassifier(clfs=voters, voting='hard').fit(X, y)
def emsembal_train(feature, label):
    """Train a soft-voting ensemble (SVC, logistic regression, XGBoost).

    ``label`` is first converted with ``transport_labels``; the data is
    split 80/20 with a fixed seed, the ensemble is fitted on the training
    part, and its accuracy on the held-out part is printed. Nothing is
    returned.

    :param feature: feature matrix accepted by ``train_test_split``
    :param label: raw labels, passed through ``transport_labels``
    """
    from sklearn.linear_model import LogisticRegression
    from mlxtend.classifier import EnsembleVoteClassifier

    label = transport_labels(label)
    X_train, X_test, Y_train, Y_test = train_test_split(feature,
                                                        label,
                                                        test_size=0.2,
                                                        random_state=1000)
    clf1 = SVC(C=10, kernel='sigmoid', probability=True)
    clf3 = LogisticRegression(random_state=0)
    clf4 = xgb.XGBClassifier(max_depth=8,
                             learning_rate=0.07,
                             n_estimators=35,
                             silent=True,
                             objective="binary:logistic",
                             booster='gbtree',
                             gamma=0,
                             min_child_weight=6,
                             subsample=0.8,
                             colsample_bytree=0.7,
                             reg_alpha=0.1,
                             seed=1000)
    eclf = EnsembleVoteClassifier(clfs=[clf1, clf3, clf4], voting='soft')
    eclf.fit(X_train, Y_train)
    y_pred = eclf.predict(X_test)

    # Pair predictions with targets positionally. Indexing ``Y_test[i]``
    # would be a *label* lookup on a pandas Series, whose index is shuffled
    # by train_test_split.
    hits = sum(1 for predicted, actual in zip(y_pred, Y_test)
               if predicted == actual)
    print('eclf accs=%f' % (hits / float(len(y_pred))))
def tri_train(domain,X_train,y_train,X_test,y_test,X_un,theta=0.5,dis=False):
    # Tri-training style loop (Python 2 code):
    #   1. train three classifiers on bootstrap samples of the labelled data;
    #   2. for every pair (j, k) of them, grow the training set of the third
    #      classifier with rows returned by get_features() and keep the
    #      retrained classifier only while its accuracy improves;
    #   3. combine the three classifiers in an equally weighted voting ensemble.
    # Returns (acc, eclf): the ensemble's score on (X_test, y_test) and the
    # ensemble itself. The score is macro-F1 when "large" is in `domain`,
    # plain accuracy otherwise.
    # theta and dis are only forwarded to get_features().
    models = list()
    accs = list()
    # Step 1: three base classifiers, each from its own bootstrap sample.
    for i in range(3):
        X_split,y_split = bootstrap_sample(X_train,y_train)
        acc,clf_func = get_acc_clf(domain,X_split,y_split,X_test,y_test)
        models.append(clf_func)
        accs.append(acc)

    # Step 2.
    # NOTE(review): combinations() snapshots `models` when it is created, while
    # the loop body replaces entries of `models`. After a replacement, j/k may
    # be stale objects and the "remaining classifier" lookup below may no
    # longer pick the intended one - confirm this is acceptable.
    for (j,k) in itertools.combinations(models,2):
        # i_features = list()
        unlabelled_features = np.array(X_un)
        total = len(X_train)+len(X_un)
        t = 0
        count = 0
        X_i = X_train
        y_i = y_train
        # find current classifier
        clf_i = [x for x in models if x!=j and x!=k][0]
        index_i = models.index(clf_i)
        print "***classifier %d***"%index_i
        while count < total and len(unlabelled_features)!=0:
            t += 1
            # presumably returns unlabelled rows plus labels proposed by j and k
            # - verify against get_features()
            X_tgt,y_tgt = get_features(unlabelled_features,j,k,clf_i,models,theta=theta,dis=dis)
            if len(X_tgt)==0 and t>1:
                print "no new features added"
                break
            X_i = concatenate(X_i,X_tgt)
            y_i = concatenate(y_i,y_tgt)
            count = len(X_i)
            print "%d %d %d"%(t,count,total)
            # clf_i.fit(X_i,y_i)
            # update classifier
            acc,clf_i = get_acc_clf(domain,X_i,y_i,X_test,y_test)
            if accs[index_i]<acc:
                accs[index_i] = acc
                # best_clf = clf_i
                print "*NEW BEST! best acc:", acc
                models[index_i] = clf_i
            else:
                print "no improvement..skip.."
                break
            if count == total:
                print "reach end.."
                break
            # update the unlabelled features for speed-up
            print np.array(X_tgt).shape
            X_tgt = [list(x) for x in X_tgt]
            # drop the rows that were just added from the unlabelled pool
            unlabelled_features =[x for x in unlabelled_features if list(x) not in X_tgt]
            print np.array(unlabelled_features).shape
    # majority vote classifiers
    eclf = EnsembleVoteClassifier(clfs=models,weights=[1,1,1],refit=False)
    # NOTE(review): with refit=False the base classifiers are presumably left
    # untouched here, but fit() is still called before predict() - confirm
    # whether mlxtend requires it.
    eclf.fit(X_test,y_test) # this line is not doing work
    # tmp_name = domain.upper()[0] if "large" not in domain else "large/"+domain.upper()[6]
    pred = eclf.predict(X_test)
    acc = accuracy_score(y_test,pred) if "large" not in domain else f1_score(y_test,pred,average='macro')
    print "acc:%s theta:%s"%(acc,theta),"seprate accs:",accs
    return acc,eclf
def make_model(self):
    """Train five base models, combine them by voting and save predictions.

    The base models are fitted on the module-level ``train_feats2`` /
    ``target``; the voting ensemble is built with ``refit=False`` and
    weights ``[1, 2, 1, 1, 1]`` (logistic regression counts double).
    Predictions for ``test_feats2`` are mapped through
    ``functions.to_labels`` and written to a CSV file.

    :return: the fitted ``EnsembleVoteClassifier``
    """
    # ----------------------------------------------------------------------
    # TREE BASED ALGORITHMS
    # ----------------------------------------------------------------------
    # random_state pins the random choices of features/samples made while
    # growing the trees; n_estimators is the number of trees / boosting
    # stages.
    model_rf = RandomForestClassifier(n_estimators=145,
                                      random_state=10,
                                      n_jobs=-1)
    model_rf.fit(train_feats2, target)

    # GradientBoostingClassifier and AdaBoostClassifier have no ``n_jobs``
    # parameter; passing it raises TypeError at construction time.
    model_gb = GradientBoostingClassifier(n_estimators=145, random_state=11)
    model_gb.fit(train_feats2, target)

    model_ab = AdaBoostClassifier(n_estimators=145, random_state=12)
    model_ab.fit(train_feats2, target)

    # ----------------------------------------------------------------------
    # LOGISTIC REGRESSION
    # ----------------------------------------------------------------------
    model_lr = LogisticRegression(random_state=1)
    model_lr.fit(train_feats2, target)

    # ----------------------------------------------------------------------
    # NAIVE BAYES
    # ----------------------------------------------------------------------
    model_nb = MultinomialNB()
    model_nb.fit(train_feats2, target)

    # ----------------------------------------------------------------------
    # VOTING ENSEMBLE OF ALL MODELS
    # ----------------------------------------------------------------------
    clf = [model_rf, model_lr, model_gb, model_ab, model_nb]
    # weights can be decided by stacking!!
    eclf = EnsembleVoteClassifier(clfs=clf,
                                  weights=[1, 2, 1, 1, 1],
                                  refit=False)
    eclf.fit(train_feats2, target)
    print("model created")

    preds = eclf.predict(test_feats2)
    sub3 = pd.DataFrame({'User_ID': test_df.User_ID, 'Is_Response': preds})
    sub3['Is_Response'] = sub3['Is_Response'].map(
        lambda x: functions.to_labels(self, x))
    sub3 = sub3[['User_ID', 'Is_Response']]
    sub3.to_csv('D:\\New folder\\f2c2f440-8-dataset_he\\SUB_TEST.csv',
                index=False)
    print("prediction saved")
    return eclf
def majority_vote(target): X_test = load_obj("%s/X_test"%target) y_test = load_obj("%s/y_test"%target) domains = [] if "mlp" in target: domains = ["mlp/books","mlp/dvd","mlp/electronics","mlp/kitchen"] else: if "large" not in target: domains = ["books","dvd","electronics","kitchen"] if target not in domains: return else: domains =["large/baby","large/cell_phone","large/imdb","large/yelp2014"] models = [] for source in domains: if target == source: continue else: print source clf_func = load_obj("%s/self_clf"%source) models.append(clf_func) eclf = EnsembleVoteClassifier(clfs=models,refit=False)#weights=[1,1,1], eclf.fit(X_test,y_test) # this line is not doing work tmp_name = target.upper()[0] if "large" not in target else "large/"+target.upper()[6] tmp_name = target.upper()[0] if "mlp" not in target else "mlp/"+target.upper()[4] save_obj(eclf, '%s_eclf'%(tmp_name)) pred = eclf.predict(X_test) acc = accuracy_score(y_test,pred) if "large" not in target else f1_score(y_test,pred,average='macro') print 'self-train',acc pass
def __init__(self, X, y, x_test, model_lists):
    """Wrap already-trained models in an equally weighted soft-voting ensemble.

    :param X: training features, stored as ``self.X``
    :param y: training targets, stored as ``self.y``
    :param x_test: features to predict on, stored as ``self.X_test``
    :param model_lists: sequence of classifiers handed to the ensemble
        (``refit=False``)
    """
    # One unit weight per model. The former hard-coded [1, 1, 1] only
    # matched ensembles of exactly three models.
    self.model = EnsembleVoteClassifier(clfs=model_lists,
                                        weights=[1] * len(model_lists),
                                        refit=False,
                                        voting='soft')
    self.X = X
    self.y = y
    self.X_test = x_test
def train_knn_model(assts, n_macroepochs=100, n_epochs=10):
    """Fit a PCA projection and a voting ensemble of per-batch SVC models.

    Despite the name, the voters are ``SVC`` instances: one is trained on
    each PCA-projected batch yielded by ``xy_generator``.

    :param assts: passed straight to ``xy_generator``
    :param n_macroepochs: unused; kept for interface compatibility
    :param n_epochs: unused; kept for interface compatibility
    :return: ``(model, pca, None, None)``
    """
    # We start by fitting PCA on a single batch drawn from the generator.
    sgen = xy_generator(assts, batch_size=5000)
    pca = PCA(n_components=48)
    for _, X, _, _, _, _, _ in sgen:
        print("fitting PCA...")
        X = numpy.array(X, dtype=numpy.int8)
        pca.fit(X)
        # Half-loop: only one sample from sgen is needed. The stray exit()
        # that used to sit here would have ended the interpreter before any
        # classifier was trained.
        break
    del sgen
    print("fitted")
    gc.collect()

    # Fresh generator: train one voter per batch in PCA space.
    xygen = xy_generator(assts, batch_size=5000)
    clfs = []
    for _, X, y, _, _, _, _ in xygen:
        X = numpy.array(X, dtype=numpy.int8)
        y = numpy.array(y)
        X = pca.transform(X)
        voter = SVC()
        voter.fit(X, y)
        clfs.append(voter)

    model = EnsembleVoteClassifier(clfs=clfs, refit=False)
    # Placeholder fit: one all-zero row per known class label.
    # NOTE(review): presumably only needed so the ensemble learns the label
    # set (refit=False leaves the voters untouched) - confirm.
    X_for_classes = [numpy.zeros(256) for _ in all_page_ids]
    y_for_classes = list(all_page_ids)
    model.fit(X_for_classes, y_for_classes)
    return model, pca, None, None  # , sscaler, levscaler, volscaler
def test_no_weight_support():
    """Fit an ensemble containing KNN while passing ``sample_weight``.

    NOTE(review): KNeighborsClassifier.fit takes no ``sample_weight``, so
    this call presumably raises; the expectation is not asserted in the
    visible body - confirm a decorator or wrapper handles it.
    """
    random.seed(87)
    sample_weights = np.array([random.random() for _ in range(len(y))])

    voters = [
        LogisticRegression(solver='liblinear', multi_class='ovr'),
        RandomForestClassifier(n_estimators=10),
        GaussianNB(),
        KNeighborsClassifier(),
    ]
    ensemble = EnsembleVoteClassifier(clfs=voters, voting='hard')
    ensemble.fit(X, y, sample_weight=sample_weights)
def _fit_wmv(self):
    """Build and fit the Weighted Majority Voting (WMV) model.

    The classifiers stored in ``self.bootstrap_models`` are combined with
    ``self.weights`` / ``self.voting`` in an ``EnsembleVoteClassifier``
    (``refit=False``), which is then fitted on the training split of
    ``self.dataset``.
    """
    # Merge the base learners and the produced models (extra).
    # Materialise the dict view: under Python 3 ``values()`` is a live,
    # non-indexable view that cannot be pickled and would silently follow
    # later changes to ``bootstrap_models``.
    models = list(self.bootstrap_models.values())
    # Define the Weighted Majority Voting model
    self.wmv_model = EnsembleVoteClassifier(clfs=models,
                                            weights=self.weights,
                                            voting=self.voting,
                                            refit=False)
    # Fit the WMV model
    self.wmv_model.fit(self.dataset.X_train, self.dataset.y_train)
def ensemble(self, folds_limit=42):
    """Repeatedly fit the voting pipeline on fresh folds and report hits.

    For ``folds_limit`` iterations the NumPy RNG is seeded with
    ``42 + iteration``, a fold is drawn with ``self.get_fold``, the
    single-member soft-voting pipeline is fitted and the number of correct
    test predictions is printed. Running mean/std/best are printed every
    fifth iteration and once at the end. Nothing is returned.

    :param folds_limit: number of folds to evaluate
    """
    answers = []
    # default 0.6742 on seed=42 for full set (search_best_3)
    clf1 = ExtraTreesClassifier(max_features=0.4537270875668709,
                                criterion='entropy',
                                min_samples_leaf=1,
                                min_samples_split=2,
                                n_estimators=3138,
                                n_jobs=self.cpu)
    self.pipeline = EnsembleVoteClassifier(clfs=[clf1],
                                           weights=[1],
                                           voting='soft')
    for iteration in range(folds_limit):
        np.random.seed(42 + iteration)
        x_train, y_train, x_test, y_test = self.get_fold(
            self.default_columns)
        self.pipeline.fit(x_train, y_train)
        preds = self.pipeline.predict(x_test)
        matrix_ = confusion_matrix(y_test, preds)
        # Correct predictions are the diagonal of the confusion matrix.
        # The trace works for any class count; the previous explicit sum of
        # five cells raised IndexError when a fold had fewer classes.
        correct_answers = np.trace(matrix_)
        print(' Correct answers count: ', correct_answers,
              ' [it: %s]' % iteration)
        answers.append(int(correct_answers))
        if iteration % 5 == 0 and iteration > 0:
            print('Params: mean: %s std: %s best: %s' %
                  (np.mean(answers), np.std(answers), max(answers)))
    # max() of an empty list raises, so skip the summary when no fold ran.
    if answers:
        print('Params: mean: %s std: %s best: %s' %
              (np.mean(answers), np.std(answers), max(answers)))
def test_sample_weight():
    """Unit sample weights must match no weights; random weights must differ."""

    def seeded_ensemble():
        # Re-seed before building so every variant starts from the same
        # NumPy random state.
        np.random.seed(123)
        return EnsembleVoteClassifier(
            clfs=[
                LogisticRegression(solver='liblinear', multi_class='ovr'),
                RandomForestClassifier(n_estimators=10),
                GaussianNB(),
            ],
            voting='hard')

    # with no weight
    prob1 = seeded_ensemble().fit(X, y).predict_proba(X)

    # with weight = 1
    w = np.ones(len(y))
    prob2 = seeded_ensemble().fit(X, y, sample_weight=w).predict_proba(X)

    # with random weight
    random.seed(87)
    w = np.array([random.random() for _ in range(len(y))])
    prob3 = seeded_ensemble().fit(X, y, sample_weight=w).predict_proba(X)

    diff12 = np.max(np.abs(prob1 - prob2))
    diff23 = np.max(np.abs(prob2 - prob3))
    assert diff12 < 1e-3, "max diff is %.4f" % diff12
    assert diff23 > 1e-3, "max diff is %.4f" % diff23
def test_1model_probas():
    """A one-member soft-voting ensemble must reproduce the member's probabilities.

    Checked for ``weights=None`` and ``weights=[1.]`` against the bare
    classifier, to 8 decimals.
    """
    clf = LogisticRegression(multi_class='multinomial',
                             solver='newton-cg',
                             random_state=123)
    unweighted = EnsembleVoteClassifier(clfs=[clf],
                                        voting='soft',
                                        weights=None)
    unit_weighted = EnsembleVoteClassifier(clfs=[clf],
                                           voting='soft',
                                           weights=[1.])

    pred_unweighted = unweighted.fit(X, y).predict_proba(X)
    pred_unit_weighted = unit_weighted.fit(X, y).predict_proba(X)
    pred_plain = clf.fit(X, y).predict_proba(X)

    np.testing.assert_almost_equal(pred_unweighted,
                                   pred_unit_weighted,
                                   decimal=8)
    np.testing.assert_almost_equal(pred_unweighted, pred_plain, decimal=8)
def test_get_params():
    """``get_params`` must expose the ensemble's own and its members' parameters."""
    members = [
        KNeighborsClassifier(n_neighbors=1),
        RandomForestClassifier(random_state=1, n_estimators=10),
        GaussianNB(),
    ]
    eclf = EnsembleVoteClassifier(clfs=members)

    # Keep only the part before the first '__' of each parameter key.
    prefixes = {key.split('__')[0] for key in eclf.get_params()}
    got = sorted(prefixes)
    expect = [
        'clfs',
        'gaussiannb',
        'kneighborsclassifier',
        'randomforestclassifier',
        'refit',
        'verbose',
        'voting',
        'weights',
    ]
    assert got == expect, got
def majority_vote_mlp(target): X_test = load_obj("%s/X_test"%target) y_test = load_obj("%s/y_test"%target) # domains = ["mlp/books","mlp/dvd","mlp/electronics","mlp/kitchen"] data_name = ["books", "dvd", "electronics", "kitchen"] X_joint = load_obj("%s/X_joint"%target) y_joint = load_obj("%s/y_joint"%target) temp_un = load_obj("%s/X_un"%target) meta_sources = [] for i in range(len(data_name)): if 'mlp/'+data_name[i] != target: meta_sources.append(data_name[i]) # print meta_sources models = [] for j in range(len(meta_sources)): temp_X = X_joint[j] temp_y = y_joint[j] thetas = [0.5,0.6,0.7,0.8,0.9] best_acc = 0.0 best_clf ="" best_theta = 0.0 resFile = open("../work/params/%s_theta_self-%s.csv"%(target,meta_sources[j].upper()[0]),"w") resFile.write("theta, acc\n") for theta in thetas: print "##############################" print "start with theta=%s"%theta print "##############################" acc,clf_func = self_train(target,temp_X,temp_y,X_test,y_test,temp_un,theta=theta) if best_acc<acc: best_acc = acc best_clf = clf_func best_theta = theta resFile.write("%f, %f\n"%(theta,acc)) resFile.flush() resFile.close() print "##############################" print "best_theta:",best_theta,"best_acc:",best_acc models.append(best_clf) eclf = EnsembleVoteClassifier(clfs=models,refit=False)#weights=[1,1,1], eclf.fit(X_test,y_test) # this line is not doing work # tmp_name = target.upper()[0] if "large" not in target else "large/"+target.upper()[6] # tmp_name = 'mlp/'+target.upper()[4] save_obj(eclf, "%s/self_clf"%target) pred = eclf.predict(X_test) # print pred acc = accuracy_score(y_test,pred) print 'self-train',acc pass
def test_string_labels_refit_false():
    """String targets with default-configured, pre-fitted base estimators.

    Both a hard- and a soft-voting ensemble built with ``refit=False`` are
    expected to score 0.97 (rounded) on the training data.
    """
    np.random.seed(123)
    logistic = LogisticRegression()
    forest = RandomForestClassifier()
    naive_bayes = GaussianNB()

    # Three string classes, 50 consecutive rows each.
    y_str = y.copy().astype(str)
    y_str[:50], y_str[50:100], y_str[100:150] = 'a', 'b', 'c'

    # Same fitting order as the original so the seeded RNG is consumed
    # identically.
    logistic.fit(X, y_str)
    forest.fit(X, y_str)
    naive_bayes.fit(X, y_str)

    def rounded_score(voting):
        eclf = EnsembleVoteClassifier(clfs=[logistic, forest, naive_bayes],
                                      voting=voting,
                                      refit=False)
        eclf.fit(X, y_str)
        return round(eclf.score(X, y_str), 2)

    assert rounded_score('hard') == 0.97
    assert rounded_score('soft') == 0.97
def votingEnsembleTest(all_country_data_with_algos, test_country_data_US):
    """Score per-country voting classifiers on the US test data.

    For every country and every Mentality Cycle ``BC`` of a deep copy of
    ``all_country_data_with_algos``, the classifiers under
    ``'trained algos'`` form an equally weighted hard-voting ensemble
    (``refit=False``) that is evaluated on ``test_country_data_US[BC]``.
    The accuracy and the ensemble are stored under ``'accuracy'`` and
    ``'votingclassifier'`` of that country/cycle entry, and a country-level
    ``'accuracy'`` aggregates the hits over its cycles.

    The weight vector follows the number of classifiers, and the aggregate
    denominator counts the cycles actually evaluated; both used to be
    hard-coded to three.

    :return: the annotated deep copy; the input is not modified
    """
    print(
        " \n For each training set country for each sub dataset (split by "
        "Mentality Cycle): the top n trained algorithms form a Voting "
        "Classifiers. This Voting Classifiers is then tested on its "
        "corresponding US sub data set. An aggregate scocre for each trainging "
        "set country is calculated through an Aggregation of its 3 Voting "
        "Classifiers' performances"
    )
    _all_country_data_with_trained_algos = copy.deepcopy(
        all_country_data_with_algos)
    for country in list(_all_country_data_with_trained_algos.keys()):
        country_data = _all_country_data_with_trained_algos[country]
        country_level_total_hits = 0
        record_count = 0
        # Snapshot the keys: a country-level 'accuracy' entry is added to
        # this dict once the cycles are done.
        for BC in list(country_data.keys()):
            classifiers = copy.deepcopy(country_data[BC].get('trained algos'))
            clf_weights = np.asarray([1] * len(classifiers), dtype=int)
            Y = test_country_data_US[BC].get("Y")
            X = test_country_data_US[BC].get("X")
            vclf = EnsembleVoteClassifier(clfs=classifiers,
                                          weights=clf_weights,
                                          refit=False,
                                          voting='hard')  # voting='soft'
            vclf.fit(X, Y)
            y_estimate = vclf.predict(np.array(X))
            # Compare positionally on plain arrays so the printed and the
            # stored accuracy cannot diverge through pandas index alignment.
            hits = np.asarray(Y).ravel() == np.asarray(y_estimate).ravel()
            accuracy = np.mean(hits)
            print(
                "Voting Classifier trained on {} Mentality Cycle {} has accuracy: {}"
                .format(country, BC, accuracy))
            # saving Country-BC split accuracy and the Voting Classifier
            # instance to the all_country... dictionary
            country_data[BC]['accuracy'] = accuracy
            country_data[BC]['votingclassifier'] = vclf
            country_level_total_hits = country_level_total_hits + np.sum(hits)
            record_count = record_count + hits.shape[0]
        if record_count:
            # float() keeps this a true division under Python 2 as well.
            country_data['accuracy'] = (float(country_level_total_hits) /
                                        record_count)
        else:
            country_data['accuracy'] = float('nan')
        print("Aggregated Classifier trained on {} has accuracy: {} \n".format(
            country, country_data['accuracy']))
    return _all_country_data_with_trained_algos
def test_get_params():
    """Top-level parameter names reported by ``get_params`` must match exactly."""
    knn = KNeighborsClassifier(n_neighbors=1)
    forest = RandomForestClassifier(random_state=1)
    naive_bayes = GaussianNB()
    eclf = EnsembleVoteClassifier(clfs=[knn, forest, naive_bayes])

    # Collapse nested keys ("name__param") to their leading component.
    leading_names = set()
    for key in eclf.get_params().keys():
        leading_names.add(key.split('__')[0])
    got = sorted(leading_names)

    expect = ['clfs', 'gaussiannb', 'kneighborsclassifier',
              'randomforestclassifier', 'refit', 'verbose', 'voting',
              'weights']
    assert got == expect, got
def test4():
    """Example 2 - grid search over the members of a soft-voting ensemble.

    Prints mean test score, half the std and the parameter set for every
    grid point.
    """
    from sklearn.model_selection import GridSearchCV
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.ensemble import RandomForestClassifier
    from mlxtend.classifier import EnsembleVoteClassifier

    clf1 = LogisticRegression(random_state=1)
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()
    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], voting='soft')

    params = {
        'logisticregression__C': [1.0, 100.0],
        'randomforestclassifier__n_estimators': [20, 200],
    }
    grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5)
    grid.fit(iris.data, iris.target)

    mean_key, std_key, params_key = ('mean_test_score', 'std_test_score',
                                     'params')
    n_candidates = len(grid.cv_results_[mean_key])
    for row in range(n_candidates):
        mean_score = grid.cv_results_[mean_key][row]
        half_std = grid.cv_results_[std_key][row] / 2.0
        candidate = grid.cv_results_[params_key][row]
        print("%0.3f +/- %0.2f %r" % (mean_score, half_std, candidate))
def test5():
    """Grid search over an ensemble that contains the same estimator twice."""
    from sklearn.model_selection import GridSearchCV
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.ensemble import RandomForestClassifier
    from mlxtend.classifier import EnsembleVoteClassifier

    logistic = LogisticRegression(random_state=1)
    forest = RandomForestClassifier(random_state=1)
    eclf = EnsembleVoteClassifier(clfs=[logistic, logistic, forest],
                                  voting='soft')

    # If the EnsembleClassifier is initialized with multiple similar
    # estimator objects, the estimator names are modified with consecutive
    # integer indices, for example:
    params = {
        'logisticregression-1__C': [1.0, 100.0],
        'logisticregression-2__C': [1.0, 100.0],
        'randomforestclassifier__n_estimators': [20, 200],
    }
    search = GridSearchCV(estimator=eclf, param_grid=params, cv=5)
    grid = search.fit(iris.data, iris.target)

    cv_keys = ('mean_test_score', 'std_test_score', 'params')
    row = 0
    for _ in grid.cv_results_['mean_test_score']:
        print("%0.3f +/- %0.2f %r" % (grid.cv_results_[cv_keys[0]][row],
                                      grid.cv_results_[cv_keys[1]][row] / 2.0,
                                      grid.cv_results_[cv_keys[2]][row]))
        row += 1
def test_fit_base_estimators_false():
    """Pre-fitted members with ``fit_base_estimators=False`` must score 0.97."""
    np.random.seed(123)
    members = [
        LogisticRegression(solver='liblinear', multi_class='ovr'),
        RandomForestClassifier(n_estimators=10),
        GaussianNB(),
    ]
    # Fit every member up front, in declaration order.
    for member in members:
        member.fit(X, y)

    eclf = EnsembleVoteClassifier(clfs=members,
                                  voting='hard',
                                  fit_base_estimators=False)
    eclf.fit(X, y)
    accuracy = round(eclf.score(X, y), 2)
    assert accuracy == 0.97
def ensembleClassifier(X_train, X_test, y_train, y_test, X_1_df, Y_1_df):
    """Fit four classifiers and their voting ensemble; print 20-fold CV accuracy.

    Only ``X_1_df`` and ``Y_1_df`` are used; the train/test split arguments
    are accepted but not read. Returns None.
    """
    print('-----------------------------')
    print('Ensemble Vote Classifier was Called. Wait...')

    logistic = LogisticRegression(C=5.0,
                                  class_weight='balanced',
                                  max_iter=10000,
                                  random_state=1)  # C = 5.0
    linear_svm = SVC(kernel='linear', C=1.0, random_state=1)  # linear SVM C = 1.0
    nearest = KNeighborsClassifier(n_neighbors=1)  # optimum_k = 1
    tree = DecisionTreeClassifier(max_depth=23, criterion='gini')

    start = time.time()
    eclf = EnsembleVoteClassifier(clfs=[logistic, linear_svm, nearest, tree],
                                  weights=[1, 1, 1, 1])

    labelled_models = [
        ('Logistic Regression', logistic),
        ('Support Vector Machine', linear_svm),
        ('K Nearest Neighbor', nearest),
        ('Decision Tree', tree),
        ('Ensemble', eclf),
    ]
    for label, model in labelled_models:
        model.fit(X_1_df, Y_1_df)
        scores = cross_val_score(model,
                                 X_1_df,
                                 Y_1_df.values.ravel(),
                                 cv=20,
                                 scoring='accuracy')
        print("Accuracy: %0.2f (+/- %0.2f) [%s]" %
              (scores.mean(), scores.std(), label))

    elapsed = time.time() - start
    print("Running time %.3f" % elapsed)
    return
def test_use_clones():
    """``use_clones=True`` must leave the originals unfitted; ``False`` fits them."""
    np.random.seed(123)
    logistic = LogisticRegression(solver='liblinear', multi_class='ovr')
    forest = RandomForestClassifier(n_estimators=10)
    naive_bayes = GaussianNB()

    cloning_ensemble = EnsembleVoteClassifier(
        clfs=[logistic, forest, naive_bayes], use_clones=True)
    cloning_ensemble.fit(X, y)

    # The original forest was never fitted, so predicting must fail.
    not_fitted_message = ("This RandomForestClassifier instance is not fitted yet."
                          " Call 'fit' with appropriate arguments"
                          " before using this estimator.")
    assert_raises(exceptions.NotFittedError, not_fitted_message,
                  forest.predict, X)

    in_place_ensemble = EnsembleVoteClassifier(
        clfs=[logistic, forest, naive_bayes], use_clones=False)
    in_place_ensemble.fit(X, y)
    # Now the very same forest object predicts without raising.
    forest.predict(X)
def test_EnsembleVoteClassifier_gridsearch():
    """Grid search over ensemble members must give the expected mean scores.

    Reads the scores from ``grid_scores_`` on scikit-learn < 0.18 and from
    ``cv_results_`` otherwise.
    """
    members = [
        LogisticRegression(random_state=1),
        RandomForestClassifier(random_state=1),
        GaussianNB(),
    ]
    eclf = EnsembleVoteClassifier(clfs=members, voting='soft')

    params = {
        'logisticregression__C': [1.0, 100.0],
        'randomforestclassifier__n_estimators': [20, 200]
    }
    grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5)
    grid.fit(iris.data, iris.target)

    if Version(sklearn_version) < '0.18':
        raw_means = [entry[1] for entry in grid.grid_scores_]
    else:
        raw_means = grid.cv_results_['mean_test_score']
    mean_scores = [round(value, 2) for value in raw_means]

    assert mean_scores == [0.95, 0.96, 0.96, 0.95]
def test_EnsembleVoteClassifier_gridsearch():
    """Grid search over ensemble members on iris must give the expected means.

    ``iid=False`` is passed to ``GridSearchCV`` only on scikit-learn
    versions below 0.24.1.
    """
    logistic = LogisticRegression(solver='liblinear',
                                  multi_class='ovr',
                                  random_state=1)
    forest = RandomForestClassifier(random_state=1)
    naive_bayes = GaussianNB()
    eclf = EnsembleVoteClassifier(clfs=[logistic, forest, naive_bayes],
                                  voting='soft')

    params = {
        'logisticregression__C': [1.0, 100.0],
        'randomforestclassifier__n_estimators': [20, 200]
    }

    search_kwargs = {'estimator': eclf, 'param_grid': params, 'cv': 5}
    if Version(sklearn_version) < '0.24.1':
        search_kwargs['iid'] = False
    grid = GridSearchCV(**search_kwargs)

    X, y = iris_data()
    grid.fit(X, y)

    mean_scores = []
    for score in grid.cv_results_['mean_test_score']:
        mean_scores.append(round(score, 2))
    assert mean_scores == [0.95, 0.96, 0.96, 0.95]
class VotingModel:
    """Equally weighted soft-voting ensemble over already-trained models."""

    def __init__(self, X, y, x_test, model_lists):
        """Store the data and build the ensemble (``refit=False``).

        :param X: training features used by :meth:`train`
        :param y: training targets used by :meth:`train`
        :param x_test: features used by :meth:`predict` / :meth:`predict_proba`
        :param model_lists: sequence of classifiers handed to the ensemble
        """
        # One unit weight per model. The former hard-coded [1, 1, 1] only
        # matched ensembles of exactly three models.
        self.model = EnsembleVoteClassifier(clfs=model_lists,
                                            weights=[1] * len(model_lists),
                                            refit=False,
                                            voting='soft')
        self.X = X
        self.y = y
        self.X_test = x_test

    def train(self):
        """Fit the ensemble on the stored training data."""
        self.model.fit(self.X, self.y)

    def predict(self):
        """Return the ensemble's predictions for the stored test features."""
        return self.model.predict(self.X_test)

    def predict_proba(self):
        """Return the ensemble's class probabilities for the stored test features."""
        return self.model.predict_proba(self.X_test)
def test_clone():
    """An unfitted ensemble built with ``refit=False`` must be clonable."""
    members = [
        LogisticRegression(solver='liblinear', multi_class='ovr'),
        RandomForestClassifier(n_estimators=10),
        GaussianNB(),
    ]
    ensemble = EnsembleVoteClassifier(clfs=members,
                                      voting='hard',
                                      refit=False)
    clone(ensemble)
def test_clone():
    """``clone`` must accept an ensemble of default-configured estimators."""
    logistic = LogisticRegression()
    forest = RandomForestClassifier()
    naive_bayes = GaussianNB()

    voters = [logistic, forest, naive_bayes]
    clone(EnsembleVoteClassifier(clfs=voters, voting='hard', refit=False))
def fit(self, X, y):
    """
    Train all the models in the ensemble, then build the voting classifier.

    With ``self.parallel`` set, the models are fitted in a process pool and
    ``self.models`` is replaced by the objects returned from the workers;
    otherwise each model is fitted in turn. The fitted models are wrapped
    in an ``EnsembleVoteClassifier`` with ``refit=False``.

    :param X: Features values of trainset
    :param y: Target values of trainset
    :return: ---
    """
    # self._commit_models(X, y)
    if self.parallel:
        pool = multiprocessing.Pool(processes=None)
        try:
            f = partial(self._fit_single_model, X, y)
            self.models = pool.map(f, self.models)
        finally:
            # Always release the worker processes, even when a fit fails.
            pool.close()
            pool.join()
    else:
        # NOTE(review): the return value is ignored here but used in the
        # parallel branch - presumably _fit_single_model fits in place
        # *and* returns the model; confirm.
        for model in self.models:
            self._fit_single_model(X, y, model)

    self.votingClassifier = EnsembleVoteClassifier(clfs=self._get_models(),
                                                   voting=self.voting,
                                                   refit=False)
    self.votingClassifier.fit(X, y)
def fit(self, X, values):
    """Train one model per fold and bag the fold models by soft voting.

    For every K-fold split a clone of ``self.template_model`` is trained on
    the training part, and a ``KNeighborsRegressor`` is fitted on the
    validation part to predict whether the fold model was correct there.
    Both are appended to ``self.fold_models`` / ``self.fold_regressions``.

    NOTE(review): the two lists are appended to, not reset, so calling
    ``fit`` twice presumably accumulates models - confirm this is intended.
    """
    # hard prediction
    splitter = KFold(n_splits=self.n_folds)
    for fit_rows, holdout_rows in splitter.split(X):
        fit_X, fit_y = X[fit_rows], values[fit_rows]
        holdout_X, holdout_y = X[holdout_rows], values[holdout_rows]

        # retrains a brand new model for the fold
        fold_model = clone(self.template_model)
        fold_model.fit(fit_X, fit_y)

        # Regression target: True where the fold model predicted correctly.
        was_correct = fold_model.predict(holdout_X) == holdout_y
        fold_regressor = KNeighborsRegressor(weights=self.weights,
                                             n_neighbors=self.n_neighbors)
        fold_regressor.fit(holdout_X, was_correct)

        self.fold_regressions.append(fold_regressor)
        self.fold_models.append(fold_model)

    self.bagger = EnsembleVoteClassifier(self.fold_models,
                                         voting="soft",
                                         refit=False)
    self.bagger.fit(X, values)  # trivial fit
def test_EnsembleVoteClassifier():
    """5-fold CV accuracy of the hard-voting ensemble must round to 0.94."""
    np.random.seed(123)
    voters = [LogisticRegression(), RandomForestClassifier(), GaussianNB()]
    eclf = EnsembleVoteClassifier(clfs=voters, voting='hard')

    fold_scores = cross_val_score(eclf, X, y, cv=5, scoring='accuracy')
    scores_mean = round(fold_scores.mean(), 2)
    assert scores_mean == 0.94
# Majority voting with classifiers trained on different feature subsets # from sklearn.pipeline import Pipeline from mlxtend.feature_selection import SequentialFeatureSelector sfs1 = SequentialFeatureSelector(clf1, k_features=4, floating=False, scoring='accuracy', print_progress=False, cv=0) clf1_pipe = Pipeline([('sfs', sfs1), ('logreg', clf1)]) eclf = EnsembleVoteClassifier(clfs=[clf1_pipe, clf2, clf3], voting='soft') params = {'pipeline__sfs__k_features': [1, 2, 3], #'pipeline__logreg__C': [1,0, 100.0], 'randomforestclassifier__n_estimators': [20, 200]} grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5) grid.fit(iris.data, iris.target) for params, mean_score, scores in grid.grid_scores_: print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std()/ 2, params)) print grid.best_params_ eclf = eclf.set_params(**grid.best_params_) print eclf.fit(X, y).predict(X[[1, 51, 149]])