def test_predictproba_hardvoting():
    eclf = VotingClassifier(estimators=[('lr1', LogisticRegression()),
                                        ('lr2', LogisticRegression())],
                            voting='hard')
    msg = "predict_proba is not available when voting='hard'"
    assert_raise_message(AttributeError, msg, eclf.predict_proba, X)
})).tolist()))
newArr = []
for row in s:
    t = row.split(',')
    t = np.array(t)
    t = t.astype(float)
    newArr.append(t)
accelArr = np.array(newArr)
actionArr = np.array(dfs['Action'])

# Create classifiers
knn = KNeighborsClassifier(n_neighbors=3)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
eclf1 = VotingClassifier(estimators=[('rf', clf2), ('knn', knn)], voting='hard')
eclf1 = eclf1.fit(accelArr, actionArr)


def detection_callback(device, advertisement_data):
    """Asynch Callback

    Args:
        device : Bleak device object
        advertisement_data : Advertisement Data Read
    """
    global aidenBool
    global georgeBool
    try:
        if (device.address == aidenThingy or device.address
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   cv=3, n_iter=10, verbose=10)
random_search.fit(X_to_file, y_to_file[0])
print(random_search.best_params_)

clf4 = SVC(**random_search.best_params_, probability=True)
scores = cross_val_score(clf4, X_to_file, y, cv=9)
print(scores)
print(np.mean(scores))

from sklearn.ensemble import VotingClassifier
eclf = VotingClassifier(estimators=[('rf', clf1), ('kn', clf2),
                                    ('nb', clf3), ('svm', clf4)],
                        voting='soft')
scores = cross_val_score(eclf, X_to_file, y, cv=9)
print(scores)
print(np.mean(scores))
'''
clf_list = [
    RandomForestClassifier(n_estimators=100, min_samples_leaf=2, min_samples_split=6),
    #SVC(kernel='rbf', degree=2, gamma='auto'),
    KNeighborsClassifier(n_neighbors=10, p=4),
    #GaussianNB(),
    MultinomialNB(),
from sklearn.preprocessing import FunctionTransformer

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_classes, testing_classes = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    Nystroem(gamma=10.0, kernel="polynomial", n_components=10),
    make_union(
        VotingClassifier([("est", KNeighborsClassifier(n_neighbors=4, weights="distance"))]),
        FunctionTransformer(lambda X: X)),
    make_union(
        VotingClassifier([("est", ExtraTreesClassifier(criterion="entropy", max_features=1.0, n_estimators=500))]),
        FunctionTransformer(lambda X: X)),
    FeatureAgglomeration(affinity="precomputed", linkage="average"),
    GaussianNB())

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
# 2. Soft voting: probabilities
# 2. bagging
#    multiple datasets + the same ML model
# 3. boosting
#    train and predict several times
#    mistakes from the first round of training/prediction are corrected with weights
#    in the next round; retraining/prediction is repeated
# (a short sketch of bagging and boosting follows after this snippet)

cancer = load_breast_cancer()
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
print(df)

# individual models
lc_r = LogisticRegression(max_iter=10000)  # iter -> maximum number of iterations
knn_clf = KNeighborsClassifier(n_neighbors=4)

# voting classifier that combines the individual models
vo_clf = VotingClassifier(estimators=[("LR", lc_r), ("KNN", knn_clf)], voting="soft")

# split into training and test data
x_train, x_test, y_train, y_test = train_test_split(cancer.data, cancer.target,
                                                    test_size=0.2, random_state=11)

# train and predict
vo_clf.fit(x_train, y_train)
prediction = vo_clf.predict(x_test)

# accuracy
print("VotingClassifier accuracy:", accuracy_score(y_test, prediction))

# individual ML accuracy
models = [lc_r, knn_clf, vo_clf]
for m in models:
    m.fit(x_train, y_train)
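# A minimal, hedged sketch of the bagging and boosting ideas mentioned in the
# comments above, reusing the x_train/x_test/y_train/y_test split from that snippet.
# The estimator settings here are illustrative assumptions, not from the original code.
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# bagging: several bootstrap datasets + the same base model, predictions aggregated
bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=100, random_state=11)
bag_clf.fit(x_train, y_train)
print("Bagging accuracy:", accuracy_score(y_test, bag_clf.predict(x_test)))

# boosting: repeated rounds where misclassified samples receive higher weights
boost_clf = AdaBoostClassifier(n_estimators=100, random_state=11)
boost_clf.fit(x_train, y_train)
print("Boosting accuracy:", accuracy_score(y_test, boost_clf.predict(x_test)))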
clf.fit(X_train, y_train)

# Predict y_pred
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_pred, y_test)

# Evaluate clf's accuracy on the test set
print('{:s} : {:.3f}'.format(clf_name, accuracy))

# Import VotingClassifier from sklearn.ensemble
from sklearn.ensemble import VotingClassifier

# Instantiate a VotingClassifier vc
vc = VotingClassifier(estimators=classifiers)

# Fit vc to the training set
vc.fit(X_train, y_train)

# Evaluate the test set predictions
y_pred = vc.predict(X_test)

# Calculate accuracy score
accuracy = accuracy_score(y_pred, y_test)
print('Voting Classifier: {:.3f}'.format(accuracy))
# Better accuracy than the three other classifiers

# After the Voting Classifier, we have a look at bagging classifiers and regression

# Import DecisionTreeClassifier
    if acc_score > best_score:
        best_score = acc_score
        best_model = bag

best_model
pred = best_model.predict(X_test)
accuracy_score(y_test, pred)

# svc, random forest, and logistic regression in a voting classifier
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier(random_state=181)
svm_clf = SVC(kernel='linear', random_state=181)

voting_clf = VotingClassifier(estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
                              voting='hard')

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, pred))

# CV in the voting classifier (OBS: takes approx. 20 min)
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier(random_state=181)
svm_clf = SVC(kernel='linear', probability=True, random_state=181)

voting_clf = VotingClassifier(estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    Features_10_folds.append(features)
    Labels_10_folds.append(labels)

# for the last fold, take all remaining samples
features = data[data.columns[1:44]]
labels = data[data.columns[44]]
Features_10_folds.append(features)
Labels_10_folds.append(labels)

print(Features_10_folds[0].shape)

clf = RandomForestClassifier(max_depth=8, random_state=0)
mlp = MLPClassifier(hidden_layer_sizes=(50, 25))
gbt = GradientBoostingClassifier()
ovr = OneVsRestClassifier(RandomForestClassifier())
eclf = VotingClassifier(estimators=[('gbt', gbt), ('ovr', ovr)], voting='soft')

acc_RF = []
prec_RF = []
rec_RF = []
f1_RF = []
acc_MLP = []
prec_MLP = []
rec_MLP = []
f1_MLP = []
acc_GBT = []
prec_GBT = []
rec_GBT = []
f1_GBT = []
acc_ovr = []
prec_ovr = []
rec_ovr = []
def test_estimator_html_repr_pipeline():
    num_trans = Pipeline(
        steps=[("pass", "passthrough"), ("imputer", SimpleImputer(strategy="median"))])
    cat_trans = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="constant", missing_values="empty")),
        ("one-hot", OneHotEncoder(drop="first")),
    ])
    preprocess = ColumnTransformer([
        ("num", num_trans, ["a", "b", "c", "d", "e"]),
        ("cat", cat_trans, [0, 1, 2, 3]),
    ])
    feat_u = FeatureUnion([
        ("pca", PCA(n_components=1)),
        (
            "tsvd",
            Pipeline([
                ("first", TruncatedSVD(n_components=3)),
                ("select", SelectPercentile()),
            ]),
        ),
    ])
    clf = VotingClassifier([
        ("lr", LogisticRegression(solver="lbfgs", random_state=1)),
        ("mlp", MLPClassifier(alpha=0.001)),
    ])
    pipe = Pipeline([("preprocessor", preprocess), ("feat_u", feat_u), ("classifier", clf)])
    html_output = estimator_html_repr(pipe)

    # top level estimators show estimator with changes
    assert html.escape(str(pipe)) in html_output
    for _, est in pipe.steps:
        assert ('<div class="sk-toggleable__content"><pre>' +
                html.escape(str(est))) in html_output

    # low level estimators do not show changes
    with config_context(print_changed_only=True):
        assert html.escape(str(num_trans["pass"])) in html_output
        assert "passthrough</label>" in html_output
        assert html.escape(str(num_trans["imputer"])) in html_output
        for _, _, cols in preprocess.transformers:
            assert f"<pre>{html.escape(str(cols))}</pre>" in html_output

        # feature union
        for name, _ in feat_u.transformer_list:
            assert f"<label>{html.escape(name)}</label>" in html_output
        pca = feat_u.transformer_list[0][1]
        assert f"<pre>{html.escape(str(pca))}</pre>" in html_output
        tsvd = feat_u.transformer_list[1][1]
        first = tsvd["first"]
        select = tsvd["select"]
        assert f"<pre>{html.escape(str(first))}</pre>" in html_output
        assert f"<pre>{html.escape(str(select))}</pre>" in html_output

        # voting classifier
        for name, est in clf.estimators:
            assert f"<label>{html.escape(name)}</label>" in html_output
            assert f"<pre>{html.escape(str(est))}</pre>" in html_output
# Decision tree
dt_clf = DecisionTreeClassifier(random_state=666)
dt_clf.fit(X_train, y_train)
dt_score = dt_clf.score(X_test, y_test)

y_predict1 = log_clf.predict(X_test)
y_predict2 = svm_clf.predict(X_test)
y_predict3 = dt_clf.predict(X_test)
y_predict = np.array((y_predict1 + y_predict2 + y_predict3) >= 2, dtype='int')
score1 = accuracy_score(y_test, y_predict)
print(score1)

# Use VotingClassifier: the majority vote wins
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(estimators=[
    ('log_clf', LogisticRegression()),
    ('svm_clf', SVC()),
    ('dt_clf', DecisionTreeClassifier(random_state=666))
], voting='hard')
voting_clf.fit(X_train, y_train)
score2 = voting_clf.score(X_test, y_test)
print(score2)

# A more reasonable vote should carry weights (see the weighted-voting sketch after this snippet)
# Hard Voting Classifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(estimators=[
    ('log_clf', LogisticRegression()),
    ('svm_clf', SVC()),
    ('dt_clf', DecisionTreeClassifier(random_state=666))
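# A minimal sketch of the weighted voting mentioned above, assuming the same
# X_train/y_train/X_test/y_test as in the snippet; the weights are illustrative,
# not from the original code. With weights, each estimator's vote (or its predicted
# probabilities under voting='soft') is scaled before the majority/average is taken.
weighted_clf = VotingClassifier(estimators=[
    ('log_clf', LogisticRegression()),
    ('svm_clf', SVC(probability=True)),  # probability=True is required for soft voting
    ('dt_clf', DecisionTreeClassifier(random_state=666))
], voting='soft', weights=[1, 2, 3])
weighted_clf.fit(X_train, y_train)
print(weighted_clf.score(X_test, y_test))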
# 1.0

# <h4> Voting Classifier </h4>

from sklearn.linear_model import LogisticRegression  # importing logistic regression
from sklearn.svm import SVC  # importing SVM

estimators = []
log_reg = LogisticRegression(solver='liblinear')
estimators.append(('Logistic', log_reg))
tree = DecisionTreeClassifier()
estimators.append(('Tree', tree))
svm_clf = SVC(gamma='scale')
estimators.append(('svm', svm_clf))

voting = VotingClassifier(estimators=estimators)
voting.fit(x_train, y_train)

voting.score(x_test, y_test)
# 0.8051948051948052

voting.score(x_train, y_train)
# 0.8110749185667753
df['carrier'] = pd.factorize(df['carrier'])[0]
df['dest'] = pd.factorize(df['dest'])[0]
test_x = enc.transform(df)

print(train_x.shape)

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')
eclf.fit(train_x.toarray(), train_y)

# Evaluate on test set
pr = eclf.predict(test_x.toarray())

# print results
cm = confusion_matrix(test_y, pr)
print("<------- VotingClassifier -------->")
print("Confusion matrix:")
print(pd.DataFrame(cm))
report_svm = precision_recall_fscore_support(list(test_y), list(pr), average='binary')
print("\n[-] Precision = %0.2f\n[-] Recall = %0.2f\n[-] F1 score = %0.2f\n[-] Accuracy = %0.2f" % \
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# -------------------------------------------------------------------------------- #
print_dividing_line('VotingClassifier')

from sklearn.linear_model import LogisticRegression
log_clf = LogisticRegression()
from sklearn.svm import SVC
svc_clf = SVC()
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier()

from sklearn.ensemble import VotingClassifier
voting_clf = VotingClassifier(
    estimators=[("log", log_clf), ("svc", svc_clf), ("rf", rf_clf)],
    voting="hard"
)  # soft

from sklearn.metrics import accuracy_score
for clf in (log_clf, svc_clf, rf_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

# -------------------------------------------------------------------------------- #
print_dividing_line('BaggingClassifier')

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=-1
)
from sklearn.metrics import accuracy_score
         tprRF, '', label="SuperStrength with Random Forest, auc= %0.2f" % aucRF)
plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive')
plt.ylabel('True Positive')
plt.legend(loc=4)
plt.show()

'''
####################################### VOTING_CLASSIFIER ########################################
'''
from sklearn.ensemble import VotingClassifier

votingClf = VotingClassifier(estimators=[('tr', classificadorTREE), ('rf', classificadorRF),
                                         ('nb', classificadorNB)],
                             voting='soft', weights=[1.1, 2, 1])

for clf, label in zip(
        [classificadorTREE, classificadorRF, classificadorNB, votingClf],
        ['Decision Tree', 'Random Forest', 'Naive Bayes', 'Ensemble']):
    scores = cross_val_score(clf, previsores, classe, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f [%s]" % (scores.mean(), label))

'''
#################################################################################################
############################################ ENSEMBLE ###########################################
#################################################################################################
'''
"slotprice", "creative", "keypage", "advertiser", "usertag", ] """ estimators = [] num_estimators = 11 models_already_created = False do_LR = True if models_already_created: for i in range(num_estimators): model = load("svm" + str(i)) estimators.append(("svm" + str(i), model)) v = VotingClassifier(estimators, n_jobs=-1) voting_bidder = Bidder(("voting", v), None) voting_bidder.train() voting_bidder.test() elif do_LR: """ kernel = C(1.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2)) clf3 = GaussianProcessClassifier( kernel=kernel, max_iter_predict=100, multi_class="one_vs_one", n_jobs=-1, n_restarts_optimizer=5, ) b5 = Bidder(("gp", clf3), None) model5 = b5.train()
        Xtr = scaler.fit_transform(Xtr)
        Xte = scaler.transform(Xte)
        for name, clf in Classifiers:
            try:
                clone_clf = clone(clf)
                clone_clf.fit(Xtr, ytr)
                y_pred = clone_clf.predict(Xte)
                df_sim[name] = [score(yte, y_pred)]
            except:
                print("Classifier %s failed to process dataset %s" % (name, Name))
        df = pd.concat([df, df_sim])
    df.to_csv("CSVs/%s.csv" % Name)
    return df


VotingSVC = VotingClassifier([("RBF SVC", SVC(gamma="scale")),
                              ("Linear SVC", SVC(kernel="linear")),
                              ("Poly SVC", SVC(kernel="poly"))])
BaggingSVC = BaggingClassifier(base_estimator=SVC(gamma="scale"), n_estimators=10, random_state=0)

Classifiers = [("Linear SVC", SVC(kernel="linear", gamma="scale")),
               ("RBF SVC", SVC(gamma="scale")),
               ("Poly SVC", SVC(kernel="poly", gamma="scale")),
               ("SVC Ensemble", VotingSVC),
               ("Bagging SVC", BaggingSVC),
               ("DEP", DEP()),
               ("r-DEP (Ensemble)", make_pipeline(EnsembleTransform(VotingSVC), StandardScaler(), DEP())),
               ("r-DEP (Bagging)", make_pipeline(EnsembleTransform(BaggingSVC), StandardScaler(), DEP())),
               ]

AllDataSets = [
    ("Breast Cancer Wisconsin", "wdbc", 1),
    ("Diabetes", "diabetes", 1),
m3 = RandomForestClassifier(n_estimators=80)
models.append(('r_forest_80', m3))
m4 = RandomForestClassifier(n_estimators=90)
models.append(('r_forest_90', m4))
m5 = KNeighborsClassifier(n_neighbors=1)
models.append(('knn_1', m5))
m6 = KNeighborsClassifier(n_neighbors=2)
models.append(('knn_2', m6))
m7 = KNeighborsClassifier(n_neighbors=3)
models.append(('knn_3', m7))
m8 = KNeighborsClassifier(n_neighbors=4)
models.append(('knn_4', m8))
m9 = KNeighborsClassifier(n_neighbors=5)
models.append(('knn_5', m9))

# create voting ensemble (estimator names must be unique for VotingClassifier)
e = VotingClassifier(models, weights=[0.8, 0.9, 1, 1.1, 1.1, 1.1, 1, 0.9, 0.8])
e.fit(train_X.values, np.ravel(train_Y.values))
preds = e.predict(test_X.values)
print(accuracy_score(np.ravel(test_Y.values), preds))

'''
df_pred = pd.DataFrame(preds, columns=['coverType_1to7'])
df_pred.insert(loc=0, column='id', value=np.ravel(df_test_X_ids.values))
print(df_pred[:10])
df_pred.to_csv('voting_4forst_5knn_weights.csv', index=False)
'''

# get cv result
# result = model_selection.cross_val_score(e, df_X.values, np.ravel(df_Y.values)
#                                          , cv=k)
# print(result.mean())

# predictions = clf.predict(df_test_X.values[:20000])
print("After Standardization\nMean ", np.mean(X_train), "Standard Deviation ", np.std(X_train), "\n") #Voting ensemble method. Combining all tree based algorithms. models = [] models.append(("XGB", XGBClassifier())) models.append(("RF", RandomForestClassifier())) models.append(("DT", DecisionTreeClassifier())) models.append(("ADB", AdaBoostClassifier())) models.append(("GB", GradientBoostingClassifier())) ############################################################################# # test and train the upsampled data against classifiers # to find the optimum prediction ############################################################################# ensemble = VotingClassifier(estimators=models) ensemble.fit(X_train, y_train) y_pred = ensemble.predict(X_test) print(classification_report(y_pred, y_test)) print("Voting Ensemble:>", accuracy_score(y_pred, y_test)) SVM = SVC(kernel="linear", class_weight="balanced", probability=True) SVM.fit(X_train, y_train) y_pred = SVM.predict(X_test) print(classification_report(y_pred, y_test)) print("SVM: ", accuracy_score(y_pred, y_test)) XGBC = XGBClassifier(learning_rate=0.1, n_estimators=10000, max_depth=4, min_child_weight=6,
def perform():
    try:
        filename = session['filename']
        data = pd.read_csv(filename, header=0)
        array = data.values
        X = array[:, 0:8]
        y = array[:, 8]
        X_train, X_validation, Y_train, Y_validation = train_test_split(X, y, test_size=0.33, random_state=1)

        # Spot Check Algorithms
        models = []
        models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
        #models.append(('LDA', LinearDiscriminantAnalysis()))
        models.append(('KNN', KNeighborsClassifier()))
        models.append(('CART', DecisionTreeClassifier(criterion="entropy")))
        models.append(('NB', GaussianNB()))
        #models.append(('SVM', SVC(gamma='auto')))
        #models.append(('ANN', MLPClassifier(hidden_layer_sizes=(13, 13, 13), max_iter=500, random_state=42)))
        models.append(('RF', RandomForestClassifier()))
        models.append(('BG', BaggingClassifier(DecisionTreeClassifier(), max_samples=0.5, max_features=1.0, n_estimators=20)))
        models.append(('ADA', AdaBoostClassifier(DecisionTreeClassifier(), n_estimators=5, learning_rate=1)))

        LR = LogisticRegression(solver='liblinear', multi_class='ovr')
        LDA = LinearDiscriminantAnalysis()
        KNN = KNeighborsClassifier()
        CART = DecisionTreeClassifier(criterion="entropy")
        RF = RandomForestClassifier()
        NB = GaussianNB()
        SVM = SVC(gamma='auto')
        ANN = MLPClassifier(hidden_layer_sizes=(13, 13, 13), max_iter=500, random_state=42)
        BG = BaggingClassifier(DecisionTreeClassifier(), max_samples=0.5, max_features=1.0, n_estimators=20)
        ADA = AdaBoostClassifier(DecisionTreeClassifier(), n_estimators=5, learning_rate=1)
        models.append(('VOTE', VotingClassifier(estimators=[('RF', RF), ('BG', BG)], voting='hard')))

        # evaluate each model in turn
        results = []
        names = []
        for name, model in models:
            kfold = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
            cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
            results.append(cv_results)
            names.append(name)
            #print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))

        # Compare Algorithms
        plt.boxplot(results, labels=names)
        plt.title('Algorithm Comparison')
        plt.plot()
        session['strFile'] = "./static/images/perf.png"
        strFile = "./static/images/perf.png"
        if os.path.isfile(strFile):
            os.remove(strFile)
        plt.savefig(strFile)
        plt.close()
    except KeyError:
        flash('Dataset not uploaded!')
    #plt.savefig('/home/saurabh/Desktop/DPS/static/images/perf.png')
    return render_template('dashboard.html', name='Plot Showing Accuracy of Different Algorithms:',
                           url='/static/images/perf.png')
for clf, label in zip([rfc, gbc, lgr], clf_labels):
    scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=10, scoring="roc_auc")
    print("ROC AUC: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

print("------------------------VotingClassifier ensemble model-------------------------------")
ensemble_clf = VotingClassifier(estimators=[
    ('RandomForestClassifier', rfc),
    ('GradientBoostingClassifier', gbc),
    ('LogisticRegression', lgr)
], voting='soft', weights=[1, 1, 1], flatten_transform=True)

ensemble_clf.fit(X_train, y_train)
preds = ensemble_clf.predict(X_test)
print("VotingClassifier ROC AUC: %.3f" % roc_auc_score(y_true=y_test, y_score=preds))
print("VotingClassifier accuracy_score: %.3f" % accuracy_score(y_true=y_test, y_pred=preds))

clf_labels = [
    "RandomForestClassifier", "GradientBoostingClassifier",
    "LogisticRegression", "VotingClassifier"
]
class HogFaceClassifier:
    svc_pipeline = Pipeline([
        # ('preprocess', FunctionTransformer(get_face_hog)),
        ('classifier', SVC(C=10, kernel='poly', gamma=1, shrinking=False,
                           class_weight='balanced', probability=True, tol=0.001,
                           cache_size=10000, max_iter=-1, verbose=0))
    ])

    et_pipeline = Pipeline([
        # ('preprocess', FunctionTransformer(get_face_hog)),
        ('classifier', ExtraTreesClassifier(n_estimators=10000, criterion='entropy',
                                            max_features=0.2, verbose=0, n_jobs=2))
    ])

    ens = VotingClassifier(estimators=[
        ('svc', svc_pipeline),
        ('et', et_pipeline),
    ], voting='soft', weights=[10, 1], n_jobs=2)

    def __init__(self, binary_classification: bool = False, params: dict = None):
        self.binary_classification = binary_classification
        # self.ens = self.et_pipeline
        print(str(self.ens.get_params().keys()))
        if params is not None:
            self.ens.set_params(**params)

    def fit(self, x, y):
        self.ens.fit(x, y)

    def save(self, file):
        joblib.dump(self.ens, file)

    def load(self, file):
        self.ens = joblib.load(file)

    def cv_test(self, x, y):
        score = cross_val_score(self.ens, x, y, cv=3, verbose=3, n_jobs=3)
        return score.mean()

    def prediction(self, X):
        return self.ens.predict(X)

    def evaluate(self, x_test, y_test) -> ClassificationResults:
        preds = self.prediction(x_test)
        pred_probs = self.ens.predict_proba(x_test)
        acc = accuracy_score(from_hot_one(y_test), preds)
        results = ClassificationResults(labels=y_test, preds=preds, pred_probs=pred_probs,
                                        acc=acc, binary=self.binary_classification)
        return results
def file_output(
    self,
    Y_optimization_pred: np.ndarray,
    Y_valid_pred: np.ndarray,
    Y_test_pred: np.ndarray,
) -> Tuple[Optional[float], Dict[str, Union[str, int, float, List, Dict, Tuple]]]:
    # Abort if self.Y_optimization is None
    # self.Y_optimization can be None if we use partial-cv, then,
    # obviously no output should be saved.
    if self.Y_optimization is None:
        return None, {}

    # Abort in case of shape misalignment
    if np.shape(self.Y_optimization)[0] != Y_optimization_pred.shape[0]:
        return (
            1.0,
            {
                'error':
                    "Targets %s and prediction %s don't have "
                    "the same length. Probably training didn't "
                    "finish" % (np.shape(self.Y_optimization), Y_optimization_pred.shape)
            },
        )

    # Abort if predictions contain NaNs
    for y, s in [
        # Y_train_pred deleted here. Fix unittest accordingly.
        [Y_optimization_pred, 'optimization'],
        [Y_valid_pred, 'validation'],
        [Y_test_pred, 'test']
    ]:
        if y is not None and not np.all(np.isfinite(y)):
            return (
                1.0,
                {'error': 'Model predictions for %s set contains NaNs.' % s},
            )

    # Abort if we don't want to output anything.
    # Since disable_file_output can also be a list, we have to explicitly
    # compare it with True.
    if self.disable_file_output is True:
        return None, {}

    # Notice that disable_file_output==False and disable_file_output==[]
    # means the same thing here.
    if self.disable_file_output is False:
        self.disable_file_output = []

    # Here onwards, the self.disable_file_output can be treated as a list
    self.disable_file_output = cast(List, self.disable_file_output)

    # This file can be written independently of the others down below
    if 'y_optimization' not in self.disable_file_output:
        if self.output_y_hat_optimization:
            self.backend.save_targets_ensemble(self.Y_optimization)

    models: Optional[BaseEstimator] = None
    if hasattr(self, 'models'):
        if len(self.models) > 0 and self.models[0] is not None:  # type: ignore[attr-defined]
            if 'models' not in self.disable_file_output:
                if self.task_type in CLASSIFICATION_TASKS:
                    models = VotingClassifier(estimators=None, voting='soft')
                else:
                    models = VotingRegressor(estimators=None)
                # Mypy cannot understand hasattr yet
                models.estimators_ = self.models  # type: ignore[attr-defined]

    self.backend.save_numrun_to_dir(
        seed=self.seed,
        idx=self.num_run,
        budget=self.budget,
        model=self.model if 'model' not in self.disable_file_output else None,
        cv_model=models if 'cv_model' not in self.disable_file_output else None,
        ensemble_predictions=(Y_optimization_pred if 'y_optimization' not in
                              self.disable_file_output else None),
        valid_predictions=(Y_valid_pred if 'y_valid' not in
                           self.disable_file_output else None),
        test_predictions=(Y_test_pred if 'y_test' not in
                          self.disable_file_output else None),
    )

    return None, {}
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier

# In[42]:

clf1 = LogisticRegression(random_state=101)
clf2 = RandomForestClassifier(random_state=101)
clf3 = GaussianNB()

# In[43]:

X = df.drop(['Observed Attendance'], axis=1)
y = df['Observed Attendance']

eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],
                         weights=(1, 2, 3), voting='hard')
eclf1 = eclf1.fit(X_train, y_train)
eclf1 = eclf1.fit(X, y)
print(eclf1.predict(X))

# In[44]:

eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],
                         weights=(1, 2, 3), voting='hard')
eclf2 = eclf2.fit(X_train, y_train)
predict = eclf2.predict(X_test)
print(classification_report(y_test, predict))

# In[45]:
    print('Sensitivity_std:%f' % r_s)
    print('Specificity_std:%f' % s_s)
    print('f1_std:%f' % f_s)


if __name__ == '__main__':
    # clf_lr = MLPClassifier(activation='relu', alpha=0.005)
    # clf_lr.fit(data.iloc[:, :-1], data.iloc[:, -1])
    clf_lr = LogisticRegression(C=0.07)
    ada_lr = AdaBoostClassifier(clf_lr, n_estimators=20)
    clf_svm = SVC(gamma=30, probability=True)
    ada_svm = AdaBoostClassifier(clf_svm, n_estimators=20, algorithm="SAMME")
    clf_nb = MultinomialNB(alpha=10)
    ada_nb = AdaBoostClassifier(clf_nb, n_estimators=20)
    # clf_dt = DecisionTreeClassifier()
    # ada_dt = AdaBoostClassifier(clf_dt, n_estimators=20, learning_rate=0.5)

    voting_clf = VotingClassifier(estimators=[("ada_lr", ada_lr), ("ada_svm", ada_svm),
                                              ("ada_nb", ada_nb)],
                                  voting='soft')  # , weights=[1.2, 1, 1]

    for clf in [("lr", clf_lr), ("svm", clf_svm), ("NB", clf_nb), ('ensemble', voting_clf)]:
        print(clf[0])
        result(data, clf[1])
    for clf in [("lr", clf_lr), ("svm", clf_svm), ("NB", clf_nb), ('ensemble', voting_clf)]:
        print(clf[0])
        std(data, clf[1])
    print('************Stats of ' + col + '****************')
    print(encoded_ds[col].describe())
    plt.hist(encoded_ds[col])
    plt.show()
    print('************End of stats for ' + col + '*************')
    print('\n')

# build X and y
X = encoded_ds.iloc[:, 0:-1]  # all columns except the last column
y = integer_encoded_churn
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

clf_knn = KNeighborsClassifier(n_neighbors=5)
clf_rf = RandomForestClassifier(random_state=1, n_estimators=100, max_depth=100, max_leaf_nodes=100)
clf_lr = LogisticRegression(class_weight="balanced")
estimators = [('knn', clf_knn), ('lr', clf_lr), ('dt', clf_rf)]
clf_avg = VotingClassifier(estimators, voting='soft')
clf_avg.fit(X_train, y_train)
print(accuracy_score(y_test, clf_avg.predict(X_test)))

y_pred = [0, 2, 1, 3]
y_true = [0, 1, 2, 3]
print(accuracy_score(y_true, y_pred))

plt.figure(figsize=(12, 10))
cor = encoded_ds.corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()
# The predictions are quite similar for the 5 classifiers, except when AdaBoost is compared to the others.
#
# The 5 classifiers give more or less the same prediction, but there are some differences. These differences between the 5 classifiers' predictions are sufficient to consider an ensembling vote.

# ### 6.2 Ensemble modeling
# #### 6.2.1 Combining models
#
# I chose a voting classifier to combine the predictions coming from the 5 classifiers.
#
# I preferred to pass the argument "soft" to the voting parameter to take into account the probability of each vote (a short sketch of this averaging follows after this cell).

# In[75]:

votingC = VotingClassifier(estimators=[('rfc', RFC_best), ('extc', ExtC_best),
                                       ('svc', SVMC_best), ('adac', ada_best),
                                       ('gbc', GBC_best)],
                           voting='soft', n_jobs=4)

votingC = votingC.fit(X_train, Y_train)

# ### 6.3 Prediction
# #### 6.3.1 Predict and Submit results

# In[76]:

test_Survived = pd.Series(votingC.predict(test), name="Survived")
results = pd.concat([IDtest, test_Survived], axis=1)
results.to_csv("ensemble_python_voting.csv", index=False)
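# A minimal sketch of what voting='soft' does, assuming votingC and X_train from the
# cell above are available and that every base estimator exposes predict_proba: the
# ensemble averages the per-class probabilities (optionally weighted) and predicts
# the class with the highest average probability.
import numpy as np

probas = np.mean([est.predict_proba(X_train) for est in votingC.estimators_], axis=0)
manual_soft_pred = votingC.classes_[np.argmax(probas, axis=1)]
# For an unweighted soft vote this should match votingC.predict(X_train).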
# k-fold cross validation (shuffle=True is required when a random_state is set)
kfolds = KFold(n_splits=10, shuffle=True, random_state=0)

# create the sub models
estimators = []
model1 = LogisticRegression()
estimators.append(('logistic', model1))
model2 = DecisionTreeClassifier()
estimators.append(('cart', model2))
model3 = SVC()
estimators.append(('svm', model3))
estimators

ensemble = VotingClassifier(estimators)
results = cross_val_score(ensemble, X_train, y_train, cv=kfolds)
print(results.mean())

modelfit = ensemble.fit(X_train, y_train)
y_pred = modelfit.predict(X_test)
y_pred

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cm
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier, VotingClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_classes, testing_classes = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    make_union(
        make_union(
            VotingClassifier([('branch', AdaBoostClassifier(learning_rate=1.0, n_estimators=500))]),
            FunctionTransformer(lambda X: X)),
        FunctionTransformer(lambda X: X)
    ),
    Nystroem(gamma=8.0, kernel="polynomial", n_components=27),
    LogisticRegression(C=0.0001, dual=False, penalty="l1")
)

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
print("-------------------------------------------------") #define a decision tree model using entropy based information gain #decTreeModel2 = tree.DecisionTreeClassifier(criterion='entropy') #decTreeModel2 = AdaBoostClassifier() #decTreeModel2 = GaussianNB() #decTreeModel2 = GradientBoostingClassifier() #decTreeModel2 = BaggingClassifier() clf1 = LogisticRegression(random_state=1) clf2 = RandomForestClassifier(random_state=1) clf3 = GradientBoostingClassifier() clf4 = SVC() decTreeModel2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3), ('bc', clf4)], voting='hard') #decTreeModel2 = LogisticRegression(random_state=1) #decTreeModel2 = LogisticRegression(random_state=1) #train_dfs = preprocessing.normalize(train_dfs) #Split the data: 60% training : 40% test set instances_train, instances_test, target_train, target_test = cross_validation.train_test_split( train_dfs, targetLabels, test_size=0.4, random_state=0) #fit the model using just the test set decTreeModel2.fit(instances_train, target_train) #Use the model to make predictions for the test set queries predictions = decTreeModel2.predict(instances_test) #Output the accuracy score of the model on the test set print("Accuracy= " +
    assert_array_equal(eclf2.transform(X).shape, (4, 6))
    assert_array_equal(eclf3.transform(X).shape, (3, 4, 2))
    assert_array_almost_equal(eclf1.transform(X), eclf2.transform(X))
    assert_array_almost_equal(
        eclf3.transform(X).swapaxes(0, 1).reshape((4, 6)),
        eclf2.transform(X)
    )


@pytest.mark.filterwarnings('ignore: Default solver will be changed')  # 0.22
@pytest.mark.filterwarnings('ignore: Default multi_class will')  # 0.22
@pytest.mark.parametrize(
    "X, y, voter",
    [(X, y, VotingClassifier(
        [('lr', LogisticRegression()),
         ('rf', RandomForestClassifier(n_estimators=5))])),
     (X_r, y_r, VotingRegressor(
         [('lr', LinearRegression()),
          ('rf', RandomForestRegressor(n_estimators=5))]))]
)
@pytest.mark.parametrize("drop", [None, 'drop'])
def test_none_estimator_with_weights(X, y, voter, drop):
    # check that an estimator can be set to None and passing some weight
    # regression test for
    # https://github.com/scikit-learn/scikit-learn/issues/13777
    voter.fit(X, y, sample_weight=np.ones(y.shape))
    voter.set_params(lr=drop)
    voter.fit(X, y, sample_weight=np.ones(y.shape))
    y_pred = voter.predict(X)
    assert y_pred.shape == y.shape