def stacking(_stacking_model_list, _final_clf, _metric, X_train, X_val, y_train, y_val, X_test, _cv):
    """Cross-validate a stacked classifier, plot its validation ROC and predict the test set.

    Args:
        _stacking_model_list: base classifiers handed to ``StackingCVClassifier``.
        _final_clf: meta-classifier trained on the base classifiers' probabilities.
        _metric: scoring name/callable forwarded to ``cross_val_score``.
        X_train, y_train: data used for cross-validation and for the final fit.
        X_val, y_val: data used for the ROC plot.
        X_test: data to predict.
        _cv: cross-validation spec forwarded to ``cross_val_score``.

    Returns:
        Column 1 of ``predict_proba(X_test)``.

    Side effects:
        Prints the cross-validated scores, saves the ROC figure under
        ``Result/`` and shows it.
    """
    # Might wanna consider remove _final_clf from the _stacking_model_list
    sclf = StackingCVClassifier(classifiers=_stacking_model_list,
                                use_probas=True,
                                meta_classifier=_final_clf,
                                random_state=42)

    scores = model_selection.cross_val_score(sclf, X_train, y_train, cv=_cv, scoring=_metric)
    print('Cross-validated score:', scores)
    print('-' * 20)

    # cross_val_score only fits clones of `sclf`; the estimator itself is still
    # unfitted here, so it must be trained before predict_proba can be used.
    sclf.fit(X_train, y_train)

    predicted_probas = sclf.predict_proba(X_val)
    skplt.metrics.plot_roc_curve(y_val, predicted_probas)
    plt.savefig(r'Result/' + sclf.__class__.__name__ + 'stacking.png')
    plt.show()

    prediction = sclf.predict_proba(X_test)[:, 1]
    return prediction
# Build the stacked ensemble, cross-validate it, then fit it and score it on the test data
from mlxtend.classifier import StackingCVClassifier

sclf = StackingCVClassifier(
    classifiers=[rf, lr, gb, et, gnb, svc, knn, xgb, ada, mlp, lda, qda],
    use_features_in_secondary=True,
    use_probas=True,
    meta_classifier=eclf,
)

# Mean 5-fold cross-validated score per metric, in this fixed order:
# accuracy, precision, recall, roc_auc.
cmetrics = [
    cross_val_score(sclf, X.values, y.values, cv=5, scoring=scorer).mean()
    for scorer in ('accuracy', 'precision', 'recall', 'roc_auc')
]

sclf.fit(X.values, y.values)
pred = sclf.predict(Xt.values)

# ROC curve on the test data, built from column 1 of predict_proba
pred_proba = sclf.predict_proba(Xt.values)[:, 1]
fpr, tpr, threshold = roc_curve(yt, pred_proba)
roc_auc = auc(fpr, tpr)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1], 'r--')  # diagonal reference line
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.savefig('ROC_curve_test.png', bbox_inches='tight')
plt.clf()

perf = pd.read_csv('performance_estimates.csv')

# Test-set metrics; accuracy is the first entry
metrics = [accuracy_score(yt, pred)]
num_folds = 6 folds = KFold(n_splits=num_folds, shuffle=True) test_result = np.zeros(len(test)) auc_score = 0 for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, targets)): print("Fold: ", fold_ + 1) X_train, y_train = train.iloc[trn_idx], targets.iloc[trn_idx] X_valid, y_valid = train.iloc[val_idx], targets.iloc[val_idx] sclf.fit(X_train.values, y_train.values) y_pred = sclf.predict_proba(X_valid) auc = roc_auc_score(y_valid, y_pred[:, 1]) print(auc) auc_score += auc preds = sclf.predict_proba(test) test_result += preds[:, 1] # print the average AUC across the folds and compute the final results on the test data auc_score = auc_score / folds.n_splits print("AUC score: ", auc_score) test_result = test_result / folds.n_splits # create the submission submission = pd.DataFrame({ 'Id' : test['Id'],
class StackingDemo(object):
    """Small StackingCVClassifier walkthrough on the first 100 iris samples.

    Call order: ``data_prepare`` -> ``function_set`` -> ``goodness_of_function``
    (grid search) and/or ``pick_the_best_function`` (fixed hyper-parameters).
    """

    def __init__(self):
        # data prepare
        self.__iris = None
        self.__X = None
        self.__y = None
        self.__train = None
        self.__train_label = None
        self.__test = None
        self.__test_label = None

        # function set
        self.__params = None
        self.__lr = None
        self.__gb = None
        self.__rf = None
        self.__sclf = None
        self.__grid = None

    def __make_stack(self):
        """Build the stack from the estimators currently stored on the instance."""
        return StackingCVClassifier(
            classifiers=[self.__lr, self.__gb, self.__rf],
            meta_classifier=self.__lr,
            use_probas=True,
            cv=5,
            use_features_in_secondary=True,
            verbose=1)

    def data_prepare(self):
        """Load iris, keep the first 100 rows and make an 80/20 shuffled split."""
        self.__iris = load_iris()
        self.__X = self.__iris.data[0:100]
        self.__y = self.__iris.target[0:100]

        split = train_test_split(self.__X, self.__y, test_size=0.2, shuffle=True)
        self.__train, self.__test, self.__train_label, self.__test_label = split

    def function_set(self):
        """Create the parameter grid, the estimators, the stack and the grid search."""
        # param
        # Note: the keys must be spelled exactly like this.
        self.__params = {
            "logisticregression__C":
                list(np.linspace(start=0.1, stop=10, num=5)),
            "gradientboostingclassifier__learning_rate":
                list(np.linspace(start=0.1, stop=1, num=10)),
            "randomforestclassifier__n_estimators":
                list(range(5, 16)),
            "meta-logisticregression__C":
                list(np.linspace(start=0.1, stop=10, num=5)),
        }

        # model
        self.__lr = LogisticRegression()
        self.__gb = GradientBoostingClassifier()
        self.__rf = RandomForestClassifier()
        self.__sclf = self.__make_stack()
        self.__grid = GridSearchCV(
            estimator=self.__sclf,
            param_grid=self.__params,
            cv=5,
            refit=True)

    def goodness_of_function(self):
        """Run the grid search on the training split and print the best result."""
        self.__grid.fit(self.__train, self.__train_label)
        print("Best parameters: %s" % self.__grid.best_params_)
        print("Accuracy: %.2f" % self.__grid.best_score_)

    def pick_the_best_function(self):
        """Refit the stack with fixed hyper-parameters and print the test ROC AUC."""
        self.__lr = LogisticRegression(C=0.1)
        self.__gb = GradientBoostingClassifier(learning_rate=0.1)
        self.__rf = RandomForestClassifier(n_estimators=5)
        self.__sclf = self.__make_stack()
        self.__sclf.fit(self.__train, self.__train_label)

        positive_proba = self.__sclf.predict_proba(self.__test)[:, 1]
        print(roc_auc_score(self.__test_label, positive_proba))
class StackingBaseline(object):
    """Baseline stacked classifier over the application train/test CSV files.

    Call order: ``data_prepare`` -> ``numeric_handle`` -> ``categorical_handle``
    -> ``model_fit`` -> ``model_predict``.

    Args:
        path: directory containing ``application_train.csv``,
            ``application_test.csv`` and ``sample_submission.csv``.
    """

    # Original hard-coded submission location, kept as the default output path.
    _DEFAULT_SUBMISSION_PATH = (
        '/Users/David/Desktop/0.Home default risk/submission/stack_baseline')

    def __init__(self, *, path):
        self.__path = path
        self.__application_train = None
        self.__application_test = None
        self.__sample_submission = None

        # data prepare
        self.__application_train_feature = None
        self.__application_train_label = None
        self.__application_test_feature = None
        self.__categorical_columns = None
        self.__numeric_columns = None

        # numeric handle

        # categorical handle
        self.__encoder = None

        # model fit
        self.__lr = None
        self.__ef = None
        self.__rf = None
        self.__gb = None
        self.__xgb = None
        self.__sclf = None

    def data_prepare(self):
        """Read the CSVs, drop the id column and split features/label/column types."""
        self.__application_train = pd.read_csv(
            os.path.join(self.__path, "application_train.csv"))
        self.__application_test = pd.read_csv(
            os.path.join(self.__path, "application_test.csv"))
        self.__sample_submission = pd.read_csv(
            os.path.join(self.__path, "sample_submission.csv"))

        self.__application_train = self.__application_train.drop("SK_ID_CURR", axis=1)
        self.__application_test = self.__application_test.drop("SK_ID_CURR", axis=1)

        # drop() returns a new frame, so the in-place imputation/encoding done
        # later writes to an independent frame rather than to a slice.
        self.__application_train_feature = self.__application_train.drop("TARGET", axis=1)
        self.__application_train_label = self.__application_train["TARGET"]
        self.__application_test_feature = self.__application_test

        self.__categorical_columns = self.__application_train_feature.select_dtypes(
            include=["object"]).columns.tolist()
        self.__numeric_columns = [
            i for i in self.__application_train_feature.columns
            if i not in self.__categorical_columns
        ]

    def numeric_handle(self):
        """Replace missing numeric values with -999.0 in both train and test."""
        columns = self.__numeric_columns
        for frame in (self.__application_train_feature,
                      self.__application_test_feature):
            frame[columns] = frame[columns].fillna(-999.0)

    def categorical_handle(self):
        """Fill missing categoricals and leave-one-out encode them against the label."""
        columns = self.__categorical_columns
        train = self.__application_train_feature
        test = self.__application_test_feature

        # Both frames get the same "missing" level; previously only train was
        # filled, so test NaNs were encoded differently from train NaNs.
        for frame in (train, test):
            frame[columns] = frame[columns].fillna("missing")

        self.__encoder = LeaveOneOutEncoder()
        self.__encoder.fit(train[columns], self.__application_train_label)

        # Passing y for the training rows makes the encoder exclude each row's
        # own target; without y every row gets the plain per-level target mean,
        # which leaks the label into the training features.
        train[columns] = self.__encoder.transform(
            train[columns], self.__application_train_label)
        test[columns] = self.__encoder.transform(test[columns])

    def model_fit(self):
        """Fit four tree ensembles stacked under a logistic-regression meta-classifier."""
        self.__ef = ExtraTreesClassifier(n_jobs=-1)
        self.__rf = RandomForestClassifier(n_jobs=-1)
        self.__lr = LogisticRegression()
        self.__gb = GradientBoostingClassifier()
        # -999.0 is the fill value written by numeric_handle
        self.__xgb = XGBClassifier(n_jobs=-1, missing=-999.0)

        self.__sclf = StackingCVClassifier(
            classifiers=[self.__ef, self.__rf, self.__gb, self.__xgb],
            meta_classifier=self.__lr,
            use_probas=True,
            cv=3)
        self.__sclf.fit(self.__application_train_feature.values,
                        self.__application_train_label.values)

    def model_predict(self, output_path=None):
        """Write column 1 of the stack's test probabilities as the ``TARGET`` column.

        Args:
            output_path: where to write the submission CSV; defaults to the
                original hard-coded location.
        """
        if output_path is None:
            output_path = self._DEFAULT_SUBMISSION_PATH

        probabilities = self.__sclf.predict_proba(
            self.__application_test_feature.values)[:, 1]
        self.__sample_submission["TARGET"] = np.clip(probabilities, 0, 1)
        self.__sample_submission.to_csv(output_path, index=False)
# sclf = StackingCVClassifier(classifiers=[xgb, xtrees, rf, gb], meta_classifier=lr, use_probas=True, cv=5, verbose=2) # sclf.fit(X_train.values, y_train.values) # sclf_y_pred_proba = sclf.predict_proba(X_test.values)[:,1] # gini_norm(y_test, sclf_y_pred_proba) # # 0.2777: not much better than cv 3... # Try out some more regularization for Logit sclf = StackingCVClassifier(classifiers=[xgb, xtrees, rf, gb], meta_classifier=LogisticRegression(C=0.1), use_probas=True, cv=3, verbose=2) sclf.fit(X_train.values, y_train.values) sclf_y_pred_proba = sclf.predict_proba(X_test.values)[:, 1] gini_norm(y_test, sclf_y_pred_proba) # 0.2710, 0.2718 sclf.meta_clf_.coef_ sclf.meta_clf_.intercept_ sclf.meta_clf_.n_iter_ # __max_iter=300 # class_weight='balanced' # penalty='l1'__ # sclf = StackingCVClassifier(classifiers=[xgb, xtrees, rf, gb], meta_classifier=LogisticRegression(max_iter=300, class_weight='balanced'), use_probas=True, cv=3, verbose=2) # sclf.fit(X_train.values, y_train.values)
1, 0, 1, 1, 0, 1 ]) metaClassifier = CalibratedClassifierCV(EnsembleRegression( x0, list(clfs.keys()), le.classes_), method='isotonic', cv=META_FOLDS) sclf = StackingCVClassifier(classifiers=pipes, meta_classifier=metaClassifier, use_clones=False, use_probas=True, cv=FOLDS, verbose=1) sclf.fit(data.values, labelsEncoded, groups=None, **weightsPerClassifier) print('StackingCV classifier is fitted in ' + str(datetime.now() - start)) start = datetime.now() test = pd.read_csv('data/test_users_norm.csv').fillna(NA_CONST) Xid = test.pop('id') saveResult(Xid, sclf.predict_proba(test.values), le, 'predict/stacking.csv') print('Submission predict/stacking.csv is predicted in ' + str(datetime.now() - start)) trainPredicted = sclf.predict_proba(data.values) print('Test set nDCG5 score: ' + str(nDCG5(labelsEncoded, trainPredicted))) print('Total time: ' + str(datetime.now() - totalStart))
n_jobs=8) svc = SVC(kernel='rbf', random_state=2018, probability=True, gamma='auto') lr = LogisticRegression(max_iter=1000, solver='lbfgs', penalty='l2', n_jobs=8) models = [rf, xgb, lgb, svc] y_pred_self, y_prob_self = StackingModels(models=models, meta_model=lr, X_train=X_train, X_test=X_test, y_train=y_train) acc = accuracy_score(y_test, y_pred_self) auc = roc_auc_score(y_test, y_prob_self) print('MyModel: ACC = {:.6f}, AUC = {:.6f}'.format(acc, auc)) stack_clf = StackingCVClassifier(classifiers=models, meta_classifier=lr, cv=5).fit(X_train, y_train) y_pred_mxltend, y_prob_mxltend = stack_clf.predict( X_test), stack_clf.predict_proba(X_test)[:, -1] acc = accuracy_score(y_test, y_pred_mxltend) auc = roc_auc_score(y_test, y_prob_mxltend) print('Mlxtend: ACC = {:.6f}, AUC = {:.6f}'.format(acc, auc)) X, y = make_regression(n_samples=5000, n_features=20, n_informative=18, random_state=2018) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2018) X_train, X_test = map(scaler.fit_transform, [X_train, X_test]) rf = RandomForestRegressor(n_estimators=50,
ExtraTreesClassifier(n_estimators=1000, max_depth=2, n_jobs=8), ExtraTreesClassifier(n_estimators=1000, max_depth=4, n_jobs=8), ExtraTreesClassifier(n_estimators=1000, max_depth=10, n_jobs=8), ExtraTreesClassifier(n_estimators=1000, max_depth=30, n_jobs=8), ] lr = ExtraTreesClassifier(n_estimators=1000, max_depth=30, n_jobs=8) model = StackingCVClassifier(classifiers=clfList, use_probas=True, use_features_in_secondary=True, meta_classifier=lr, cv=20, random_state=15, verbose=1) model.fit(x_train, y_train) #y_pred = sclf.predict(x_test) #score(y_pred, y_test) #model = load("../models/catboost_model.pkl") y_pred = model.predict(x_val) scores = get_all_scores(y_pred, y_val) print(scores) y_pred = model.predict(x_test) scores = get_all_scores(y_pred, y_test) print(scores) probas_test = model.predict_proba(x_test) save(model, "../models/ex_stack.pkl") print("done")
validation_fraction=0.1, verbose=False, warm_start=False) lr = BaggingClassifier(LogisticRegression(random_state=RANDOM_SEED, penalty='l1', C=0.1), max_samples=0.8, max_features=0.8) sclf = StackingCVClassifier(classifiers=[clf1, clf2, clf3, clf5, clf7, clf8], use_probas=True, meta_classifier=lr) sclf.fit(train_cm_x.values, train_cm_y.values) predict_y = sclf.predict_proba(test_cm.values)[:-1] df = pd.DataFrame(predict_y) df.to_csv("predicted_y.csv") print('5-fold cross validation:\n') for clf, label in zip([clf1, clf2, clf3, clf5, clf7, clf8, sclf], [ 'KNN', 'Extreme gradient boosting', 'bagging Logistic Regression', 'Linear SVC', 'Extra Tree', 'Neural Network', 'StackingClassifier' ]): scores = cross_val_score(clf, train_cm_x.values, train_cm_y.values, cv=5, scoring='roc_auc')
def stacking_classifier(train,
                        validation,
                        refit='yes',
                        use_saved_model='no',
                        save_model='yes',
                        to_plot='yes',
                        meta_leaner_parameters=None,
                        stacking_cv_parameters=None):
    """Fit (or load) a stacked click classifier and score it on ``validation``.

    Args:
        train: DataFrame with the columns 'click', 'bidprice', 'payprice' plus
            the feature columns; 'click' is the target.
        validation: DataFrame with the same columns, used for scoring.
        refit: 'yes' to refit a loaded model on ``train``; only consulted when
            ``use_saved_model`` is not 'no'.
        use_saved_model: 'no' builds a new stack from the pickled base models
            under ``<cwd>/models``; any other value loads
            ``<cwd>/models/stacked_model.pkl``.
        save_model: 'yes' dumps the resulting model to
            ``<cwd>/models/stacked_model.pkl``.
        to_plot: 'yes' calls ``plot_ROC_curve`` on the validation predictions.
        meta_leaner_parameters: overrides for the XGBoost meta-learner; missing
            keys fall back to the defaults listed in the body.
        stacking_cv_parameters: overrides for the ``StackingCVClassifier``
            options; missing keys fall back to the defaults listed in the body.
            The 'refit' key is accepted but not used.

    Returns:
        ``(model, positive_probabilities)`` where the second item is column 1
        of ``predict_proba`` on the validation features.
    """
    # Defaults are built per call instead of living in the signature as
    # shared mutable dicts; caller-supplied keys override them.
    meta_params = {
        'max_depth': 20,
        "n_estimators": 20,
        "learning_rate": 0.05,
        'silent': False,
        'n_jobs': 3,
        'subsample': 1,
        'objective': 'binary:logistic',
        'colsample_bytree': 1,
        'eval_metric': "auc",
        'reg_alpha': 0.1,
        'reg_lambda': 0.1,
        'random_state': 500
    }
    meta_params.update(meta_leaner_parameters or {})

    stack_params = {
        'use_probas': False,
        'use_features_in_secondary': True,
        'cv': 5,
        'store_train_meta_features': True,
        'refit': True
    }
    stack_params.update(stacking_cv_parameters or {})

    models_dir = os.path.join(os.getcwd(), "models")
    stacked_model_path = os.path.join(models_dir, "stacked_model.pkl")
    non_feature_columns = ['click', 'bidprice', 'payprice']

    def _features(frame):
        # Feature matrix: everything except the target and price columns.
        return frame.drop(non_feature_columns, axis=1).values

    validation_features = _features(validation)

    if use_saved_model == 'no':
        # Only the grid-searched models that actually enter the stack are
        # loaded (the logistic, SVM, naive Bayes and neural-network pickles
        # were previously loaded and never used).
        # NOTE(review): joblib.load unpickles arbitrary objects; these files
        # must come from a trusted source.
        rf_model = joblib.load(os.path.join(models_dir, "rf_model.pkl"))
        erf_model = joblib.load(os.path.join(models_dir, "erf_model.pkl"))
        xgb_model = joblib.load(os.path.join(models_dir, "xgb_model.pkl"))

        meta_learner_keys = ('max_depth', 'n_estimators', 'learning_rate',
                             'silent', 'n_jobs', 'subsample', 'objective',
                             'colsample_bytree', 'eval_metric', 'reg_alpha',
                             'reg_lambda', 'random_state')
        meta_learner = xgboost.XGBClassifier(
            **{key: meta_params[key] for key in meta_learner_keys})

        model = StackingCVClassifier(
            classifiers=[rf_model, erf_model, xgb_model],
            meta_classifier=meta_learner,
            use_probas=stack_params['use_probas'],
            use_features_in_secondary=stack_params['use_features_in_secondary'],
            store_train_meta_features=stack_params['store_train_meta_features'],
            cv=stack_params['cv'])
        model = model.fit(_features(train), train['click'].values)
    else:
        # Load from saved files
        model = joblib.load(stacked_model_path)
        if refit == 'yes':
            model = model.fit(_features(train), train['click'].values)

    prediction = model.predict_proba(validation_features)

    # Whether to save the model
    if save_model == 'yes':
        print('Saving the stacked model to the disc.')
        joblib.dump(model, stacked_model_path, compress=9)

    positive_proba = prediction[:, 1]

    # Print scores
    print("AUC: %0.5f for Stacking Model" %
          (roc_auc_score(validation['click'], positive_proba)))

    if to_plot == 'yes':
        plot_ROC_curve(validation['click'], positive_proba)

    return model, positive_proba


####################### END ########################
def main():
    """Stack several classifiers on the holdout predictions and write a test submission.

    Steps:
        1. Load per-experiment holdout/test prediction CSVs and build feature
           matrices, with one-hot quality columns appended.
        2. Estimate the weighted AUC of every base model and of the stack with
           a 5-fold GroupKFold over the holdout images.
        3. Refit the stack on the full holdout set, predict the test set and
           write ``stacking_<mean cv auc>_<checksum>.csv`` next to this file.
    """
    output_dir = os.path.dirname(__file__)

    experiments = [
        # "A_May24_11_08_ela_skresnext50_32x4d_fold0_fp16",
        # "A_May15_17_03_ela_skresnext50_32x4d_fold1_fp16",
        # "A_May21_13_28_ela_skresnext50_32x4d_fold2_fp16",
        # "A_May26_12_58_ela_skresnext50_32x4d_fold3_fp16",
        #
        # "B_Jun05_08_49_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "B_Jun09_16_38_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "B_Jun11_08_51_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        # "B_Jun11_18_38_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        # "C_Jun24_22_00_rgb_tf_efficientnet_b2_ns_fold2_local_rank_0_fp16",
        # "D_Jun18_16_07_rgb_tf_efficientnet_b7_ns_fold1_local_rank_0_fp16",
        "D_Jun20_09_52_rgb_tf_efficientnet_b7_ns_fold2_local_rank_0_fp16",
        #
        # "E_Jun18_19_24_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "E_Jun21_10_48_rgb_tf_efficientnet_b6_ns_fold0_istego100k_local_rank_0_fp16",
        # "F_Jun29_19_43_rgb_tf_efficientnet_b3_ns_fold0_local_rank_0_fp16",
        # "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
    ]

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")
    fnames_for_checksum = [x + "cauc" for x in experiments]
    checksum = compute_checksum_v2(fnames_for_checksum)

    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    # ndarray so the ids can be indexed with the fold index arrays below
    image_ids = np.asarray([fs.id_from_fname(x) for x in holdout_ds.images])

    quality_h = F.one_hot(torch.tensor(holdout_ds.quality).long(), 3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    quality_t = F.one_hot(torch.tensor(test_ds.quality).long(), 3).numpy().astype(np.float32)

    x, y = get_x_y(holdout_predictions)
    print(x.shape, y.shape)
    y = np.asarray(y)

    x_test, _ = get_x_y(test_predictions)
    print(x_test.shape)

    # Manual experiment toggles, left exactly as found.
    if True:
        sc = StandardScaler()
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if False:
        sc = PCA(n_components=16)
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if True:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    group_kfold = GroupKFold(n_splits=5)

    df = pd.read_csv(test_predictions[0]).rename(columns={"image_id": "Id"})
    auc_cv = []

    classifier1 = LGBMClassifier()
    classifier2 = CatBoostClassifier()
    classifier3 = LogisticRegression()
    classifier4 = CalibratedClassifierCV()
    classifier5 = LinearDiscriminantAnalysis()

    sclf = StackingCVClassifier(
        classifiers=[
            classifier1, classifier2, classifier3, classifier4, classifier5
        ],
        shuffle=False,
        use_probas=True,
        cv=4,
        # meta_classifier=SVC(degree=2, probability=True),
        meta_classifier=LogisticRegression(solver="lbfgs"),
    )
    # NOTE(review): with an integer `cv` the inner splitter may ignore
    # `groups`; pass a GroupKFold instance as `cv` if grouped inner folds
    # are required - confirm against the mlxtend documentation.

    base_names = [
        "LGBMClassifier",
        "CatBoostClassifier",
        "LogisticRegression",
        "CalibratedClassifierCV",
        "LinearDiscriminantAnalysis",
    ]

    # Grouped out-of-fold evaluation. The previous code scored on undefined
    # x_valid / y_valid and on the unfitted original estimators; the fitted
    # copies live in `sclf.clfs_` (the stack fits clones of its inputs).
    for fold, (train_idx, valid_idx) in enumerate(
            group_kfold.split(x, y, groups=image_ids)):
        x_valid, y_valid = x[valid_idx], y[valid_idx]
        sclf.fit(x[train_idx], y[train_idx], groups=image_ids[train_idx])

        classifiers = dict(zip(base_names, sclf.clfs_))
        classifiers["Stack"] = sclf

        print("Fold", fold)
        for key, clf in classifiers.items():
            y_pred = clf.predict_proba(x_valid)[:, 1]
            score = alaska_weighted_auc(y_valid, y_pred)
            print(key, score)
            if key == "Stack":
                # Only the stack's score goes into the CV mean in the file name
                auc_cv.append(score)

    # Final model: refit on the full holdout set before predicting the test set
    sclf.fit(x, y, groups=image_ids)

    # Making prediction on test set
    y_test = sclf.predict_proba(x_test)[:, 1]

    df["Label"] = y_test
    df.to_csv(os.path.join(output_dir, f"stacking_{np.mean(auc_cv):.4f}_{checksum}.csv"), index=False)
# Method 2: stack XGBoost, random forest and extra trees under a logistic-regression meta-classifier
clf1 = XGBClassifier(
    learning_rate=0.5,
    n_estimators=300,
    max_depth=5,
    gamma=0,
    subsample=0.8,
    verbose=1,
)
#clf1 = XGBClassifier(learning_rate =0.5,n_estimators=300,max_depth=5,gamma=0,subsample=0.8)
clf2 = RandomForestClassifier(n_jobs=-1, n_estimators=35, criterion="entropy")
clf3 = ExtraTreesClassifier(n_jobs=-1, n_estimators=5, criterion="entropy")
lr = LogisticRegression(n_jobs=-1, C=8)

sclf = StackingClassifier(
    classifiers=[clf1, clf2, clf3],
    meta_classifier=lr,
    verbose=100,
)

# Optional per-model cross-validation, currently disabled:
# for clf, label in zip([clf1, clf2, clf3, sclf],
#                       ['XGBoost',
#                        'Random Forest',
#                        'Extra Tree',
#                        'StackingClassifier']):
#     scores = model_selection.cross_val_score(clf, X_train, y_train, cv=3, scoring='accuracy')
#     print("Accuracy: %0.2f (+/- %0.2f) [%s]"
#           % (scores.mean(), scores.std(), label))

sclf.fit(X_train, y_train)
print("training finished")

# Score with column 1 of predict_proba rather than hard labels
#y_pre = sclf.predict(X_test)
y_pre = sclf.predict_proba(X_test)[:, 1]
print("roc:{0:.3f}".format(roc_auc_score(y_test, y_pre)))