# Module-level imports assumed by the functions below; the original import
# block was not included in this section of the file.
import gc

import catboost as ctb
import lightgbm as lgb
import ml_metrics
import numpy as np
import pandas as pd
import xgboost as xgb
from scipy import stats
from sklearn import metrics
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC


def stacking_model_lgb_gbt(task='together'):
    nfold = 5
    train_df = None
    test_df = None
    if task == 'together':
        train_df = pd.read_csv('./data/train_df_day_night_together.csv')
        test_df = pd.read_csv('./data/test_df_day_night_together.csv')
        from together_fn_param import list_param
    elif task == 'split':
        train_df = pd.read_csv('./data/train_df_day_night_split.csv')
        test_df = pd.read_csv('./data/test_df_day_night_split.csv')
        from split_fn_param import list_param
    train_df = train_df.fillna(-1)
    test_df = test_df.fillna(-1)
    print("Data loading Done!")

    target = 'label'
    # Every column except the leading bird_id and the trailing label.
    predictors = train_df.columns.values.tolist()[1:-1]
    categorical = None
    gc.collect()

    # LightGBM
    X_train = train_df[predictors].values
    labels = train_df['label']

    def xg_f1(preds, train_data):
        # Custom eval: find the threshold that maximizes F1 along the
        # precision-recall curve, then return the negated optimal F1
        # (negated so that lower is better, with is_higher_better=False).
        y = train_data.get_label()
        pre, rec, th = metrics.precision_recall_curve(y, preds)
        f1_all = 2 / ((1 / rec) + (1 / pre))
        optimal_idx = np.argmax(f1_all)
        optimal_thresholds = th[optimal_idx]
        y_bin = [1. if y_cont > optimal_thresholds else 0.
                 for y_cont in preds]  # binarize the output
        tn, fp, fn, tp = confusion_matrix(y, y_bin).ravel()
        specificity = tn / (tn + fp)
        sensitivity = tp / (tp + fn)
        optimal_f1 = np.nanmax(f1_all)
        return 'f1', -optimal_f1, False

    xg_train = lgb.Dataset(train_df[predictors].values,
                           label=train_df[target].values,
                           feature_name=predictors)

    seeds = np.random.randint(5000, 10000, size=10).tolist()
    auc_lst = []
    auc_lst1 = []
    n_estimators_lst = []
    stratified = True
    debug = True
    param = list_param('lgb_gbdt')
    oof_preds_folds = np.zeros((train_df.shape[0], len(seeds)))
    sub_preds_folds = np.zeros((test_df.shape[0], len(seeds)))
    sub_preds_folds_vote = np.zeros((test_df.shape[0], len(seeds)))
    oof_preds_folds_vote = np.zeros((train_df.shape[0], len(seeds)))
    feature_importance_df_folds = pd.DataFrame()
    list_thresholds_global = []

    for seed_id in range(len(seeds)):
        if stratified:
            folds = StratifiedKFold(n_splits=nfold, shuffle=True,
                                    random_state=seeds[seed_id])
        else:
            folds = KFold(n_splits=nfold, shuffle=True, random_state=1001)
        oof_preds = np.zeros(train_df.shape[0])
        sub_preds = np.zeros(test_df.shape[0])
        oof_preds_local_vote = np.zeros(train_df.shape[0])
        sub_preds_local_vote = np.zeros((test_df.shape[0], nfold))
        feature_importance_df = pd.DataFrame()
        gfold_Id = list(folds.split(X_train, labels))

        params_iter = {
            'max_bin': 63,  # fixed, int
            'save_binary': True,  # fixed
            'seed': seeds[seed_id],
            'feature_fraction_seed': seeds[seed_id],
            'bagging_seed': seeds[seed_id],
            'drop_seed': seeds[seed_id],
            'data_random_seed': seeds[seed_id],
            'objective': 'binary',
            'boosting_type': 'gbdt',
            'verbose': 1,
            'metric': 'auc',
        }
        param.update(params_iter)

        # CV with early stopping to pick the boosting-round count
        # (early_stopping_rounds/verbose_eval follow the older LightGBM API).
        bst1 = lgb.cv(param, xg_train, num_boost_round=5000,
                      early_stopping_rounds=50, folds=gfold_Id)
        res0 = pd.DataFrame(bst1)
        n_estimators = res0.shape[0]

        for n_fold, (train_idx, valid_idx) in enumerate(
                folds.split(X_train, labels)):
            xgg_train = lgb.Dataset(data=train_df[predictors].iloc[train_idx],
                                    label=train_df[target].iloc[train_idx],
                                    free_raw_data=False, silent=True)
            xgg_valid = lgb.Dataset(data=train_df[predictors].iloc[valid_idx],
                                    label=train_df[target].iloc[valid_idx],
                                    free_raw_data=False, silent=True)
            clf = lgb.train(
                param,
                xgg_train,
                num_boost_round=n_estimators,
                # fobj=loglikelood,
                # feval=binary_error,
                verbose_eval=1,
            )
            oof_preds[valid_idx] = clf.predict(xgg_valid.data)
            pred = clf.predict(test_df[predictors])
            sub_preds += pred / folds.n_splits

            # Pick the cutoff maximizing Youden's J (TPR - FPR) on this fold
            # and use it to cast hard 0/1 votes.
            fpr, tpr, thresholds = metrics.roc_curve(xgg_valid.label,
                                                     oof_preds[valid_idx])
            optimal_idx = np.argmax(tpr - fpr)
            optimal_thresholds = thresholds[optimal_idx]
            list_thresholds_global.append(optimal_thresholds)
            sub_preds_local_vote[:, n_fold] = [
                1 if y_cont > optimal_thresholds else 0 for y_cont in pred
            ]
            oof_preds_local_vote[valid_idx] = [
                1 if y_cont > optimal_thresholds else 0
                for y_cont in oof_preds[valid_idx]
            ]

            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = clf.feature_name()
            fold_importance_df["importance"] = clf.feature_importance(
                importance_type='gain')
            fold_importance_df = fold_importance_df.fillna(value=0)
            fold_importance_df = fold_importance_df.sort_values(
                'importance', ascending=False)
            fold_importance_df["fold"] = n_fold + 1
            fold_importance_df["seed"] = 'seed_' + str(seeds[seed_id])
            feature_importance_df = pd.concat(
                [feature_importance_df, fold_importance_df], axis=0)
            print('Fold %2d AUC : %.6f' %
                  (n_fold + 1,
                   roc_auc_score(xgg_valid.label, oof_preds[valid_idx])))
            del clf, xgg_train, xgg_valid
            gc.collect()

        oof_preds_folds[:, seed_id] = oof_preds
        sub_preds_folds[:, seed_id] = sub_preds
        # Majority vote across the nfold hard predictions for this seed.
        a, b = stats.mode(sub_preds_local_vote, axis=1)
        oof_preds_folds_vote[:, seed_id] = oof_preds_local_vote
        sub_preds_folds_vote[:, seed_id] = a.reshape(-1)
        feature_importance_df_folds = pd.concat(
            [feature_importance_df_folds, feature_importance_df], axis=0)
        auc_lst.append(ml_metrics.auc(xg_train.label, oof_preds))
        auc_lst1.append(roc_auc_score(xg_train.label, oof_preds))
        print('Full AUC score %.6f' % roc_auc_score(xg_train.label, oof_preds))

    print("auc_lst1")
    print(auc_lst1)
    print(list_thresholds_global)

    #oof_preds_folds = pd.DataFrame(oof_preds_folds, columns=['lgb_gbt_seed_' + str(seeds[l]) for l in range(len(seeds))])
    #sub_preds_folds = pd.DataFrame(sub_preds_folds, columns=['lgb_gbt_seed_' + str(seeds[l]) for l in range(len(seeds))])
    oof_preds_folds_vote = pd.DataFrame(
        oof_preds_folds_vote,
        columns=['lgb_gbt_seed_' + str(seeds[l]) for l in range(len(seeds))])
    sub_preds_folds_vote = pd.DataFrame(
        sub_preds_folds_vote,
        columns=['lgb_gbt_seed_' + str(seeds[l]) for l in range(len(seeds))])
    #oof_preds_folds.to_csv("./output/" + task + "_train_stack/lgb_gbt.csv", index=False)
    #sub_preds_folds.to_csv("./output/" + task + "_test_stack/lgb_gbt.csv", index=False)
    oof_preds_folds_vote.to_csv(
        "./output/" + task + "_train_stack_vote/lgb_gbt.csv", index=False)
    sub_preds_folds_vote.to_csv(
        "./output/" + task + "_test_stack_vote/lgb_gbt.csv", index=False)
    feature_importance_df_folds = feature_importance_df_folds.sort_values(
        'importance', ascending=False)
    feature_importance_df_folds.to_csv(
        "./output/" + task + "_feature/lgb_gbt.csv", index=False)
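# All four stacking functions in this file reuse the same cutoff rule: take
# the ROC threshold that maximizes Youden's J statistic (TPR - FPR) on the
# validation fold, then binarize the scores with it. A minimal sketch of that
# step as a standalone helper (hypothetical name, not called by the pipeline
# above; relies on the module-level numpy/sklearn imports):
def youden_threshold(y_true, y_score):
    """Return the ROC threshold that maximizes TPR - FPR (Youden's J)."""
    fpr, tpr, thresholds = metrics.roc_curve(y_true, y_score)
    return thresholds[np.argmax(tpr - fpr)]

# Example: hard-label a fold's predictions with its own optimal cutoff.
# t = youden_threshold(y_valid, scores)
# hard = (scores > t).astype(int)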
def stacking_model_xgb_rank(task='together'):
    nfold = 5
    train_df = None
    test_df = None
    if task == 'together':
        train_df = pd.read_csv('./data/train_df_day_night_together.csv')
        test_df = pd.read_csv('./data/test_df_day_night_together.csv')
        from together_fn_param import list_param
    elif task == 'split':
        train_df = pd.read_csv('./data/train_df_day_night_split.csv')
        test_df = pd.read_csv('./data/test_df_day_night_split.csv')
        from split_fn_param import list_param
    train_df = train_df.fillna(-1)
    test_df = test_df.fillna(-1)
    print("Data loading Done!")

    X_train = train_df.drop(['bird_id', 'label'], axis=1)
    features = X_train.columns
    labels = train_df['label']
    X_train = X_train.fillna(-1)
    y_train = np.int32(labels)
    X_test = test_df.drop(['bird_id', 'label'], axis=1)
    X_test = X_test.fillna(-1)

    # XGBoost
    xg_train = xgb.DMatrix(X_train, label=y_train, missing=-1.0)
    xg_test = xgb.DMatrix(X_test, missing=-1.0)

    def xg_f1(yhat, dtrain):
        # Custom eval (XGBoost signature): negated optimal F1 so that lower
        # is better, matching maximize=False in xgb.cv below.
        y = dtrain.get_label()
        pre, rec, th = metrics.precision_recall_curve(y, yhat)
        f1_all = 2 / ((1 / rec) + (1 / pre))
        optimal_idx = np.argmax(f1_all)
        optimal_thresholds = th[optimal_idx]
        y_bin = [1. if y_cont > optimal_thresholds else 0.
                 for y_cont in yhat]  # binarize the output
        tn, fp, fn, tp = confusion_matrix(y, y_bin).ravel()
        specificity = tn / (tn + fp)
        sensitivity = tp / (tp + fn)
        optimal_f1 = np.nanmax(f1_all)
        return 'f1', -optimal_f1

    seeds = np.random.randint(5000, 10000, size=10).tolist()
    auc_lst = []
    auc_lst1 = []
    n_estimators_lst = []
    stratified = True
    debug = True
    param = list_param('xgb_rank')
    oof_preds_folds = np.zeros((train_df.shape[0], len(seeds)))
    sub_preds_folds = np.zeros((test_df.shape[0], len(seeds)))
    sub_preds_folds_vote = np.zeros((test_df.shape[0], len(seeds)))
    oof_preds_folds_vote = np.zeros((train_df.shape[0], len(seeds)))
    feature_importance_df_folds = pd.DataFrame()
    list_thresholds_global = []

    for seed_id in range(len(seeds)):
        if stratified:
            folds = StratifiedKFold(n_splits=nfold, shuffle=True,
                                    random_state=seeds[seed_id])
        else:
            folds = KFold(n_splits=nfold, shuffle=True, random_state=1001)
        oof_preds = np.zeros(train_df.shape[0])
        sub_preds = np.zeros(test_df.shape[0])
        oof_preds_local_vote = np.zeros(train_df.shape[0])
        sub_preds_local_vote = np.zeros((test_df.shape[0], nfold))
        feature_importance_df = pd.DataFrame()
        gfold_Id = list(folds.split(X_train, labels))

        params_iter = {
            'seed': seeds[seed_id],
            'objective': 'rank:pairwise',
            'silent': False,
        }
        param.update(params_iter)

        # CV to pick the boosting-round count (the print_evaluation and
        # early_stop callbacks follow the pre-1.6 XGBoost API).
        res = xgb.cv(param, xg_train, num_boost_round=5000,
                     folds=gfold_Id, feval=xg_f1, metrics={'auc'},
                     stratified=True, maximize=False, verbose_eval=50,
                     callbacks=[
                         xgb.callback.print_evaluation(show_stdv=True),
                         xgb.callback.early_stop(50)
                     ])
        n_estimators = res.shape[0]

        for n_fold, (train_idx, valid_idx) in enumerate(
                folds.split(X_train, labels)):
            xgg_train = X_train.iloc[train_idx]
            xgg_valid = X_train.iloc[valid_idx]
            ygg_train = labels.iloc[train_idx]
            ygg_valid = labels.iloc[valid_idx]
            xgg_train = xgb.DMatrix(xgg_train, label=ygg_train, missing=-1.0)
            xgg_valid = xgb.DMatrix(xgg_valid, missing=-1.0)
            clf = xgb.train(param, xgg_train, num_boost_round=n_estimators,
                            verbose_eval=1)
            oof_preds[valid_idx] = clf.predict(xgg_valid)
            pred = clf.predict(xg_test)
            sub_preds += pred / folds.n_splits

            # Youden's J cutoff on the ranking scores, then hard 0/1 votes.
            fpr, tpr, thresholds = metrics.roc_curve(ygg_valid,
                                                     oof_preds[valid_idx])
            optimal_idx = np.argmax(tpr - fpr)
            optimal_thresholds = thresholds[optimal_idx]
            list_thresholds_global.append(optimal_thresholds)
            sub_preds_local_vote[:, n_fold] = [
                1 if y_cont > optimal_thresholds else 0 for y_cont in pred
            ]
            oof_preds_local_vote[valid_idx] = [
                1 if y_cont > optimal_thresholds else 0
                for y_cont in oof_preds[valid_idx]
            ]

            # get_score() only reports features that were actually used, so
            # merge against the full feature list and zero-fill the rest.
            fold_raw_importance = pd.DataFrame(
                list(clf.get_score(importance_type='gain').items()),
                columns=['feature', 'importance']).sort_values(
                    'importance', ascending=False)
            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = features
            fold_importance_df = pd.merge(fold_importance_df,
                                          fold_raw_importance,
                                          on='feature', how='left')
            fold_importance_df = fold_importance_df.fillna(value=0)
            fold_importance_df = fold_importance_df.sort_values(
                'importance', ascending=False)
            fold_importance_df["fold"] = n_fold + 1
            fold_importance_df["seed"] = 'seed_' + str(seeds[seed_id])
            feature_importance_df = pd.concat(
                [feature_importance_df, fold_importance_df], axis=0)
            print('Fold %2d AUC : %.6f' %
                  (n_fold + 1,
                   roc_auc_score(ygg_valid, oof_preds[valid_idx])))
            del clf, xgg_train, xgg_valid, ygg_train, ygg_valid
            gc.collect()

        oof_preds_folds[:, seed_id] = oof_preds
        sub_preds_folds[:, seed_id] = sub_preds
        a, b = stats.mode(sub_preds_local_vote, axis=1)
        oof_preds_folds_vote[:, seed_id] = oof_preds_local_vote
        sub_preds_folds_vote[:, seed_id] = a.reshape(-1)
        feature_importance_df_folds = pd.concat(
            [feature_importance_df_folds, feature_importance_df], axis=0)
        auc_lst.append(ml_metrics.auc(y_train, oof_preds))
        auc_lst1.append(roc_auc_score(y_train, oof_preds))
        print('Full AUC score %.6f' % roc_auc_score(y_train, oof_preds))

    print("auc_lst1")
    print(auc_lst1)
    print(list_thresholds_global)

    #oof_preds_folds = pd.DataFrame(oof_preds_folds, columns=['xgb_rank_seed_' + str(seeds[l]) for l in range(len(seeds))])
    #sub_preds_folds = pd.DataFrame(sub_preds_folds, columns=['xgb_rank_seed_' + str(seeds[l]) for l in range(len(seeds))])
    oof_preds_folds_vote = pd.DataFrame(
        oof_preds_folds_vote,
        columns=['xgb_rank_seed_' + str(seeds[l]) for l in range(len(seeds))])
    sub_preds_folds_vote = pd.DataFrame(
        sub_preds_folds_vote,
        columns=['xgb_rank_seed_' + str(seeds[l]) for l in range(len(seeds))])
    #oof_preds_folds.to_csv("./output/" + task + "_train_stack/xgb_rank.csv", index=False)
    #sub_preds_folds.to_csv("./output/" + task + "_test_stack/xgb_rank.csv", index=False)
    oof_preds_folds_vote.to_csv(
        "./output/" + task + "_train_stack_vote/xgb_rank.csv", index=False)
    sub_preds_folds_vote.to_csv(
        "./output/" + task + "_test_stack_vote/xgb_rank.csv", index=False)
    feature_importance_df_folds = feature_importance_df_folds.sort_values(
        'importance', ascending=False)
    feature_importance_df_folds.to_csv(
        "./output/" + task + "_feature/xgb_rank.csv", index=False)
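# Each seed's test-set column above is a per-row majority vote over the nfold
# hard predictions, via scipy.stats.mode. A small illustration of that
# aggregation step (hypothetical helper mirroring the inline code):
def majority_vote(votes):
    # votes: (n_samples, nfold) array of 0/1 fold votes.
    majority, _counts = stats.mode(votes, axis=1)
    return majority.reshape(-1)

# majority_vote(np.array([[1, 1, 0], [0, 1, 0]]))  # -> array([1, 0])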
def stacking_model_sk_svc(task='together'):
    nfold = 5
    train_df = None
    test_df = None
    if task == 'together':
        train_df = pd.read_csv('./data/train_df_day_night_together.csv')
        test_df = pd.read_csv('./data/test_df_day_night_together.csv')
        from together_fn_param import list_param
    elif task == 'split':
        train_df = pd.read_csv('./data/train_df_day_night_split.csv')
        test_df = pd.read_csv('./data/test_df_day_night_split.csv')
        from split_fn_param import list_param
    train_df = train_df.fillna(-1)
    test_df = test_df.fillna(-1)
    print("Data loading Done!")

    target = 'label'
    predictors = train_df.columns.values.tolist()[1:-1]
    categorical = None
    X_train = train_df[predictors].values
    X_test = test_df[predictors].values
    labels = train_df['label']

    # Fit the scaler on train only and reuse it for test to avoid leakage.
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    seeds = np.random.randint(5000, 10000, size=10).tolist()
    auc_lst = []
    auc_lst1 = []
    n_estimators_lst = []
    stratified = True
    debug = True
    param = list_param('sk_svc')
    oof_preds_folds = np.zeros((train_df.shape[0], len(seeds)))
    sub_preds_folds = np.zeros((test_df.shape[0], len(seeds)))
    sub_preds_folds_vote = np.zeros((test_df.shape[0], len(seeds)))
    oof_preds_folds_vote = np.zeros((train_df.shape[0], len(seeds)))
    feature_importance_df_folds = pd.DataFrame()
    list_thresholds_global = []

    for seed_id in range(len(seeds)):
        if stratified:
            folds = StratifiedKFold(n_splits=nfold, shuffle=True,
                                    random_state=seeds[seed_id])
        else:
            folds = KFold(n_splits=nfold, shuffle=True, random_state=1001)
        oof_preds = np.zeros(train_df.shape[0])
        sub_preds = np.zeros(test_df.shape[0])
        oof_preds_local_vote = np.zeros(train_df.shape[0])
        sub_preds_local_vote = np.zeros((test_df.shape[0], nfold))
        feature_importance_df = pd.DataFrame()
        gfold_Id = list(folds.split(X_train, labels))

        params_iter = {'random_state': seeds[seed_id]}
        param.update(params_iter)
        clf = SVC(
            C=param['C'],
            kernel='rbf',
            gamma=param['gamma'],
            shrinking=True,
            probability=True,
            tol=param['tol'],  # 0.001; maybe 0.0001 as the stopping criterion
            max_iter=int(param['max_iter']),
            verbose=False,
            decision_function_shape='ovr',
            random_state=seeds[seed_id])

        for n_fold, (train_idx, valid_idx) in enumerate(
                folds.split(X_train, labels)):
            xtrain, xtest = X_train[train_idx, :], X_train[valid_idx, :]
            ytrain, ytest = labels.iloc[train_idx], labels.iloc[valid_idx]
            clf.fit(xtrain, ytrain)
            oof_preds[valid_idx] = clf.predict_proba(xtest)[:, 1]
            pred = clf.predict_proba(X_test)[:, 1]
            sub_preds += pred / folds.n_splits

            fpr, tpr, thresholds = metrics.roc_curve(
                train_df[target].iloc[valid_idx], oof_preds[valid_idx])
            optimal_idx = np.argmax(tpr - fpr)
            optimal_thresholds = thresholds[optimal_idx]
            list_thresholds_global.append(optimal_thresholds)
            sub_preds_local_vote[:, n_fold] = [
                1 if y_cont > optimal_thresholds else 0 for y_cont in pred
            ]
            oof_preds_local_vote[valid_idx] = [
                1 if y_cont > optimal_thresholds else 0
                for y_cont in oof_preds[valid_idx]
            ]
            print('Fold %2d AUC : %.6f' %
                  (n_fold + 1, roc_auc_score(ytest, oof_preds[valid_idx])))
            del xtrain, xtest, ytrain, ytest
            gc.collect()

        oof_preds_folds[:, seed_id] = oof_preds
        sub_preds_folds[:, seed_id] = sub_preds
        a, b = stats.mode(sub_preds_local_vote, axis=1)
        oof_preds_folds_vote[:, seed_id] = oof_preds_local_vote
        sub_preds_folds_vote[:, seed_id] = a.reshape(-1)
        # SVC exposes no feature importances, so this frame stays empty.
        feature_importance_df_folds = pd.concat(
            [feature_importance_df_folds, feature_importance_df], axis=0)
        auc_lst.append(ml_metrics.auc(train_df[target], oof_preds))
        auc_lst1.append(roc_auc_score(train_df[target], oof_preds))
        print('Full AUC score %.6f' %
              roc_auc_score(train_df[target], oof_preds))

    print("auc_lst1")
    print(auc_lst1)
    print(list_thresholds_global)

    #oof_preds_folds = pd.DataFrame(oof_preds_folds, columns=['sk_svc_seed_' + str(seeds[l]) for l in range(len(seeds))])
    #sub_preds_folds = pd.DataFrame(sub_preds_folds, columns=['sk_svc_seed_' + str(seeds[l]) for l in range(len(seeds))])
    oof_preds_folds_vote = pd.DataFrame(
        oof_preds_folds_vote,
        columns=['sk_svc_seed_' + str(seeds[l]) for l in range(len(seeds))])
    sub_preds_folds_vote = pd.DataFrame(
        sub_preds_folds_vote,
        columns=['sk_svc_seed_' + str(seeds[l]) for l in range(len(seeds))])
    #oof_preds_folds.to_csv("../" + task + "_train_stack/sk_svc.csv", index=False)
    #sub_preds_folds.to_csv("../" + task + "_test_stack/sk_svc.csv", index=False)
    oof_preds_folds_vote.to_csv(
        "./output/" + task + "_train_stack_vote/sk_svc.csv", index=False)
    sub_preds_folds_vote.to_csv(
        "./output/" + task + "_test_stack_vote/sk_svc.csv", index=False)
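# Note: the SVC above obtains predict_proba via probability=True, which fits
# an internal Platt calibration and scales poorly with fold size. A hedged
# alternative sketch (not part of the original pipeline): wrap an
# uncalibrated SVC in sklearn's CalibratedClassifierCV instead.
from sklearn.calibration import CalibratedClassifierCV

def calibrated_svc(C, gamma, tol, max_iter, seed):
    # Hypothetical drop-in for the clf constructed above.
    base = SVC(C=C, kernel='rbf', gamma=gamma, shrinking=True,
               probability=False, tol=tol, max_iter=int(max_iter),
               random_state=seed)
    return CalibratedClassifierCV(base, method='sigmoid', cv=3)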
def stacking_model_cat(task='together'):
    nfold = 5
    train_df = None
    test_df = None
    if task == 'together':
        train_df = pd.read_csv('./data/train_df_day_night_together.csv')
        test_df = pd.read_csv('./data/test_df_day_night_together.csv')
        from together_fn_param import list_param
    elif task == 'split':
        train_df = pd.read_csv('./data/train_df_day_night_split.csv')
        test_df = pd.read_csv('./data/test_df_day_night_split.csv')
        from split_fn_param import list_param
    train_df = train_df.fillna(-1)
    test_df = test_df.fillna(-1)
    print("Data loading Done!")

    target = 'label'
    predictors = train_df.columns.values.tolist()[1:-1]
    categorical = None
    X_train = train_df.drop(['bird_id', 'label'], axis=1)
    labels = train_df['label']

    # CatBoost
    seeds = np.random.randint(5000, 10000, size=10).tolist()
    auc_lst = []
    auc_lst1 = []
    n_estimators_lst = []
    stratified = True
    debug = True
    param = list_param('cat')
    oof_preds_folds = np.zeros((train_df.shape[0], len(seeds)))
    sub_preds_folds = np.zeros((test_df.shape[0], len(seeds)))
    sub_preds_folds_vote = np.zeros((test_df.shape[0], len(seeds)))
    oof_preds_folds_vote = np.zeros((train_df.shape[0], len(seeds)))
    feature_importance_df_folds = pd.DataFrame()
    list_thresholds_global = []

    for seed_id in range(len(seeds)):
        if stratified:
            folds = StratifiedKFold(n_splits=nfold, shuffle=True,
                                    random_state=seeds[seed_id])
        else:
            folds = KFold(n_splits=nfold, shuffle=True, random_state=1001)
        oof_preds = np.zeros(train_df.shape[0])
        sub_preds = np.zeros(test_df.shape[0])
        oof_preds_local_vote = np.zeros(train_df.shape[0])
        sub_preds_local_vote = np.zeros((test_df.shape[0], nfold))
        feature_importance_df = pd.DataFrame()
        gfold_Id = list(folds.split(X_train, labels))

        params_iter = {
            'iterations': 5000,  # int
            'border_count': 128,  # (128) 1 - 255
            'bootstrap_type': 'Bernoulli',
            'loss_function': 'Logloss',
            'eval_metric': 'F1',  # 'AUC',
            'od_type': 'Iter',
            'allow_writing_files': False,
            'early_stopping_rounds': 50,
            'custom_metric': ['AUC'],
            'random_seed': seeds[seed_id],
            'use_best_model': True
        }
        param.update(params_iter)

        pool = ctb.Pool(train_df[predictors], train_df[target])
        bst1 = ctb.cv(pool=pool, params=param, fold_count=10,
                      partition_random_seed=seeds[seed_id], stratified=True)
        res0 = pd.DataFrame(bst1)
        # Round count with the best mean CV F1 (+1: iterations are 0-based).
        n_estimators = res0['test-F1-mean'].argmax() + 1
        params_iter2 = {
            'iterations': n_estimators,
        }
        param.update(params_iter2)

        for n_fold, (train_idx, valid_idx) in enumerate(
                folds.split(X_train, labels)):
            # use_best_model needs an eval set, which ctb.train is not given.
            if 'use_best_model' in param:
                del param['use_best_model']
            pool_0 = ctb.Pool(train_df[predictors].iloc[train_idx],
                              train_df[target].iloc[train_idx])
            clf = ctb.train(pool=pool_0, params=param)
            oof_preds[valid_idx] = clf.predict(
                train_df[predictors].iloc[valid_idx],
                prediction_type='Probability')[:, 1]
            pred = clf.predict(test_df[predictors],
                               prediction_type='Probability')[:, 1]
            sub_preds += pred / folds.n_splits

            fpr, tpr, thresholds = metrics.roc_curve(
                train_df[target].iloc[valid_idx], oof_preds[valid_idx])
            optimal_idx = np.argmax(tpr - fpr)
            optimal_thresholds = thresholds[optimal_idx]
            list_thresholds_global.append(optimal_thresholds)
            sub_preds_local_vote[:, n_fold] = [
                1 if y_cont > optimal_thresholds else 0 for y_cont in pred
            ]
            oof_preds_local_vote[valid_idx] = [
                1 if y_cont > optimal_thresholds else 0
                for y_cont in oof_preds[valid_idx]
            ]

            fold_importance_df = pd.DataFrame(
                list(
                    zip(train_df[predictors].iloc[train_idx].dtypes.index,
                        clf.get_feature_importance(pool_0))),
                columns=['feature', 'importance'])
            fold_importance_df = fold_importance_df.sort_values(
                by='importance', ascending=False, inplace=False,
                kind='quicksort', na_position='last')
            fold_importance_df["fold"] = n_fold + 1
            fold_importance_df["seed"] = 'seed_' + str(seeds[seed_id])
            feature_importance_df = pd.concat(
                [feature_importance_df, fold_importance_df], axis=0)
            print('Fold %2d AUC : %.6f' %
                  (n_fold + 1,
                   roc_auc_score(train_df[target].iloc[valid_idx],
                                 oof_preds[valid_idx])))
            del clf, pool_0
            gc.collect()

        oof_preds_folds[:, seed_id] = oof_preds
        sub_preds_folds[:, seed_id] = sub_preds
        a, b = stats.mode(sub_preds_local_vote, axis=1)
        oof_preds_folds_vote[:, seed_id] = oof_preds_local_vote
        sub_preds_folds_vote[:, seed_id] = a.reshape(-1)
        feature_importance_df_folds = pd.concat(
            [feature_importance_df_folds, feature_importance_df], axis=0)
        auc_lst.append(ml_metrics.auc(train_df[target], oof_preds))
        auc_lst1.append(roc_auc_score(train_df[target], oof_preds))
        print('Full AUC score %.6f' %
              roc_auc_score(train_df[target], oof_preds))

    print("auc_lst1")
    print(auc_lst1)
    print(list_thresholds_global)

    #oof_preds_folds = pd.DataFrame(oof_preds_folds, columns=['cat_seed_' + str(seeds[l]) for l in range(len(seeds))])
    #sub_preds_folds = pd.DataFrame(sub_preds_folds, columns=['cat_seed_' + str(seeds[l]) for l in range(len(seeds))])
    oof_preds_folds_vote = pd.DataFrame(
        oof_preds_folds_vote,
        columns=['cat_seed_' + str(seeds[l]) for l in range(len(seeds))])
    sub_preds_folds_vote = pd.DataFrame(
        sub_preds_folds_vote,
        columns=['cat_seed_' + str(seeds[l]) for l in range(len(seeds))])
    #oof_preds_folds.to_csv("./output/" + task + "_train_stack/cat.csv", index=False)
    #sub_preds_folds.to_csv("./output/" + task + "_test_stack/cat.csv", index=False)
    oof_preds_folds_vote.to_csv(
        "./output/" + task + "_train_stack_vote/cat.csv", index=False)
    sub_preds_folds_vote.to_csv(
        "./output/" + task + "_test_stack_vote/cat.csv", index=False)
    feature_importance_df_folds = feature_importance_df_folds.sort_values(
        'importance', ascending=False)
    feature_importance_df_folds.to_csv(
        "./output/" + task + "_feature/cat.csv", index=False)
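# Minimal driver sketch, assuming the ./data CSVs exist and the ./output
# subdirectories (<task>_train_stack_vote, <task>_test_stack_vote,
# <task>_feature) have been created beforehand:
if __name__ == '__main__':
    for task in ('together', 'split'):
        stacking_model_lgb_gbt(task=task)
        stacking_model_xgb_rank(task=task)
        stacking_model_sk_svc(task=task)
        stacking_model_cat(task=task)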