def stacking_model_lgb_gbt(task='together'):
    """Write vote-based stacking features from seed-bagged LightGBM models.

    The train/test CSV pair selected by ``task`` is loaded and missing
    values are replaced with -1.  For each of 10 randomly drawn seeds:

    1. ``lgb.cv`` (AUC metric, early stopping after 50 rounds) chooses the
       number of boosting rounds on a 5-fold split;
    2. one booster per fold is trained with that round count;
    3. out-of-fold and test predictions are binarised with the fold's ROC
       threshold that maximises ``tpr - fpr``;
    4. the per-row mode of the fold votes becomes the test vote of the seed.

    Files written (one column per seed for the vote files):
        ./output/<task>_train_stack_vote/lgb_gbt.csv
        ./output/<task>_test_stack_vote/lgb_gbt.csv
        ./output/<task>_feature/lgb_gbt.csv  (gain importance per fold/seed)

    Seeds come from ``np.random.randint`` without a fixed state, so results
    differ between runs unless the caller seeds NumPy beforehand.

    Args:
        task: ``'together'`` or ``'split'``; selects the CSV pair under
            ``./data`` and the module ``list_param`` is imported from.

    Raises:
        ValueError: If ``task`` is not one of the supported values.
    """
    from scipy import stats

    nfold = 5
    stratified = True

    if task == 'together':
        train_df = pd.read_csv('./data/train_df_day_night_together.csv')
        test_df = pd.read_csv('./data/test_df_day_night_together.csv')
        from together_fn_param import list_param
    elif task == 'split':
        train_df = pd.read_csv('./data/train_df_day_night_split.csv')
        test_df = pd.read_csv('./data/test_df_day_night_split.csv')
        from split_fn_param import list_param
    else:
        # Previously this fell through and failed later on ``None.fillna``.
        raise ValueError("task must be 'together' or 'split', got %r" %
                         (task, ))

    train_df = train_df.fillna(-1)
    test_df = test_df.fillna(-1)

    print("Data loading Done!")

    target = 'label'
    # Every column except the first and the last one is used as a feature.
    predictors = train_df.columns.values.tolist()[1:-1]

    gc.collect()

    X_train = train_df[predictors].values
    y_train = train_df[target].values
    labels = train_df[target]

    # Full training set, used only by lgb.cv to pick the round count.
    xg_train = lgb.Dataset(X_train, label=y_train, feature_name=predictors)

    seeds = np.random.randint(5000, 10000, size=10).tolist()
    auc_lst1 = []
    param = list_param('lgb_gbdt')
    n_train = train_df.shape[0]
    n_test = test_df.shape[0]
    oof_preds_folds_vote = np.zeros((n_train, len(seeds)))
    sub_preds_folds_vote = np.zeros((n_test, len(seeds)))
    importance_frames = []
    list_thresholds_global = []

    for seed_id, seed in enumerate(seeds):
        if stratified:
            folds = StratifiedKFold(n_splits=nfold,
                                    shuffle=True,
                                    random_state=seed)
        else:
            folds = KFold(n_splits=nfold, shuffle=True, random_state=1001)
        oof_preds = np.zeros(n_train)
        oof_preds_local_vote = np.zeros(n_train)
        sub_preds_local_vote = np.zeros((n_test, nfold))

        # Materialise the split once so that CV and training share it.
        gfold_Id = list(folds.split(X_train, labels))

        param.update({
            'max_bin': 63,  # fixed #int
            'save_binary': True,  # fixed
            'seed': seed,
            'feature_fraction_seed': seed,
            'bagging_seed': seed,
            'drop_seed': seed,
            'data_random_seed': seed,
            'objective': 'binary',
            'boosting_type': 'gbdt',
            'verbose': 1,
            'metric': 'auc',
        })

        cv_result = lgb.cv(param,
                           xg_train,
                           num_boost_round=5000,
                           early_stopping_rounds=50,
                           folds=gfold_Id)

        # One row per boosting round kept by early stopping.
        n_estimators = pd.DataFrame(cv_result).shape[0]

        for n_fold, (train_idx, valid_idx) in enumerate(gfold_Id):
            xgg_train = lgb.Dataset(data=train_df[predictors].iloc[train_idx],
                                    label=train_df[target].iloc[train_idx],
                                    free_raw_data=False,
                                    silent=True)
            valid_x = train_df[predictors].iloc[valid_idx]
            valid_y = train_df[target].iloc[valid_idx]

            clf = lgb.train(
                param,
                xgg_train,
                num_boost_round=n_estimators,
                verbose_eval=1,
            )

            oof_preds[valid_idx] = clf.predict(valid_x)
            pred = clf.predict(test_df[predictors])

            # Threshold maximising tpr - fpr on this validation fold.
            fpr, tpr, thresholds = metrics.roc_curve(valid_y,
                                                     oof_preds[valid_idx])
            optimal_thresholds = thresholds[np.argmax(tpr - fpr)]
            list_thresholds_global.append(optimal_thresholds)

            # Boolean masks are cast to 0./1. by the float target arrays.
            sub_preds_local_vote[:, n_fold] = pred > optimal_thresholds
            oof_preds_local_vote[valid_idx] = (oof_preds[valid_idx] >
                                               optimal_thresholds)

            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = clf.feature_name()
            fold_importance_df["importance"] = clf.feature_importance(
                importance_type='gain')
            fold_importance_df = fold_importance_df.fillna(value=0)
            fold_importance_df = fold_importance_df.sort_values(
                'importance', ascending=False)
            fold_importance_df["fold"] = n_fold + 1
            fold_importance_df["seed"] = 'seed_' + str(seed)
            importance_frames.append(fold_importance_df)

            print('Fold %2d AUC : %.6f' %
                  (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))

            del clf, xgg_train
            gc.collect()

        # Majority vote over the folds for every test row.
        majority, _ = stats.mode(sub_preds_local_vote, axis=1)
        oof_preds_folds_vote[:, seed_id] = oof_preds_local_vote
        sub_preds_folds_vote[:, seed_id] = majority.reshape(-1)

        full_auc = roc_auc_score(y_train, oof_preds)
        auc_lst1.append(full_auc)
        print('Full AUC score %.6f' % full_auc)
        print("auc_lst1")
        print(auc_lst1)

    print(list_thresholds_global)

    vote_columns = ['lgb_gbt_seed_' + str(seed) for seed in seeds]
    oof_preds_folds_vote = pd.DataFrame(oof_preds_folds_vote,
                                        columns=vote_columns)
    sub_preds_folds_vote = pd.DataFrame(sub_preds_folds_vote,
                                        columns=vote_columns)

    oof_preds_folds_vote.to_csv("./output/" + task +
                                "_train_stack_vote/lgb_gbt.csv",
                                index=False)
    sub_preds_folds_vote.to_csv("./output/" + task +
                                "_test_stack_vote/lgb_gbt.csv",
                                index=False)

    feature_importance_df_folds = pd.concat(
        importance_frames, axis=0).sort_values('importance', ascending=False)
    feature_importance_df_folds.to_csv("./output/" + task +
                                       "_feature/lgb_gbt.csv",
                                       index=False)
def stacking_model_xgb_rank(task='together'):
    """Write vote-based stacking features from seed-bagged XGBoost rankers.

    The train/test CSV pair selected by ``task`` is loaded, missing values
    are replaced with -1 (which is also declared as the ``missing`` marker
    of every ``DMatrix``), and the ``bird_id`` and ``label`` columns are
    dropped from both frames to obtain the features.  For each of 10
    randomly drawn seeds:

    1. ``xgb.cv`` with the ``rank:pairwise`` objective chooses the number of
       boosting rounds on a 5-fold split (early stopping after 50 rounds);
    2. one booster per fold is trained with that round count;
    3. out-of-fold and test scores are binarised with the fold's ROC
       threshold that maximises ``tpr - fpr``;
    4. the per-row mode of the fold votes becomes the test vote of the seed.

    Files written (one column per seed for the vote files):
        ./output/<task>_train_stack_vote/xgb_rank.csv
        ./output/<task>_test_stack_vote/xgb_rank.csv
        ./output/<task>_feature/xgb_rank.csv  (gain importance per fold/seed)

    Seeds come from ``np.random.randint`` without a fixed state, so results
    differ between runs unless the caller seeds NumPy beforehand.

    Args:
        task: ``'together'`` or ``'split'``; selects the CSV pair under
            ``./data`` and the module ``list_param`` is imported from.

    Raises:
        ValueError: If ``task`` is not one of the supported values.
    """
    from scipy import stats

    nfold = 5
    stratified = True

    if task == 'together':
        train_df = pd.read_csv('./data/train_df_day_night_together.csv')
        test_df = pd.read_csv('./data/test_df_day_night_together.csv')
        from together_fn_param import list_param
    elif task == 'split':
        train_df = pd.read_csv('./data/train_df_day_night_split.csv')
        test_df = pd.read_csv('./data/test_df_day_night_split.csv')
        from split_fn_param import list_param
    else:
        # Previously this fell through and failed later on ``None.fillna``.
        raise ValueError("task must be 'together' or 'split', got %r" %
                         (task, ))

    train_df = train_df.fillna(-1)
    test_df = test_df.fillna(-1)

    print("Data loading Done!")

    X_train = train_df.drop(['bird_id', 'label'], axis=1)
    features = X_train.columns
    labels = train_df['label']
    y_train = np.int32(labels)

    X_test = test_df.drop(['bird_id', 'label'], axis=1)

    xg_train = xgb.DMatrix(X_train, label=y_train, missing=-1.0)
    xg_test = xgb.DMatrix(X_test, missing=-1.0)

    def xg_f1(yhat, dtrain):
        """Custom evaluation metric: negated best F1 over all thresholds."""
        y = dtrain.get_label()
        pre, rec, _ = metrics.precision_recall_curve(y, yhat)
        # Points with zero precision or recall give F1 == 0 via 2 / inf;
        # silence the resulting floating point warnings.
        with np.errstate(divide='ignore', invalid='ignore'):
            f1_all = 2 / ((1 / rec) + (1 / pre))
        return 'f1', -np.nanmax(f1_all)

    seeds = np.random.randint(5000, 10000, size=10).tolist()
    auc_lst1 = []
    param = list_param('xgb_rank')
    n_train = train_df.shape[0]
    n_test = test_df.shape[0]
    oof_preds_folds_vote = np.zeros((n_train, len(seeds)))
    sub_preds_folds_vote = np.zeros((n_test, len(seeds)))
    importance_frames = []
    list_thresholds_global = []

    for seed_id, seed in enumerate(seeds):
        if stratified:
            folds = StratifiedKFold(n_splits=nfold,
                                    shuffle=True,
                                    random_state=seed)
        else:
            folds = KFold(n_splits=nfold, shuffle=True, random_state=1001)
        oof_preds = np.zeros(n_train)
        oof_preds_local_vote = np.zeros(n_train)
        sub_preds_local_vote = np.zeros((n_test, nfold))

        # Materialise the split once so that CV and training share it.
        gfold_Id = list(folds.split(X_train, labels))

        param.update({
            'seed': seed,
            'objective': 'rank:pairwise',
            'silent': False,
        })

        res = xgb.cv(param,
                     xg_train,
                     num_boost_round=5000,
                     folds=gfold_Id,
                     feval=xg_f1,
                     metrics={'auc'},
                     stratified=True,
                     maximize=False,
                     verbose_eval=50,
                     callbacks=[
                         xgb.callback.print_evaluation(show_stdv=True),
                         xgb.callback.early_stop(50)
                     ])

        # One row per boosting round kept by early stopping.
        n_estimators = res.shape[0]

        for n_fold, (train_idx, valid_idx) in enumerate(gfold_Id):
            # Fold indices are positional, hence .iloc on the label series.
            ygg_valid = labels.iloc[valid_idx]
            xgg_train = xgb.DMatrix(X_train.iloc[train_idx],
                                    label=labels.iloc[train_idx],
                                    missing=-1.0)
            xgg_valid = xgb.DMatrix(X_train.iloc[valid_idx], missing=-1.0)

            clf = xgb.train(param,
                            xgg_train,
                            num_boost_round=n_estimators,
                            verbose_eval=1)

            oof_preds[valid_idx] = clf.predict(xgg_valid)
            pred = clf.predict(xg_test)

            # Threshold maximising tpr - fpr on this validation fold.
            fpr, tpr, thresholds = metrics.roc_curve(ygg_valid,
                                                     oof_preds[valid_idx])
            optimal_thresholds = thresholds[np.argmax(tpr - fpr)]
            list_thresholds_global.append(optimal_thresholds)

            # Boolean masks are cast to 0./1. by the float target arrays.
            sub_preds_local_vote[:, n_fold] = pred > optimal_thresholds
            oof_preds_local_vote[valid_idx] = (oof_preds[valid_idx] >
                                               optimal_thresholds)

            # get_score only reports features used in a split; the left
            # merge keeps every feature and the fillna gives the rest 0.
            fold_raw_importance = pd.DataFrame(
                list(clf.get_score(importance_type='gain').items()),
                columns=['feature', 'importance']).sort_values('importance',
                                                               ascending=False)
            fold_importance_df = pd.merge(pd.DataFrame({'feature': features}),
                                          fold_raw_importance,
                                          on='feature',
                                          how='left')
            fold_importance_df = fold_importance_df.fillna(value=0)
            fold_importance_df = fold_importance_df.sort_values(
                'importance', ascending=False)
            fold_importance_df["fold"] = n_fold + 1
            fold_importance_df["seed"] = 'seed_' + str(seed)
            importance_frames.append(fold_importance_df)

            print('Fold %2d AUC : %.6f' %
                  (n_fold + 1, roc_auc_score(ygg_valid, oof_preds[valid_idx])))

            del clf, xgg_train, xgg_valid, ygg_valid
            gc.collect()

        # Majority vote over the folds for every test row.
        majority, _ = stats.mode(sub_preds_local_vote, axis=1)
        oof_preds_folds_vote[:, seed_id] = oof_preds_local_vote
        sub_preds_folds_vote[:, seed_id] = majority.reshape(-1)

        full_auc = roc_auc_score(y_train, oof_preds)
        auc_lst1.append(full_auc)
        print('Full AUC score %.6f' % full_auc)
        print("auc_lst1")
        print(auc_lst1)

    print(list_thresholds_global)

    vote_columns = ['xgb_rank_seed_' + str(seed) for seed in seeds]
    oof_preds_folds_vote = pd.DataFrame(oof_preds_folds_vote,
                                        columns=vote_columns)
    sub_preds_folds_vote = pd.DataFrame(sub_preds_folds_vote,
                                        columns=vote_columns)

    oof_preds_folds_vote.to_csv("./output/" + task +
                                "_train_stack_vote/xgb_rank.csv",
                                index=False)
    sub_preds_folds_vote.to_csv("./output/" + task +
                                "_test_stack_vote/xgb_rank.csv",
                                index=False)

    feature_importance_df_folds = pd.concat(
        importance_frames, axis=0).sort_values('importance', ascending=False)
    feature_importance_df_folds.to_csv("./output/" + task +
                                       "_feature/xgb_rank.csv",
                                       index=False)
# Example #3
# 0
def stacking_model_sk_svc(task='together'):
    """Write vote-based stacking features from seed-bagged RBF SVC models.

    The train/test CSV pair selected by ``task`` is loaded and missing
    values are replaced with -1.  Features are rescaled with a
    ``MinMaxScaler`` fitted on the training rows and applied to the test
    rows.  For each of 10 randomly drawn seeds a single ``SVC`` (with
    probability estimates enabled) is refitted on every fold of a 5-fold
    split; out-of-fold and test probabilities are binarised with the fold's
    ROC threshold that maximises ``tpr - fpr``, and the per-row mode of the
    fold votes becomes the test vote of the seed.

    Files written (one column per seed):
        ./output/<task>_train_stack_vote/sk_svc.csv
        ./output/<task>_test_stack_vote/sk_svc.csv

    Seeds come from ``np.random.randint`` without a fixed state, so results
    differ between runs unless the caller seeds NumPy beforehand.

    Args:
        task: ``'together'`` or ``'split'``; selects the CSV pair under
            ``./data`` and the module ``list_param`` is imported from.

    Raises:
        ValueError: If ``task`` is not one of the supported values.
    """
    from scipy import stats

    nfold = 5
    stratified = True

    if task == 'together':
        train_df = pd.read_csv('./data/train_df_day_night_together.csv')
        test_df = pd.read_csv('./data/test_df_day_night_together.csv')
        from together_fn_param import list_param
    elif task == 'split':
        train_df = pd.read_csv('./data/train_df_day_night_split.csv')
        test_df = pd.read_csv('./data/test_df_day_night_split.csv')
        from split_fn_param import list_param
    else:
        # Previously this fell through and failed later on ``None.fillna``.
        raise ValueError("task must be 'together' or 'split', got %r" %
                         (task, ))

    train_df = train_df.fillna(-1)
    test_df = test_df.fillna(-1)

    print("Data loading Done!")

    target = 'label'
    # Every column except the first and the last one is used as a feature.
    predictors = train_df.columns.values.tolist()[1:-1]
    labels = train_df[target]

    # Scale with statistics of the training rows only.
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(train_df[predictors].values)
    X_test = scaler.transform(test_df[predictors].values)

    seeds = np.random.randint(5000, 10000, size=10).tolist()
    auc_lst1 = []
    param = list_param('sk_svc')
    n_train = train_df.shape[0]
    n_test = test_df.shape[0]
    oof_preds_folds_vote = np.zeros((n_train, len(seeds)))
    sub_preds_folds_vote = np.zeros((n_test, len(seeds)))
    list_thresholds_global = []

    for seed_id, seed in enumerate(seeds):
        if stratified:
            folds = StratifiedKFold(n_splits=nfold,
                                    shuffle=True,
                                    random_state=seed)
        else:
            folds = KFold(n_splits=nfold, shuffle=True, random_state=1001)
        oof_preds = np.zeros(n_train)
        oof_preds_local_vote = np.zeros(n_train)
        sub_preds_local_vote = np.zeros((n_test, nfold))

        param.update({'random_state': seed})

        clf = SVC(
            C=param['C'],
            kernel='rbf',
            gamma=param['gamma'],
            shrinking=True,
            probability=True,
            tol=param['tol'],  # 0.001,#may be 0.0001 for stoping criteria
            max_iter=int(param['max_iter']),
            verbose=False,
            decision_function_shape='ovr',
            random_state=seed)

        for n_fold, (train_idx,
                     valid_idx) in enumerate(folds.split(X_train, labels)):
            xtrain, xtest = X_train[train_idx, :], X_train[valid_idx, :]
            # Fold indices are positional, hence .iloc on the label series.
            ytrain, ytest = labels.iloc[train_idx], labels.iloc[valid_idx]

            # fit() discards the previous fold's model and starts afresh.
            clf.fit(xtrain, ytrain)

            oof_preds[valid_idx] = clf.predict_proba(xtest)[:, 1]
            pred = clf.predict_proba(X_test)[:, 1]

            # Threshold maximising tpr - fpr on this validation fold.
            fpr, tpr, thresholds = metrics.roc_curve(ytest,
                                                     oof_preds[valid_idx])
            optimal_thresholds = thresholds[np.argmax(tpr - fpr)]
            list_thresholds_global.append(optimal_thresholds)

            # Boolean masks are cast to 0./1. by the float target arrays.
            sub_preds_local_vote[:, n_fold] = pred > optimal_thresholds
            oof_preds_local_vote[valid_idx] = (oof_preds[valid_idx] >
                                               optimal_thresholds)

            print('Fold %2d AUC : %.6f' %
                  (n_fold + 1, roc_auc_score(ytest, oof_preds[valid_idx])))

            del xtrain, xtest, ytrain, ytest
            gc.collect()

        # Majority vote over the folds for every test row.
        majority, _ = stats.mode(sub_preds_local_vote, axis=1)
        oof_preds_folds_vote[:, seed_id] = oof_preds_local_vote
        sub_preds_folds_vote[:, seed_id] = majority.reshape(-1)

        full_auc = roc_auc_score(train_df[target], oof_preds)
        auc_lst1.append(full_auc)
        print('Full AUC score %.6f' % full_auc)
        print("auc_lst1")
        print(auc_lst1)

    print(list_thresholds_global)

    vote_columns = ['sk_svc_seed_' + str(seed) for seed in seeds]
    oof_preds_folds_vote = pd.DataFrame(oof_preds_folds_vote,
                                        columns=vote_columns)
    sub_preds_folds_vote = pd.DataFrame(sub_preds_folds_vote,
                                        columns=vote_columns)

    oof_preds_folds_vote.to_csv("./output/" + task +
                                "_train_stack_vote/sk_svc.csv",
                                index=False)
    sub_preds_folds_vote.to_csv("./output/" + task +
                                "_test_stack_vote/sk_svc.csv",
                                index=False)
def stacking_model_cat(task='together'):
    """Write vote-based stacking features from seed-bagged CatBoost models.

    The train/test CSV pair selected by ``task`` is loaded and missing
    values are replaced with -1.  For each of 10 randomly drawn seeds:

    1. ``ctb.cv`` is run on the whole training set and the iteration with
       the highest mean test F1 fixes the number of iterations;
    2. one model per fold of a 5-fold split is trained with that count;
    3. out-of-fold and test probabilities are binarised with the fold's ROC
       threshold that maximises ``tpr - fpr``;
    4. the per-row mode of the fold votes becomes the test vote of the seed.

    Files written (one column per seed for the vote files):
        ./output/<task>_train_stack_vote/cat.csv
        ./output/<task>_test_stack_vote/cat.csv
        ./output/<task>_feature/cat.csv  (feature importance per fold/seed)

    Seeds come from ``np.random.randint`` without a fixed state, so results
    differ between runs unless the caller seeds NumPy beforehand.

    Args:
        task: ``'together'`` or ``'split'``; selects the CSV pair under
            ``./data`` and the module ``list_param`` is imported from.

    Raises:
        ValueError: If ``task`` is not one of the supported values.
    """
    from scipy import stats

    nfold = 5
    stratified = True

    if task == 'together':
        train_df = pd.read_csv('./data/train_df_day_night_together.csv')
        test_df = pd.read_csv('./data/test_df_day_night_together.csv')
        from together_fn_param import list_param
    elif task == 'split':
        train_df = pd.read_csv('./data/train_df_day_night_split.csv')
        test_df = pd.read_csv('./data/test_df_day_night_split.csv')
        from split_fn_param import list_param
    else:
        # Previously this fell through and failed later on ``None.fillna``.
        raise ValueError("task must be 'together' or 'split', got %r" %
                         (task, ))

    train_df = train_df.fillna(-1)
    test_df = test_df.fillna(-1)

    print("Data loading Done!")

    target = 'label'
    # Every column except the first and the last one is used as a feature.
    predictors = train_df.columns.values.tolist()[1:-1]

    # Only used to drive the fold splitter.
    X_train = train_df.drop(['bird_id', 'label'], axis=1)
    labels = train_df[target]

    seeds = np.random.randint(5000, 10000, size=10).tolist()
    auc_lst1 = []
    param = list_param('cat')
    n_train = train_df.shape[0]
    n_test = test_df.shape[0]
    oof_preds_folds_vote = np.zeros((n_train, len(seeds)))
    sub_preds_folds_vote = np.zeros((n_test, len(seeds)))
    importance_frames = []
    list_thresholds_global = []

    for seed_id, seed in enumerate(seeds):
        if stratified:
            folds = StratifiedKFold(n_splits=nfold,
                                    shuffle=True,
                                    random_state=seed)
        else:
            folds = KFold(n_splits=nfold, shuffle=True, random_state=1001)
        oof_preds = np.zeros(n_train)
        oof_preds_local_vote = np.zeros(n_train)
        sub_preds_local_vote = np.zeros((n_test, nfold))

        param.update({
            'iterations': 5000,  # int
            'border_count': 128,  # (128) 1 - 255
            'bootstrap_type': 'Bernoulli',
            'loss_function': 'Logloss',
            'eval_metric': 'F1',  # 'AUC',
            'od_type': 'Iter',
            'allow_writing_files': False,
            'early_stopping_rounds': 50,
            'custom_metric': ['AUC'],
            'random_seed': seed,
            'use_best_model': True
        })

        pool = ctb.Pool(train_df[predictors], train_df[target])

        # NOTE(review): CV uses 10 folds while training below uses `nfold`
        # (5) -- confirm this difference is intentional.
        cv_result = ctb.cv(pool=pool,
                           params=param,
                           fold_count=10,
                           partition_random_seed=seed,
                           stratified=True)

        # Iteration count at which the mean test F1 peaks.
        n_estimators = pd.DataFrame(cv_result)['test-F1-mean'].argmax() + 1

        param.update({'iterations': n_estimators})
        # The fold models are trained without an eval set, so the
        # use_best_model flag set for CV is dropped before training.
        param.pop('use_best_model', None)

        for n_fold, (train_idx,
                     valid_idx) in enumerate(folds.split(X_train, labels)):
            valid_y = train_df[target].iloc[valid_idx]

            pool_0 = ctb.Pool(train_df[predictors].iloc[train_idx],
                              train_df[target].iloc[train_idx])

            clf = ctb.train(pool=pool_0, params=param)

            oof_preds[valid_idx] = clf.predict(
                train_df[predictors].iloc[valid_idx],
                prediction_type='Probability')[:, 1]
            pred = clf.predict(test_df[predictors],
                               prediction_type='Probability')[:, 1]

            # Threshold maximising tpr - fpr on this validation fold.
            fpr, tpr, thresholds = metrics.roc_curve(valid_y,
                                                     oof_preds[valid_idx])
            optimal_thresholds = thresholds[np.argmax(tpr - fpr)]
            list_thresholds_global.append(optimal_thresholds)

            # Boolean masks are cast to 0./1. by the float target arrays.
            sub_preds_local_vote[:, n_fold] = pred > optimal_thresholds
            oof_preds_local_vote[valid_idx] = (oof_preds[valid_idx] >
                                               optimal_thresholds)

            fold_importance_df = pd.DataFrame(
                list(zip(predictors, clf.get_feature_importance(pool_0))),
                columns=['feature', 'importance'])
            fold_importance_df = fold_importance_df.sort_values(
                'importance', ascending=False)
            fold_importance_df["fold"] = n_fold + 1
            fold_importance_df["seed"] = 'seed_' + str(seed)
            importance_frames.append(fold_importance_df)

            print('Fold %2d AUC : %.6f' %
                  (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))

            del clf, pool_0
            gc.collect()

        # Majority vote over the folds for every test row.
        majority, _ = stats.mode(sub_preds_local_vote, axis=1)
        oof_preds_folds_vote[:, seed_id] = oof_preds_local_vote
        sub_preds_folds_vote[:, seed_id] = majority.reshape(-1)

        full_auc = roc_auc_score(train_df[target], oof_preds)
        auc_lst1.append(full_auc)
        print('Full AUC score %.6f' % full_auc)
        print("auc_lst1")
        print(auc_lst1)

    print(list_thresholds_global)

    vote_columns = ['cat_seed_' + str(seed) for seed in seeds]
    oof_preds_folds_vote = pd.DataFrame(oof_preds_folds_vote,
                                        columns=vote_columns)
    sub_preds_folds_vote = pd.DataFrame(sub_preds_folds_vote,
                                        columns=vote_columns)

    oof_preds_folds_vote.to_csv("./output/" + task +
                                "_train_stack_vote/cat.csv",
                                index=False)
    sub_preds_folds_vote.to_csv("./output/" + task +
                                "_test_stack_vote/cat.csv",
                                index=False)

    feature_importance_df_folds = pd.concat(
        importance_frames, axis=0).sort_values('importance', ascending=False)
    feature_importance_df_folds.to_csv("./output/" + task + "_feature/cat.csv",
                                       index=False)