Example #1
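# NOTE: a minimal import sketch, not part of the original snippet. The helpers
# below (sel_log, load_configs, load_features, select_features, resampling,
# calc_MCC, calc_best_MCC, save_importance, log_evaluation) and the mlgb
# wrapper around lightgbm.cv are assumed to be project-local modules.
import datetime
import pickle
from itertools import tee

import lightgbm
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold, StratifiedKFold
from tqdm import tqdm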
def train(args, logger):
    '''
    policy
    ------------
    * use original functions only if there are no pre-coded functions
        in useful libraries such as sklearn.

    todos
    ------------
    * load features
    * train the model
    * save the following
        * logs
        * oofs
        * importances
        * trained models
        * submissions (if test mode)

    '''
    # -- Prepare for training
    exp_time = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    train_base_dir = './inputs/train/'
    configs = load_configs('./config.yml', logger)

    # -- Load train data
    sel_log('loading training data ...', None)
    target = pd.read_pickle(train_base_dir + 'target.pkl.gz',
                            compression='gzip')
    id_measurement = pd.read_pickle(train_base_dir + 'id_measurement.pkl.gz',
                                    compression='gzip')
    # Cache can be used only in train
    if args.use_cached_features:
        features_df = pd.read_pickle('./inputs/train/cached_features.pkl.gz',
                                     compression='gzip')
    else:
        features_df = load_features(configs['features'], train_base_dir,
                                    logger)
        # gen cache file if specified for the next time
        if args.gen_cached_features:
            features_df.to_pickle('./inputs/train/cached_features.pkl.gz',
                                  compression='gzip')
    if configs['train']['feature_selection']:
        features_df = select_features(
            features_df, configs['train']['feature_select_path'],
            configs['train']['feature_select_metric'],
            configs['train']['feature_topk'])
    features = features_df.columns

    # -- Data resampling
    # Stock original data for validation
    if configs['preprocess']['resampling']:
        target, id_measurement, features_df = resampling(
            target, id_measurement, features_df,
            configs['preprocess']['resampling_type'],
            configs['preprocess']['resampling_seed'], logger)
    sel_log(f'the shape of features_df is {features_df.shape}', logger)

    # -- Split using group k-fold
    # NOTE: this is not stratified; stratified group splitting is left as
    # future work
    if configs['train']['fold_type'] == 'gkf':
        gkf = GroupKFold(configs['train']['fold_num'])
        folds = gkf.split(features_df, target, groups=id_measurement)
    elif configs['train']['fold_type'] == 'skf':
        # StratifiedKFold needs shuffle=True for random_state to take effect
        skf = StratifiedKFold(configs['train']['fold_num'], shuffle=True,
                              random_state=71)
        folds = skf.split(features_df, target)
    else:
        sel_log(f"ERROR: wrong fold_type, {configs['train']['fold_type']}",
                None)
        raise ValueError('invalid fold_type')
    # gss = GroupShuffleSplit(configs['train']['fold_num'], random_state=71)
    # folds = gss.split(features_df, target, groups=id_measurement)
    folds, pred_folds = tee(folds)
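    # NOTE: split() yields a one-shot generator, so itertools.tee above keeps
    # unconsumed copies of the folds for the prediction loops further below.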
    if configs['train']['label_train']:
        folds, folds_2 = tee(folds)
        folds, pred_folds_2 = tee(folds)

    # -- Make training dataset
    train_set = mlgb.Dataset(features_df.values, target.values)

    # -- CV
    # Set params
    PARAMS = configs['lgbm_params']
    PARAMS['nthread'] = args.nthread

    sel_log('start training ...', None)
    hist, cv_model = mlgb.cv(
        params=PARAMS,
        num_boost_round=10000,
        folds=folds,
        train_set=train_set,
        verbose_eval=50,
        early_stopping_rounds=200,
        metrics='auc',
        # feval=lgb_MCC,
        callbacks=[log_evaluation(logger, period=50)],
    )
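    # NOTE: mlgb is assumed to be a project-local wrapper around lightgbm.cv
    # that, unlike stock lightgbm.cv, also returns the trained per-fold
    # boosters (accessed below as cv_model.boosters).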

    # -- Prediction
    if configs['train']['single_model']:
        best_iter = cv_model.best_iteration
        single_train_set = lightgbm.Dataset(features_df.values, target.values)
        single_booster = lightgbm.train(
            params=PARAMS,
            num_boost_round=int(best_iter * 1.3),
            train_set=single_train_set,
            valid_sets=[single_train_set],
            verbose_eval=50,
            early_stopping_rounds=200,
            callbacks=[log_evaluation(logger, period=50)],
        )

        # Calc cv mcc
        _oofs = []
        _y_trues = []
        for i, idxes in tqdm(list(enumerate(pred_folds))):
            trn_idx, val_idx = idxes
            booster = cv_model.boosters[i]

            # Get and store oof and y_true
            y_pred = booster.predict(features_df.values[val_idx])
            y_true = target.values[val_idx]
            _oofs.append(y_pred)
            _y_trues.append(y_true)
        cv_MCC, _ = calc_best_MCC(_y_trues, _oofs, bins=3000)
        sel_log(f'cv_MCC: {cv_MCC}', logger)

        # Save important info
        oofs = [single_booster.predict(features_df.values)]
        y_trues = [target]
        val_idxes = [features_df.index]
        scores = []
        y_true, y_pred = target, oofs[0]
        fold_importance_df = pd.DataFrame()
        fold_importance_df['split'] = \
            single_booster.feature_importance('split')
        fold_importance_df['gain'] = \
            single_booster.feature_importance('gain')
        fold_importance_dict = {0: fold_importance_df}
    else:
        sel_log('predicting using cv models ...', logger)
        oofs = []
        y_trues = []
        val_idxes = []
        scores = []
        fold_importance_dict = {}
        for i, idxes in tqdm(list(enumerate(pred_folds))):
            trn_idx, val_idx = idxes
            booster = cv_model.boosters[i]

            # Get and store oof and y_true
            y_pred = booster.predict(features_df.values[val_idx])
            y_true = target.values[val_idx]
            oofs.append(y_pred)
            y_trues.append(y_true)
            val_idxes.append(val_idx)

            # Calc MCC using thresh of 0.5
            MCC = calc_MCC(y_true, y_pred, 0.5)
            scores.append(MCC)

            # Save importance info
            fold_importance_df = pd.DataFrame()
            fold_importance_df['split'] = booster.feature_importance('split')
            fold_importance_df['gain'] = booster.feature_importance('gain')
            fold_importance_dict[i] = fold_importance_df

#        y_true = np.concatenate(y_trues, axis=0)
#        y_pred = np.concatenate(oofs, axis=0)
        sel_log(f'MCC_mean: {np.mean(scores)}, MCC_std: {np.std(scores)}',
                logger)

    # Calc best MCC
    sel_log('calculating the best MCC ...', None)
    best_MCC, best_threshs = calc_best_MCC(y_trues, oofs, bins=3000)
    sel_log(f'best_threshs: {best_threshs}', logger)
    sel_log(f'best_MCC: {best_MCC}', logger)

    # -- Post processings
    filename_base = f'{args.exp_ids[0]}_{exp_time}_{best_MCC:.4}'

    # Save oofs
    with open('./oofs/' + filename_base + '_oofs.pkl', 'wb') as fout:
        pickle.dump([val_idxes, oofs, best_threshs], fout)

    # Save importances
    # save_importance(configs['features'], fold_importance_dict,
    save_importance(features, fold_importance_dict,
                    './importances/' + filename_base + '_importances')

    # Save trained models
    with open('./trained_models/' + filename_base + '_models.pkl',
              'wb') as fout:
        pickle.dump(
            single_booster if configs['train']['single_model'] else cv_model,
            fout)


#    # -- Retraining using the preds
#    if configs['train']['label_train']:
#        # -- Make training dataset
#        y_preds_df = y_preds_features(oofs, val_idxes)
#        features_df_2 = features_df
#        features_df_2 = pd.concat([features_df_2, y_preds_df], axis=1)
#        features_2 = features_df_2.columns
#        train_set_2 = mlgb.Dataset(features_df_2.values, target.values)
#
#        # -- CV
#        sel_log('RETRAINED -- start training ...', None)
#        hist_2, cv_model_2 = mlgb.cv(
#            params=PARAMS,
#            num_boost_round=10000,
#            folds=folds_2,
#            train_set=train_set_2,
#            verbose_eval=50,
#            early_stopping_rounds=200,
#            metrics='auc',
#            # feval=lgb_MCC,
#            callbacks=[log_evaluation(logger, period=50)],
#        )
#
#        # -- Prediction
#        sel_log('RETRAINED -- predicting ...', logger)
#        oofs_2 = []
#        y_trues_2 = []
#        val_idxes_2 = []
#        scores_2 = []
#        fold_importance_dict_2 = {}
#        for i, idxes in tqdm(list(enumerate(pred_folds_2))):
#            trn_idx, val_idx = idxes
#            booster = cv_model_2.boosters[i]
#
#            # Get and store oof and y_true
#            y_pred = booster.predict(features_df_2.values[val_idx])
#            y_true = target.values[val_idx]
#            oofs_2.append(y_pred)
#            y_trues_2.append(y_true)
#            val_idxes_2.append(val_idx)
#
#            # Calc MCC using thresh of 0.5
#            MCC = calc_MCC(y_true, y_pred, 0.5)
#            scores_2.append(MCC)
#
#            # Save importance info
#            fold_importance_df = pd.DataFrame()
#            fold_importance_df['split'] = booster.feature_importance('split')
#            fold_importance_df['gain'] = booster.feature_importance('gain')
#            fold_importance_dict_2[i] = fold_importance_df
#
#        sel_log(
#            f'RETRAINED -- MCC_mean: {np.mean(scores_2)}, MCC_std: {np.std(scores_2)}',
#            logger)
#
#        # Calc best MCC
#        sel_log('RETRAINED -- calculating the best MCC ...', None)
#        y_true_2 = np.concatenate(y_trues_2, axis=0)
#        y_pred_2 = np.concatenate(oofs_2, axis=0)
#        best_MCC_2, best_thresh_2 = calc_best_MCC(y_true_2, y_pred_2, bins=3000)
#        sel_log(
#            f'RETRAINED -- best_MCC: {best_MCC_2}, best_thresh: {best_thresh_2}',
#            logger)
#
#        # -- Post processings
#        filename_base = f'{args.exp_ids[0]}_{exp_time}_{best_MCC_2:.4}_{best_thresh_2:.3}'
#
#        # Save oofs
#        with open('./oofs/' + filename_base + '_oofs_retrained.pkl', 'wb') as fout:
#            pickle.dump([val_idxes_2, oofs_2], fout)
#
#        # Save importances
#        save_importance(features_2, fold_importance_dict_2,
#                        './importances/' + filename_base + '_importances_retrained')
#
#        # Save trained models
#        with open(
#                './trained_models/' + filename_base + '_models_retrained.pkl', 'wb') as fout:
#            pickle.dump(cv_model_2, fout)

    # --- Make submission file
    if args.test:
        # -- Prepare for test
        test_base_dir = './inputs/test/'

        sel_log('loading test data ...', None)
        test_features_df = load_features(configs['features'], test_base_dir,
                                         logger)
        if configs['train']['feature_selection']:
            test_features_df = select_features(
                test_features_df, configs['train']['feature_select_path'],
                configs['train']['feature_select_metric'],
                configs['train']['feature_topk'])

        # -- Prediction
        sel_log('predicting for test ...', None)
        preds = []
        models = ([single_booster] if configs['train']['single_model']
                  else cv_model.boosters)
        for booster, best_thresh in tqdm(zip(models, best_threshs)):
            pred = booster.predict(test_features_df.values)
            # preds.append(pred * 0.5 / best_thresh)
            preds.append(pred > best_thresh)
        sub_values = np.mean(preds, axis=0)
        target_values = (sub_values > 0.5).astype(np.int32)

        #        if configs['train']['label_train']:
        #            # -- Use retrained info
        #            test_y_preds_df = y_preds_features(
        #                [sub_values], [np.arange(len(sub_values))])
        #            test_features_df = pd.concat(
        #                [test_features_df, test_y_preds_df], axis=1)
        #            sel_log('RETRAINED -- predicting ...', None)
        #            preds = []
        #            for booster in tqdm(cv_model_2.boosters):
        #                preds.append(booster.predict(test_features_df.values))
        #            sub_values = np.mean(preds, axis=0)
        #            target_values = (sub_values > best_thresh_2).astype(np.int32)

        # -- Make submission file
        sel_log(f'loading sample submission file ...', None)
        sub_df = pd.read_csv('./inputs/origin/sample_submission.csv')
        sub_df.target = target_values

        # print stats
        sel_log('positive percentage: '
                f'{sub_df.target.sum() / sub_df.target.count() * 100:.3}%',
                logger=logger)

        submission_filename = f'./submissions/{filename_base}_sub.csv.gz'
        sel_log(f'saving submission file to {submission_filename}', logger)
        sub_df.to_csv(submission_filename, compression='gzip', index=False)
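# Below is a minimal sketch of what calc_best_MCC is assumed to do, inferred
# from its call sites above: per fold, sweep `bins` candidate thresholds and
# keep the one maximizing the Matthews correlation coefficient, returning the
# mean best MCC and the per-fold thresholds. The real project helper may
# differ (e.g. use a histogram-based sweep for speed).
import numpy as np
from sklearn.metrics import matthews_corrcoef


def calc_best_MCC_sketch(y_trues, y_preds, bins=3000):
    best_MCCs, best_threshs = [], []
    for y_true, y_pred in zip(y_trues, y_preds):
        threshs = np.linspace(0., 1., bins)
        mccs = [matthews_corrcoef(y_true, (y_pred > t).astype(int))
                for t in threshs]
        best_idx = int(np.argmax(mccs))
        best_MCCs.append(mccs[best_idx])
        best_threshs.append(threshs[best_idx])
    return np.mean(best_MCCs), best_threshs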
Example #2
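# NOTE: a minimal import sketch, not part of the original snippet. NES_DIR and
# FEATURE_DIR are assumed project-level path constants; sel_log,
# get_all_features, load_features, select_features, value_resampling,
# save_importance and log_evaluation are assumed project-local helpers.
import datetime
import os
import pickle
from itertools import tee

import lightgbm
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm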
def t003_lgb_train(args, script_name, configs, logger):
    '''
    policy
    ------------
    * use original functions only if there are no pre-coded functions
        in useful libraries such as sklearn.

    todos
    ------------
    * load features
    * train the model
    * save the following
        * logs
        * oofs
        * importances
        * trained models
        * submissions (if test mode)

    '''
    # -- Prepare for training
    exp_time = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')

    # -- Load train data
    sel_log('loading training data ...', None)
    trn_ids = pd.read_pickle(NES_DIR + 'trn_ID_code_w_fakes.pkl.gz',
                             compression='gzip')
    tst_ids = pd.read_pickle(NES_DIR + 'tst_ID_code.pkl.gz',
                             compression='gzip')
    target = pd.read_pickle(NES_DIR + 'target_w_fakes.pkl.gz',
                            compression='gzip')
    if args.debug:
        sample_idxes = trn_ids.reset_index(drop=True).sample(random_state=71,
                                                             frac=0.05).index
        target = target.iloc[sample_idxes].reset_index(drop=True)
        trn_ids = trn_ids.iloc[sample_idxes].reset_index(drop=True)

    # load features
    if configs['train']['all_features']:
        _features = get_all_features(FEATURE_DIR)
    else:
        _features = configs['features']
    trn_tst_df = load_features(_features, FEATURE_DIR, logger=logger)\
        .set_index('ID_code')

    # feature selection if needed
    if configs['train']['feature_selection']:
        trn_tst_df = select_features(trn_tst_df,
                                     configs['train']['feature_select_path'],
                                     configs['train']['metric'],
                                     configs['train']['feature_topk'])
    features = trn_tst_df.columns

    # split train and test
    sel_log(f'now splitting the df to train and test ones ...', None)
    features_df = trn_tst_df.loc[trn_ids].reset_index(drop=True)
    test_features_df = trn_tst_df.loc[tst_ids].reset_index(drop=True)

    # -- Split using stratified k-fold w/ shuffling
    if configs['train']['fold_type'] == 'skf':
        skf = StratifiedKFold(configs['train']['fold_num'], shuffle=True,
                              random_state=71)
        folds = skf.split(features_df, target)
        configs['train']['single_model'] = False
    else:
        print(f"ERROR: wrong fold_type, {configs['train']['fold_type']}")
        raise ValueError('invalid fold_type')
    folds, pred_folds = tee(folds)

    # -- Make training dataset
    # print shape
    sel_log(f'used features are {features_df.columns.tolist()}', logger)
    sel_log(f'the shape of features_df is {features_df.shape}', logger)

    # -- CV
    # Set params
    PARAMS = configs['lgbm_params']
    PARAMS['nthread'] = os.cpu_count()

    sel_log('start training ...', None)
    oofs = []
    y_trues = []
    val_idxes = []
    scores = []
    fold_importance_dict = {}
    cv_model = []
    for i, idxes in tqdm(list(enumerate(folds))):
        trn_idx, val_idx = idxes
        # -- Data resampling
        # Stock original data for validation
        fold_features_df, fold_target = value_resampling(
            features_df.iloc[trn_idx],
            target.iloc[trn_idx],
            configs['train']['sampling_type'],
            configs['train']['sampling_random_state'],
            configs['train']['os_lim'],
            configs['train']['pos_t'],
            configs['train']['neg_t'],
            logger=logger)

        # make lgbm dataset
        train_set = lightgbm.Dataset(fold_features_df, fold_target)
        valid_set = lightgbm.Dataset(
            features_df.iloc[val_idx],
            target[val_idx],
        )
        # train
        booster = lightgbm.train(
            params=PARAMS.copy(),
            train_set=train_set,
            num_boost_round=100000,
            valid_sets=[valid_set, train_set],
            verbose_eval=1000,
            early_stopping_rounds=2000,
            callbacks=[log_evaluation(logger, period=1000)],
        )

        # predict using trained model
        y_pred = booster.predict(features_df.values[val_idx],
                                 num_iteration=None)
        y_true = target.values[val_idx]
        oofs.append(y_pred)
        y_trues.append(y_true)
        val_idxes.append(val_idx)

        # Calc AUC
        auc = roc_auc_score(y_true, y_pred)
        sel_log(f'fold AUC: {auc}', logger=logger)
        scores.append(auc)

        # Save importance info
        fold_importance_df = pd.DataFrame()
        fold_importance_df['split'] = booster.feature_importance('split')
        fold_importance_df['gain'] = booster.feature_importance('gain')
        fold_importance_dict[i] = fold_importance_df

        # save model
        cv_model.append(booster)

    auc_mean, auc_std = np.mean(scores), np.std(scores)
    sel_log(f'AUC_mean: {auc_mean:.5f}, AUC_std: {auc_std:.5f}', logger)

    # -- Post processings
    filename_base = f'{script_name}_{exp_time}_{auc_mean:.5}'

    # Save oofs
    with open('./mnt/oofs/' + filename_base + '_oofs.pkl', 'wb') as fout:
        pickle.dump([val_idxes, oofs], fout)

    # Save importances
    # save_importance(configs['features'], fold_importance_dict,
    save_importance(features,
                    fold_importance_dict,
                    './mnt/importances/' + filename_base + '_importances',
                    topk=100,
                    main_metric='gain')

    # Save trained models
    with open('./mnt/trained_models/' + filename_base + '_models.pkl',
              'wb') as fout:
        pickle.dump(cv_model, fout)

    # --- Make submission file
    if args.test:
        if configs['train']['single_model']:
            # train single model
            best_iter = np.mean(
                [booster.best_iteration for booster in cv_model])
            single_train_set = lightgbm.Dataset(features_df, target.values)
            single_booster = lightgbm.train(
                params=PARAMS,
                num_boost_round=int(best_iter * 1.3),
                train_set=single_train_set,
                verbose_eval=1000,
                callbacks=[log_evaluation(logger, period=1000)],
            )
            # re-save model for prediction
            # cv_model.append(single_booster)

        # -- Prediction
        sel_log('predicting for test ...', None)
        preds = []
        # for booster in tqdm(cv_model.boosters):
        for booster in tqdm(cv_model):
            pred = booster.predict(test_features_df.values, num_iteration=None)
            pred = pd.Series(pred)
            # rank-average: normalize each fold's ranks into [0, 1]
            preds.append(pred.rank() / pred.shape[0])
        if len(cv_model) > 1:
            target_values = np.mean(preds, axis=0)
        else:
            target_values = preds[0]
        # blend single model
        if configs['train']['single_model']:
            pred = single_booster.predict(test_features_df.values,
                                          num_iteration=None)
            pred = pd.Series(pred)
            target_values = (target_values + (pred.rank() / pred.shape[0])) / 2

        # -- Make submission file
        sel_log(f'loading sample submission file ...', None)
        sub_df = pd.read_csv('./mnt/inputs/origin/sample_submission.csv.zip',
                             compression='zip')
        sub_df.target = target_values

        # print stats
        submission_filename = f'./mnt/submissions/{filename_base}_sub.csv.gz'
        sel_log(f'saving submission file to {submission_filename}', logger)
        sub_df.to_csv(submission_filename, compression='gzip', index=False)
        if args.submit:
            os.system(f'kaggle competitions submit '
                      f'santander-customer-transaction-prediction '
                      f'-f {submission_filename} -m "{args.message}"')
    return auc_mean, auc_std
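# Below is a minimal sketch of the log_evaluation callback used by all three
# snippets, assuming it follows lightgbm's callback protocol and mirrors the
# built-in evaluation printing into the given logger. The real project helper
# may differ.
def log_evaluation_sketch(logger, period=100):
    def _callback(env):
        # env is lightgbm's CallbackEnv; each evaluation_result_list entry
        # starts with (data_name, eval_name, result, ...)
        if period > 0 and env.evaluation_result_list \
                and (env.iteration + 1) % period == 0:
            result = '\t'.join(f'{e[0]} {e[1]}: {e[2]:.6f}'
                               for e in env.evaluation_result_list)
            logger.info(f'[{env.iteration + 1}]\t{result}')
    _callback.order = 10  # run after lightgbm's internal callbacks
    return _callback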
Example #3
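# NOTE: a minimal import sketch, not part of the original snippet. UniformKFold,
# sel_log, load_configs, get_all_features, load_features, select_features,
# label_encoding, resampling, save_importance and log_evaluation are assumed
# project-local helpers.
import datetime
import pickle
from itertools import tee

import lightgbm
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm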
def train(args, logger):
    '''
    policy
    ------------
    * use original functions only if there are no pre-coded functions
        in useful libraries such as sklearn.

    todos
    ------------
    * load features
    * train the model
    * save the following
        * logs
        * oofs
        * importances
        * trained models
        * submissions (if test mode)

    '''
    # -- Prepare for training
    exp_time = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    train_base_dir = './inputs/train/'
    configs = load_configs('./config.yml', logger)

    # -- Load train data
    sel_log('loading training data ...', None)
    target = pd.read_pickle(train_base_dir + 'target.pkl.gz',
                            compression='gzip')
    outliers = pd.read_pickle(train_base_dir + 'outliers.pkl.gz',
                              compression='gzip')
    # Cache can be used only in train
    if args.use_cached_features:
        features_df = pd.read_pickle('./inputs/train/cached_features.pkl.gz',
                                     compression='gzip')
    else:
        if configs['train']['all_features']:
            _features = get_all_features('./inputs/train/')
        else:
            _features = configs['features']
        features_df = load_features(_features, train_base_dir, logger)
        # gen cache file if specified for the next time
        if args.gen_cached_features:
            features_df.to_pickle('./inputs/train/cached_features.pkl.gz',
                                  compression='gzip')
    # remove invalid features
    features_df.drop(configs['invalid_features'], axis=1, inplace=True)
    # remove invalid rows
    if configs['train']['rm_outliers']:
        features_df = features_df[outliers == 0]
        target = target[outliers == 0]
    # load test features too, so that categorical encodings stay consistent
    sel_log('loading test data ...', None)
    test_base_dir = './inputs/test/'
    test_features_df = load_features(features_df.columns, test_base_dir,
                                     logger)
    # feature selection if needed
    if configs['train']['feature_selection']:
        features_df = select_features(features_df,
                                      configs['train']['feature_select_path'],
                                      configs['train']['metric'],
                                      configs['train']['feature_topk'])
        test_features_df = select_features(
            test_features_df, configs['train']['feature_select_path'],
            configs['train']['metric'], configs['train']['feature_topk'])

    features = features_df.columns
    # clarify the used categorical features
    # also encoding categorical features
    if configs['categorical_features']:
        categorical_features = sorted(
            list(set(features) & set(configs['categorical_features'])))
        trn_tst_df = pd.concat([features_df, test_features_df], axis=0)
        sel_log('label encoding ...', None)
        trn_tst_df, le_dict = label_encoding(trn_tst_df,
                                             fit_columns=categorical_features)
        features_df = trn_tst_df.iloc[:features_df.shape[0]]
        test_features_df = trn_tst_df.iloc[features_df.shape[0]:]
    else:
        categorical_features = None

    # categorical_features = get_locs(
    #     features_df, configs['categorical_features'])

    sel_log(f'the shape of features_df is {features_df.shape}', logger)

    # -- Split using uniform k-fold or stratified k-fold (by outliers)
    # NOTE: the ukf path is not stratified; stratification there is left as
    # future work
    if configs['train']['fold_type'] == 'ukf':
        ukf = UniformKFold(configs['train']['fold_num'])
        folds = ukf.split(features_df, target)
    elif configs['train']['fold_type'] == 'skf':
        skf = StratifiedKFold(configs['train']['fold_num'], shuffle=True,
                              random_state=71)
        folds = skf.split(features_df, outliers)
    else:
        print(f"ERROR: wrong fold_type, {configs['train']['fold_type']}")
        raise ValueError('invalid fold_type')
    folds, pred_folds = tee(folds)

    # -- Make training dataset
    #    train_set = mlgb.Dataset(features_df, target,
    #                             categorical_feature=categorical_features)
    #    train_set = mlgb.Dataset(features_df.values, target.values,)
    #                             feature_name=features,
    #                             categorical_feature=configs['categorical_features'])

    # -- CV
    # Set params
    PARAMS = configs['lgbm_params']
    PARAMS['nthread'] = args.nthread
    # PARAMS['categorical_feature'] = categorical_features

    sel_log('start training ...', None)
    cv_model = []
    for i, idxes in tqdm(list(enumerate(folds))):
        trn_idx, val_idx = idxes
        # -- Data resampling
        # Stock original data for validation
        if configs['preprocess']['resampling']:
            trn_idx = resampling(outliers[trn_idx],
                                 configs['preprocess']['resampling_type'],
                                 configs['preprocess']['resampling_seed'],
                                 configs['preprocess']['os_lim'])

        train_set = lightgbm.Dataset(
            features_df.iloc[trn_idx],
            target[trn_idx],
        )
        #                                     categorical_feature=categorical_features)
        valid_set = lightgbm.Dataset(
            features_df.iloc[val_idx],
            target[val_idx],
        )
        #                                     categorical_feature=categorical_features)
        booster = lightgbm.train(
            params=PARAMS.copy(),
            train_set=train_set,
            num_boost_round=20000,
            valid_sets=[valid_set, train_set],
            verbose_eval=100,
            early_stopping_rounds=200,
            categorical_feature=categorical_features,
            callbacks=[log_evaluation(logger, period=100)],
        )
        cv_model.append(booster)


#    hist, cv_model = mlgb.cv(
#        params=PARAMS,
#        num_boost_round=10000,
#        folds=folds,
#        train_set=train_set,
#        verbose_eval=100,
#        early_stopping_rounds=200,
#        metrics='rmse',
#        callbacks=[log_evaluation(logger, period=100)],
#    )

    # -- Prediction
    if configs['train']['single_model']:
        # cv_model is a plain list of boosters here, so average their best
        # iterations (cf. the mlgb.cv variant, which exposed best_iteration)
        best_iter = np.mean([booster.best_iteration for booster in cv_model])
        single_train_set = lightgbm.Dataset(features_df.values, target.values)
        single_booster = lightgbm.train(
            params=PARAMS,
            num_boost_round=int(best_iter * 1.3),
            train_set=single_train_set,
            valid_sets=[single_train_set],
            verbose_eval=100,
            early_stopping_rounds=200,
            callbacks=[log_evaluation(logger, period=100)],
        )
        oofs = [single_booster.predict(features_df.values)]
        y_trues = [target]
        val_idxes = [features_df.index]
        # train-set RMSE so rmse_mean is defined for filename_base below
        scores = [np.sqrt(mean_squared_error(target.values, oofs[0]))]
        rmse_mean, rmse_std = np.mean(scores), np.std(scores)
        y_true, y_pred = target, oofs[0]
        fold_importance_df = pd.DataFrame()
        fold_importance_df['split'] = \
            single_booster.feature_importance('split')
        fold_importance_df['gain'] = \
            single_booster.feature_importance('gain')
        fold_importance_dict = {0: fold_importance_df}
    else:
        sel_log('predicting using cv models ...', logger)
        oofs = []
        y_trues = []
        val_idxes = []
        scores = []
        outlier_scores = []
        non_outlier_scores = []
        fold_importance_dict = {}
        for i, idxes in tqdm(list(enumerate(pred_folds))):
            trn_idx, val_idx = idxes
            # booster = cv_model.boosters[i]
            booster = cv_model[i]

            # Get and store oof and y_true
            y_pred = booster.predict(features_df.values[val_idx],
                                     num_iteration=None)
            y_true = target.values[val_idx]
            oofs.append(y_pred)
            y_trues.append(y_true)
            val_idxes.append(val_idx)

            # Calc RMSE
            rmse = np.sqrt(mean_squared_error(y_true, y_pred))
            scores.append(rmse)
            fold_outliers = y_true < -30
            out_rmse = np.sqrt(
                mean_squared_error(y_true[fold_outliers],
                                   y_pred[fold_outliers]))
            outlier_scores.append(out_rmse)
            fold_non_outliers = y_true > -30
            non_out_rmse = np.sqrt(
                mean_squared_error(y_true[fold_non_outliers],
                                   y_pred[fold_non_outliers]))
            non_outlier_scores.append(non_out_rmse)

            # Save importance info
            fold_importance_df = pd.DataFrame()
            fold_importance_df['split'] = booster.feature_importance('split')
            fold_importance_df['gain'] = booster.feature_importance('gain')
            fold_importance_dict[i] = fold_importance_df

        rmse_mean, rmse_std = np.mean(scores), np.std(scores)
        out_rmse_mean = np.mean(outlier_scores)
        out_rmse_std = np.std(outlier_scores)
        non_out_rmse_mean = np.mean(non_outlier_scores)
        non_out_rmse_std = np.std(non_outlier_scores)
        sel_log(f'RMSE_mean: {rmse_mean:.4f}, RMSE_std: {rmse_std:.4f}',
                logger)
        sel_log(
            f'OUT_RMSE_mean: {out_rmse_mean:.4f}, OUT_RMSE_std: {out_rmse_std:.4f}',
            logger)
        sel_log(
            f'NON_OUT_RMSE_mean: {non_out_rmse_mean:.4f}, NON_OUT_RMSE_std: {non_out_rmse_std:.4f}',
            logger)

    # -- Post processings
    filename_base = f'{args.exp_ids[0]}_{exp_time}_{rmse_mean:.4}'

    # Save oofs
    with open('./oofs/' + filename_base + '_oofs.pkl', 'wb') as fout:
        pickle.dump([val_idxes, oofs], fout)

    # Save importances
    # save_importance(configs['features'], fold_importance_dict,
    save_importance(features,
                    fold_importance_dict,
                    './importances/' + filename_base + '_importances',
                    topk=100,
                    main_metric='split')

    # Save trained models
    with open('./trained_models/' + filename_base + '_models.pkl',
              'wb') as fout:
        pickle.dump(
            single_booster if configs['train']['single_model'] else cv_model,
            fout)

    # --- Make submission file
    if args.test:
        #        # -- Prepare for test
        #        test_base_dir = './inputs/test/'
        #
        #        sel_log('loading test data ...', None)
        #        test_features_df = load_features(
        #            features, test_base_dir, logger)
        #        # label encoding
        #        sel_log('encoding categorical features ...', None)
        #        test_features_df = fill_unseens(features_df, test_features_df,
        #                                        configs['categorical_features'],
        #                                        args.nthread)
        #        test_features_df, le_dict = label_encoding(test_features_df, le_dict)

        # -- Prediction
        sel_log('predicting for test ...', None)
        preds = []
        # for booster in tqdm(cv_model.boosters):
        for booster in tqdm(cv_model):
            pred = booster.predict(test_features_df.values, num_iteration=None)
            preds.append(pred)
        target_values = np.mean(preds, axis=0)

        # -- Make submission file
        sel_log(f'loading sample submission file ...', None)
        sub_df = pd.read_csv('./inputs/origin/sample_submission.csv.zip',
                             compression='zip')
        sub_df.target = target_values

        # print stats
        submission_filename = f'./submissions/{filename_base}_sub.csv.gz'
        sel_log(f'saving submission file to {submission_filename}', logger)
        sub_df.to_csv(submission_filename, compression='gzip', index=False)
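# Below is a minimal sketch of the sel_log helper shared by all three
# snippets, inferred from usage: a "selective" logger that always prints and
# additionally writes to the logger when one is given (the calls above with
# logger=None print only). The real project helper may differ.
def sel_log_sketch(message, logger=None):
    print(message)
    if logger is not None:
        logger.info(message)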