# NOTE: these functions are excerpted from a Kaggle competition repo.
# Standard-library / third-party imports they rely on:
import datetime
import os
import pickle
from itertools import tee

import lightgbm
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.model_selection import GroupKFold, StratifiedKFold
from tqdm import tqdm

# Repo-local helpers and constants (sel_log, load_configs, load_features,
# select_features, resampling, value_resampling, calc_MCC, calc_best_MCC,
# save_importance, label_encoding, get_all_features, log_evaluation,
# fill_unseens, UniformKFold, mlgb, NES_DIR, FEATURE_DIR, etc.) are
# assumed to be imported from elsewhere in the repo.


def train(args, logger):
    '''
    policy
    ------------
    * use original functions only if there are no pre-coded functions
      in useful libraries such as sklearn.

    todos
    ------------
    * load features
    * train the model
    * save the following
        * logs
        * oofs
        * importances
        * trained models
        * submissions (if test mode)
    '''
    # -- Prepare for training
    exp_time = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    train_base_dir = './inputs/train/'
    configs = load_configs('./config.yml', logger)

    # -- Load train data
    sel_log('loading training data ...', None)
    target = pd.read_pickle(train_base_dir + 'target.pkl.gz',
                            compression='gzip')
    id_measurement = pd.read_pickle(train_base_dir + 'id_measurement.pkl.gz',
                                    compression='gzip')
    # Cache can be used only in train
    if args.use_cached_features:
        features_df = pd.read_pickle('./inputs/train/cached_features.pkl.gz',
                                     compression='gzip')
    else:
        features_df = load_features(configs['features'], train_base_dir,
                                    logger)
        # gen cache file if specified, for the next time
        if args.gen_cached_features:
            features_df.to_pickle('./inputs/train/cached_features.pkl.gz',
                                  compression='gzip')
    if configs['train']['feature_selection']:
        features_df = select_features(
            features_df,
            configs['train']['feature_select_path'],
            configs['train']['feature_select_metric'],
            configs['train']['feature_topk'])
    features = features_df.columns

    # -- Data resampling
    # Stock the original data for validation
    if configs['preprocess']['resampling']:
        target, id_measurement, features_df = resampling(
            target, id_measurement, features_df,
            configs['preprocess']['resampling_type'],
            configs['preprocess']['resampling_seed'],
            logger)

    sel_log(f'the shape of features_df is {features_df.shape}', logger)

    # -- Split the data (group k-fold or stratified k-fold w/ shuffling)
    # NOTE: the group k-fold is not stratified; stratified grouping is
    #       left as future work
    if configs['train']['fold_type'] == 'gkf':
        gkf = GroupKFold(configs['train']['fold_num'])
        folds = gkf.split(features_df, target, groups=id_measurement)
    elif configs['train']['fold_type'] == 'skf':
        # StratifiedKFold ignores groups, and random_state only takes
        # effect together with shuffle=True
        skf = StratifiedKFold(configs['train']['fold_num'],
                              shuffle=True, random_state=71)
        folds = skf.split(features_df, target)
    else:
        sel_log(f"ERROR: wrong fold_type, {configs['train']['fold_type']}",
                None)
    # gss = GroupShuffleSplit(configs['train']['fold_num'], random_state=71)
    # folds = gss.split(features_df, target, groups=id_measurement)
    folds, pred_folds = tee(folds)
    if configs['train']['label_train']:
        folds, folds_2 = tee(folds)
        folds, pred_folds_2 = tee(folds)

    # -- Make training dataset
    train_set = mlgb.Dataset(features_df.values, target.values)

    # -- CV
    # Set params
    PARAMS = configs['lgbm_params']
    PARAMS['nthread'] = args.nthread

    sel_log('start training ...', None)
    hist, cv_model = mlgb.cv(
        params=PARAMS,
        num_boost_round=10000,
        folds=folds,
        train_set=train_set,
        verbose_eval=50,
        early_stopping_rounds=200,
        metrics='auc',
        # feval=lgb_MCC,
        callbacks=[log_evaluation(logger, period=50)],
    )

    # -- Prediction
    if configs['train']['single_model']:
        best_iter = cv_model.best_iteration
        single_train_set = lightgbm.Dataset(features_df.values, target.values)
        single_booster = lightgbm.train(
            params=PARAMS,
            num_boost_round=int(best_iter * 1.3),
            train_set=single_train_set,
            valid_sets=[single_train_set],
            verbose_eval=50,
            early_stopping_rounds=200,
            callbacks=[log_evaluation(logger, period=50)],
        )
        # Calc cv MCC
        _oofs = []
        _y_trues = []
        for i, idxes in tqdm(list(enumerate(pred_folds))):
            trn_idx, val_idx = idxes
            booster = cv_model.boosters[i]
            # Get and store oof and y_true
            y_pred = booster.predict(features_df.values[val_idx])
            y_true = target.values[val_idx]
            _oofs.append(y_pred)
            _y_trues.append(y_true)
        cv_MCC, _ = calc_best_MCC(_y_trues, _oofs, bins=3000)
        sel_log(f'cv_MCC: {cv_MCC}', logger)

        # Save important info
        oofs = [single_booster.predict(features_df.values)]
        y_trues = [target]
        val_idxes = [features_df.index]
        scores = []
        y_true, y_pred = target, oofs[0]
        fold_importance_df = pd.DataFrame()
        fold_importance_df['split'] = \
            single_booster.feature_importance('split')
        fold_importance_df['gain'] = \
            single_booster.feature_importance('gain')
        fold_importance_dict = {0: fold_importance_df}
    else:
        sel_log('predicting using cv models ...', logger)
        oofs = []
        y_trues = []
        val_idxes = []
        scores = []
        fold_importance_dict = {}
        for i, idxes in tqdm(list(enumerate(pred_folds))):
            trn_idx, val_idx = idxes
            booster = cv_model.boosters[i]
            # Get and store oof and y_true
            y_pred = booster.predict(features_df.values[val_idx])
            y_true = target.values[val_idx]
            oofs.append(y_pred)
            y_trues.append(y_true)
            val_idxes.append(val_idx)
            # Calc MCC using a threshold of 0.5
            MCC = calc_MCC(y_true, y_pred, 0.5)
            scores.append(MCC)
            # Save importance info
            fold_importance_df = pd.DataFrame()
            fold_importance_df['split'] = booster.feature_importance('split')
            fold_importance_df['gain'] = booster.feature_importance('gain')
            fold_importance_dict[i] = fold_importance_df
        # y_true = np.concatenate(y_trues, axis=0)
        # y_pred = np.concatenate(oofs, axis=0)
        sel_log(f'MCC_mean: {np.mean(scores)}, MCC_std: {np.std(scores)}',
                logger)

    # Calc best MCC
    sel_log('calculating the best MCC ...', None)
    best_MCC, best_threshs = calc_best_MCC(y_trues, oofs, bins=3000)
    sel_log(f'best_threshs: {best_threshs}', logger)
    sel_log(f'best_MCC: {best_MCC}', logger)

    # -- Post processings
    filename_base = f'{args.exp_ids[0]}_{exp_time}_{best_MCC:.4}'

    # Save oofs
    with open('./oofs/' + filename_base + '_oofs.pkl', 'wb') as fout:
        pickle.dump([val_idxes, oofs, best_threshs], fout)

    # Save importances
    # save_importance(configs['features'], fold_importance_dict,
    save_importance(features, fold_importance_dict,
                    './importances/' + filename_base + '_importances')

    # Save trained models
    with open('./trained_models/' + filename_base + '_models.pkl',
              'wb') as fout:
        pickle.dump(
            single_booster if configs['train']['single_model'] else cv_model,
            fout)

    # # -- Retraining using the preds
    # if configs['train']['label_train']:
    #     # -- Make training dataset
    #     y_preds_df = y_preds_features(oofs, val_idxes)
    #     features_df_2 = features_df
    #     features_df_2 = pd.concat([features_df_2, y_preds_df], axis=1)
    #     features_2 = features_df_2.columns
    #     train_set_2 = mlgb.Dataset(features_df_2.values, target.values)
    #
    #     # -- CV
    #     sel_log('RETRAINED -- start training ...', None)
    #     hist_2, cv_model_2 = mlgb.cv(
    #         params=PARAMS,
    #         num_boost_round=10000,
    #         folds=folds_2,
    #         train_set=train_set_2,
    #         verbose_eval=50,
    #         early_stopping_rounds=200,
    #         metrics='auc',
    #         # feval=lgb_MCC,
    #         callbacks=[log_evaluation(logger, period=50)],
    #     )
    #
    #     # -- Prediction
    #     sel_log('RETRAINED -- predicting ...', logger)
    #     oofs_2 = []
    #     y_trues_2 = []
    #     val_idxes_2 = []
    #     scores_2 = []
    #     fold_importance_dict_2 = {}
    #     for i, idxes in tqdm(list(enumerate(pred_folds_2))):
    #         trn_idx, val_idx = idxes
    #         booster = cv_model_2.boosters[i]
    #
    #         # Get and store oof and y_true
    #         y_pred = booster.predict(features_df_2.values[val_idx])
    #         y_true = target.values[val_idx]
    #         oofs_2.append(y_pred)
    #         y_trues_2.append(y_true)
    #         val_idxes_2.append(val_idx)
    #
    #         # Calc MCC using a threshold of 0.5
    #         MCC = calc_MCC(y_true, y_pred, 0.5)
    #         scores_2.append(MCC)
    #
    #         # Save importance info
    #         fold_importance_df = pd.DataFrame()
    #         fold_importance_df['split'] = booster.feature_importance('split')
    #         fold_importance_df['gain'] = booster.feature_importance('gain')
    #         fold_importance_dict_2[i] = fold_importance_df
    #
    #     sel_log(
    #         f'RETRAINED -- MCC_mean: {np.mean(scores_2)}, '
    #         f'MCC_std: {np.std(scores_2)}',
    #         logger)
    #
    #     # Calc best MCC
    #     sel_log('RETRAINED -- calculating the best MCC ...', None)
    #     y_true_2 = np.concatenate(y_trues_2, axis=0)
    #     y_pred_2 = np.concatenate(oofs_2, axis=0)
    #     best_MCC_2, best_thresh_2 = calc_best_MCC(y_true_2, y_pred_2,
    #                                               bins=3000)
    #     sel_log(
    #         f'RETRAINED -- best_MCC: {best_MCC_2}, '
    #         f'best_thresh: {best_thresh_2}',
    #         logger)
    #
    #     # -- Post processings
    #     filename_base = (f'{args.exp_ids[0]}_{exp_time}'
    #                      f'_{best_MCC_2:.4}_{best_thresh_2:.3}')
    #
    #     # Save oofs
    #     with open('./oofs/' + filename_base + '_oofs_retrained.pkl',
    #               'wb') as fout:
    #         pickle.dump([val_idxes_2, oofs_2], fout)
    #
    #     # Save importances
    #     save_importance(features_2, fold_importance_dict_2,
    #                     './importances/' + filename_base +
    #                     '_importances_retrained')
    #
    #     # Save trained models
    #     with open('./trained_models/' + filename_base +
    #               '_models_retrained.pkl', 'wb') as fout:
    #         pickle.dump(cv_model_2, fout)

    # --- Make submission file
    if args.test:
        # -- Prepare for test
        test_base_dir = './inputs/test/'
        sel_log('loading test data ...', None)
        test_features_df = load_features(configs['features'], test_base_dir,
                                         logger)
        if configs['train']['feature_selection']:
            test_features_df = select_features(
                test_features_df,
                configs['train']['feature_select_path'],
                configs['train']['feature_select_metric'],
                configs['train']['feature_topk'])

        # -- Prediction
        sel_log('predicting for test ...', None)
        preds = []
        models = [single_booster] if configs['train']['single_model'] \
            else cv_model.boosters
        for booster, best_thresh in tqdm(zip(models, best_threshs)):
            pred = booster.predict(test_features_df.values)
            # preds.append(pred * 0.5 / best_thresh)
            preds.append(pred > best_thresh)
        sub_values = np.mean(preds, axis=0)
        target_values = (sub_values > 0.5).astype(np.int32)

        # if configs['train']['label_train']:
        #     # -- Use retrained info
        #     test_y_preds_df = y_preds_features(
        #         [sub_values], [np.arange(len(sub_values))])
        #     test_features_df = pd.concat(
        #         [test_features_df, test_y_preds_df], axis=1)
        #     sel_log('RETRAINED -- predicting ...', None)
        #     preds = []
        #     for booster in tqdm(cv_model_2.boosters):
        #         preds.append(booster.predict(test_features_df.values))
        #     sub_values = np.mean(preds, axis=0)
        #     target_values = (sub_values > best_thresh_2).astype(np.int32)

        # -- Make submission file
        sel_log('loading sample submission file ...', None)
        sub_df = pd.read_csv('./inputs/origin/sample_submission.csv')
        sub_df.target = target_values
        # print stats
        sel_log(f'positive percentage: '
                f'{sub_df.target.sum() / sub_df.target.count() * 100:.3}%',
                logger=logger)
        submission_filename = f'./submissions/{filename_base}_sub.csv.gz'
        sel_log(f'saving submission file to {submission_filename}', logger)
        sub_df.to_csv(submission_filename, compression='gzip', index=False)
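# ----------------------------------------------------------------------
# `calc_best_MCC` above is a repo-local helper whose implementation is
# not shown in this excerpt. The function below is a hypothetical
# minimal sketch (name, signature, and the `bins` semantics are
# assumptions) of the per-fold threshold search it appears to perform:
# scan `bins` candidate thresholds over each fold's OOF predictions,
# keep the threshold maximizing Matthews correlation coefficient, and
# return the mean best MCC together with the per-fold thresholds.
# ----------------------------------------------------------------------
from sklearn.metrics import matthews_corrcoef


def calc_best_MCC_sketch(y_trues, y_preds, bins=3000):
    """Per-fold MCC threshold search (illustrative sketch only)."""
    best_MCCs, best_threshs = [], []
    for y_true, y_pred in zip(y_trues, y_preds):
        # candidate thresholds spanning this fold's prediction range
        cands = np.linspace(np.min(y_pred), np.max(y_pred), bins)
        mccs = [matthews_corrcoef(y_true, (y_pred > t).astype(int))
                for t in cands]
        best_i = int(np.argmax(mccs))
        best_MCCs.append(mccs[best_i])
        best_threshs.append(cands[best_i])
    return float(np.mean(best_MCCs)), best_threshs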
def t003_lgb_train(args, script_name, configs, logger):
    '''
    policy
    ------------
    * use original functions only if there are no pre-coded functions
      in useful libraries such as sklearn.

    todos
    ------------
    * load features
    * train the model
    * save the following
        * logs
        * oofs
        * importances
        * trained models
        * submissions (if test mode)
    '''
    # -- Prepare for training
    exp_time = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')

    # -- Load train data
    sel_log('loading training data ...', None)
    trn_ids = pd.read_pickle(NES_DIR + 'trn_ID_code_w_fakes.pkl.gz',
                             compression='gzip')
    tst_ids = pd.read_pickle(NES_DIR + 'tst_ID_code.pkl.gz',
                             compression='gzip')
    target = pd.read_pickle(NES_DIR + 'target_w_fakes.pkl.gz',
                            compression='gzip')
    if args.debug:
        sample_idxes = trn_ids.reset_index(drop=True).sample(
            random_state=71, frac=0.05).index
        target = target.iloc[sample_idxes].reset_index(drop=True)
        trn_ids = trn_ids.iloc[sample_idxes].reset_index(drop=True)

    # load features
    if configs['train']['all_features']:
        _features = get_all_features(FEATURE_DIR)
    else:
        _features = configs['features']
    trn_tst_df = load_features(_features, FEATURE_DIR, logger=logger)\
        .set_index('ID_code')

    # feature selection if needed
    if configs['train']['feature_selection']:
        trn_tst_df = select_features(trn_tst_df,
                                     configs['train']['feature_select_path'],
                                     configs['train']['metric'],
                                     configs['train']['feature_topk'])
    features = trn_tst_df.columns

    # split train and test
    sel_log('now splitting the df to train and test ones ...', None)
    features_df = trn_tst_df.loc[trn_ids].reset_index(drop=True)
    test_features_df = trn_tst_df.loc[tst_ids].reset_index(drop=True)

    # -- Split using stratified k-fold w/ shuffling
    if configs['train']['fold_type'] == 'skf':
        skf = StratifiedKFold(configs['train']['fold_num'],
                              shuffle=True, random_state=71)
        folds = skf.split(features_df, target)
        configs['train']['single_model'] = False
    else:
        print(f"ERROR: wrong fold_type, {configs['train']['fold_type']}")
    folds, pred_folds = tee(folds)

    # -- Make training dataset
    # print the used features and the shape
    sel_log(f'used features are {features_df.columns.tolist()}', logger)
    sel_log(f'the shape of features_df is {features_df.shape}', logger)

    # -- CV
    # Set params
    PARAMS = configs['lgbm_params']
    PARAMS['nthread'] = os.cpu_count()

    sel_log('start training ...', None)
    oofs = []
    y_trues = []
    val_idxes = []
    scores = []
    fold_importance_dict = {}
    cv_model = []
    for i, idxes in tqdm(list(enumerate(folds))):
        trn_idx, val_idx = idxes
        # -- Data resampling
        # Stock the original data for validation
        fold_features_df, fold_target = value_resampling(
            features_df.iloc[trn_idx], target.iloc[trn_idx],
            configs['train']['sampling_type'],
            configs['train']['sampling_random_state'],
            configs['train']['os_lim'],
            configs['train']['pos_t'],
            configs['train']['neg_t'],
            logger=logger)
        # make lgbm datasets
        train_set = lightgbm.Dataset(fold_features_df, fold_target)
        valid_set = lightgbm.Dataset(
            features_df.iloc[val_idx],
            target.iloc[val_idx],
        )
        # train
        booster = lightgbm.train(
            params=PARAMS.copy(),
            train_set=train_set,
            num_boost_round=100000,
            valid_sets=[valid_set, train_set],
            verbose_eval=1000,
            early_stopping_rounds=2000,
            callbacks=[log_evaluation(logger, period=1000)],
        )
        # predict using the trained model
        y_pred = booster.predict(features_df.values[val_idx],
                                 num_iteration=None)
        y_true = target.values[val_idx]
        oofs.append(y_pred)
        y_trues.append(y_true)
        val_idxes.append(val_idx)
        # Calc AUC
        auc = roc_auc_score(y_true, y_pred)
        sel_log(f'fold AUC: {auc}', logger=logger)
        scores.append(auc)
        # Save importance info
        fold_importance_df = pd.DataFrame()
        fold_importance_df['split'] = booster.feature_importance('split')
        fold_importance_df['gain'] = booster.feature_importance('gain')
        fold_importance_dict[i] = fold_importance_df
        # save the model
        cv_model.append(booster)

    auc_mean, auc_std = np.mean(scores), np.std(scores)
    sel_log(f'AUC_mean: {auc_mean:.5f}, AUC_std: {auc_std:.5f}', logger)

    # -- Post processings
    filename_base = f'{script_name}_{exp_time}_{auc_mean:.5}'

    # Save oofs
    with open('./mnt/oofs/' + filename_base + '_oofs.pkl', 'wb') as fout:
        pickle.dump([val_idxes, oofs], fout)

    # Save importances
    # save_importance(configs['features'], fold_importance_dict,
    save_importance(features, fold_importance_dict,
                    './mnt/importances/' + filename_base + '_importances',
                    topk=100, main_metric='gain')

    # Save trained models
    with open('./mnt/trained_models/' + filename_base + '_models.pkl',
              'wb') as fout:
        pickle.dump(cv_model, fout)

    # --- Make submission file
    if args.test:
        if configs['train']['single_model']:
            # train a single model on the full training data, using the
            # mean best iteration of the cv boosters
            best_iter = np.mean(
                [booster.best_iteration for booster in cv_model])
            single_train_set = lightgbm.Dataset(features_df, target.values)
            single_booster = lightgbm.train(
                params=PARAMS,
                num_boost_round=int(best_iter * 1.3),
                train_set=single_train_set,
                verbose_eval=1000,
                callbacks=[log_evaluation(logger, period=1000)],
            )
            # re-save model for prediction
            # cv_model.append(single_booster)

        # -- Prediction
        sel_log('predicting for test ...', None)
        preds = []
        # for booster in tqdm(cv_model.boosters):
        for booster in tqdm(cv_model):
            pred = booster.predict(test_features_df.values,
                                   num_iteration=None)
            pred = pd.Series(pred)
            # rank-normalize so differently calibrated boosters blend
            # on a common scale
            preds.append(pred.rank() / pred.shape[0])
        if len(cv_model) > 1:
            target_values = np.mean(preds, axis=0)
        else:
            target_values = preds[0]

        # blend the single model
        if configs['train']['single_model']:
            pred = single_booster.predict(test_features_df.values,
                                          num_iteration=None)
            pred = pd.Series(pred)
            target_values = (target_values +
                             (pred.rank() / pred.shape[0])) / 2

        # -- Make submission file
        sel_log('loading sample submission file ...', None)
        sub_df = pd.read_csv('./mnt/inputs/origin/sample_submission.csv.zip',
                             compression='zip')
        sub_df.target = target_values
        submission_filename = f'./mnt/submissions/{filename_base}_sub.csv.gz'
        sel_log(f'saving submission file to {submission_filename}', logger)
        sub_df.to_csv(submission_filename, compression='gzip', index=False)

        if args.submit:
            os.system(f'kaggle competitions submit '
                      f'santander-customer-transaction-prediction '
                      f'-f {submission_filename} -m "{args.message}"')

    return auc_mean, auc_std
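# ----------------------------------------------------------------------
# The test-time blend in `t003_lgb_train` rank-normalizes each booster's
# predictions (`pred.rank() / pred.shape[0]`) before averaging. Because
# AUC depends only on the ordering of scores, mapping every model's
# outputs onto a common (0, 1] rank scale removes calibration
# differences between folds before the mean is taken. A hypothetical
# standalone version of that blend (not part of the pipeline):
# ----------------------------------------------------------------------
def rank_average(pred_list):
    """Average a list of prediction arrays on a common rank scale."""
    ranked = [pd.Series(p).rank() / len(p) for p in pred_list]
    return np.mean(ranked, axis=0)


# usage sketch:
#   target_values = rank_average([b.predict(X_test) for b in cv_model])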
def train(args, logger):
    '''
    policy
    ------------
    * use original functions only if there are no pre-coded functions
      in useful libraries such as sklearn.

    todos
    ------------
    * load features
    * train the model
    * save the following
        * logs
        * oofs
        * importances
        * trained models
        * submissions (if test mode)
    '''
    # -- Prepare for training
    exp_time = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    train_base_dir = './inputs/train/'
    configs = load_configs('./config.yml', logger)

    # -- Load train data
    sel_log('loading training data ...', None)
    target = pd.read_pickle(train_base_dir + 'target.pkl.gz',
                            compression='gzip')
    outliers = pd.read_pickle(train_base_dir + 'outliers.pkl.gz',
                              compression='gzip')
    # Cache can be used only in train
    if args.use_cached_features:
        features_df = pd.read_pickle('./inputs/train/cached_features.pkl.gz',
                                     compression='gzip')
    else:
        if configs['train']['all_features']:
            _features = get_all_features('./inputs/train/')
        else:
            _features = configs['features']
        features_df = load_features(_features, train_base_dir, logger)
        # gen cache file if specified, for the next time
        if args.gen_cached_features:
            features_df.to_pickle('./inputs/train/cached_features.pkl.gz',
                                  compression='gzip')

    # remove invalid features
    features_df.drop(configs['invalid_features'], axis=1, inplace=True)

    # remove invalid rows
    if configs['train']['rm_outliers']:
        features_df = features_df[outliers == 0]
        target = target[outliers == 0]

    # -- Load test data (needed below to fit the label encoders on
    #    train + test jointly)
    sel_log('loading test data ...', None)
    test_base_dir = './inputs/test/'
    test_features_df = load_features(features_df.columns, test_base_dir,
                                     logger)

    # feature selection if needed
    if configs['train']['feature_selection']:
        features_df = select_features(
            features_df,
            configs['train']['feature_select_path'],
            configs['train']['metric'],
            configs['train']['feature_topk'])
        test_features_df = select_features(
            test_features_df,
            configs['train']['feature_select_path'],
            configs['train']['metric'],
            configs['train']['feature_topk'])
    features = features_df.columns

    # clarify the used categorical features,
    # and label-encode them over train + test
    if configs['categorical_features']:
        categorical_features = sorted(
            list(set(features) & set(configs['categorical_features'])))
        trn_tst_df = pd.concat([features_df, test_features_df], axis=0)
        sel_log('label encoding ...', None)
        trn_tst_df, le_dict = label_encoding(
            trn_tst_df, fit_columns=categorical_features)
        features_df = trn_tst_df.iloc[:features_df.shape[0]]
        test_features_df = trn_tst_df.iloc[features_df.shape[0]:]
    else:
        categorical_features = None
    # categorical_features = get_locs(
    #     features_df, configs['categorical_features'])

    sel_log(f'the shape of features_df is {features_df.shape}', logger)

    # -- Split using uniform k-fold or stratified k-fold w/ shuffling
    if configs['train']['fold_type'] == 'ukf':
        ukf = UniformKFold(configs['train']['fold_num'])
        folds = ukf.split(features_df, target)
    elif configs['train']['fold_type'] == 'skf':
        # stratify on the binary outlier flag, not the continuous target
        skf = StratifiedKFold(configs['train']['fold_num'],
                              shuffle=True, random_state=71)
        folds = skf.split(features_df, outliers)
    else:
        print(f"ERROR: wrong fold_type, {configs['train']['fold_type']}")
    folds, pred_folds = tee(folds)

    # -- Make training dataset
    # train_set = mlgb.Dataset(features_df, target,
    #                          categorical_feature=categorical_features)
    # train_set = mlgb.Dataset(features_df.values, target.values,)
    #                          feature_name=features,
    #                          categorical_feature=configs['categorical_features'])

    # -- CV
    # Set params
    PARAMS = configs['lgbm_params']
    PARAMS['nthread'] = args.nthread
    # PARAMS['categorical_feature'] = categorical_features

    sel_log('start training ...', None)
    cv_model = []
    for i, idxes in tqdm(list(enumerate(folds))):
        trn_idx, val_idx = idxes
        # -- Data resampling
        # Stock the original data for validation
        if configs['preprocess']['resampling']:
            trn_idx = resampling(outliers.iloc[trn_idx],
                                 configs['preprocess']['resampling_type'],
                                 configs['preprocess']['resampling_seed'],
                                 configs['preprocess']['os_lim'])
        train_set = lightgbm.Dataset(
            features_df.iloc[trn_idx],
            target.iloc[trn_idx],
            # categorical_feature=categorical_features,
        )
        valid_set = lightgbm.Dataset(
            features_df.iloc[val_idx],
            target.iloc[val_idx],
            # categorical_feature=categorical_features,
        )
        booster = lightgbm.train(
            params=PARAMS.copy(),
            train_set=train_set,
            num_boost_round=20000,
            valid_sets=[valid_set, train_set],
            verbose_eval=100,
            early_stopping_rounds=200,
            categorical_feature=categorical_features,
            callbacks=[log_evaluation(logger, period=100)],
        )
        cv_model.append(booster)
    # hist, cv_model = mlgb.cv(
    #     params=PARAMS,
    #     num_boost_round=10000,
    #     folds=folds,
    #     train_set=train_set,
    #     verbose_eval=100,
    #     early_stopping_rounds=200,
    #     metrics='rmse',
    #     callbacks=[log_evaluation(logger, period=100)],
    # )

    # -- Prediction
    if configs['train']['single_model']:
        # cv_model is a plain list of boosters here, so use the mean
        # best iteration across the folds
        best_iter = np.mean([booster.best_iteration for booster in cv_model])
        single_train_set = lightgbm.Dataset(features_df.values, target.values)
        single_booster = lightgbm.train(
            params=PARAMS,
            num_boost_round=int(best_iter * 1.3),
            train_set=single_train_set,
            valid_sets=[single_train_set],
            verbose_eval=100,
            early_stopping_rounds=200,
            callbacks=[log_evaluation(logger, period=100)],
        )
        oofs = [single_booster.predict(features_df.values)]
        y_trues = [target]
        val_idxes = [features_df.index]
        scores = []
        y_true, y_pred = target, oofs[0]
        # compute an in-sample RMSE so that filename_base below is
        # well-defined in this branch, too
        rmse_mean = np.sqrt(mean_squared_error(y_true, y_pred))
        fold_importance_df = pd.DataFrame()
        fold_importance_df['split'] = \
            single_booster.feature_importance('split')
        fold_importance_df['gain'] = \
            single_booster.feature_importance('gain')
        fold_importance_dict = {0: fold_importance_df}
    else:
        sel_log('predicting using cv models ...', logger)
        oofs = []
        y_trues = []
        val_idxes = []
        scores = []
        outlier_scores = []
        non_outlier_scores = []
        fold_importance_dict = {}
        for i, idxes in tqdm(list(enumerate(pred_folds))):
            trn_idx, val_idx = idxes
            # booster = cv_model.boosters[i]
            booster = cv_model[i]
            # Get and store oof and y_true
            y_pred = booster.predict(features_df.values[val_idx],
                                     num_iteration=None)
            y_true = target.values[val_idx]
            oofs.append(y_pred)
            y_trues.append(y_true)
            val_idxes.append(val_idx)
            # Calc RMSE, overall and split by the outlier flag
            rmse = np.sqrt(mean_squared_error(y_true, y_pred))
            scores.append(rmse)
            fold_outliers = y_true < -30
            out_rmse = np.sqrt(
                mean_squared_error(y_true[fold_outliers],
                                   y_pred[fold_outliers]))
            outlier_scores.append(out_rmse)
            fold_non_outliers = ~fold_outliers
            non_out_rmse = np.sqrt(
                mean_squared_error(y_true[fold_non_outliers],
                                   y_pred[fold_non_outliers]))
            non_outlier_scores.append(non_out_rmse)
            # Save importance info
            fold_importance_df = pd.DataFrame()
            fold_importance_df['split'] = booster.feature_importance('split')
            fold_importance_df['gain'] = booster.feature_importance('gain')
            fold_importance_dict[i] = fold_importance_df

        rmse_mean, rmse_std = np.mean(scores), np.std(scores)
        out_rmse_mean, out_rmse_std = \
            np.mean(outlier_scores), np.std(outlier_scores)
        non_out_rmse_mean, non_out_rmse_std = \
            np.mean(non_outlier_scores), np.std(non_outlier_scores)
        sel_log(f'RMSE_mean: {rmse_mean:.4f}, RMSE_std: {rmse_std:.4f}',
                logger)
        sel_log(f'OUT_RMSE_mean: {out_rmse_mean:.4f}, '
                f'OUT_RMSE_std: {out_rmse_std:.4f}', logger)
        sel_log(f'NON_OUT_RMSE_mean: {non_out_rmse_mean:.4f}, '
                f'NON_OUT_RMSE_std: {non_out_rmse_std:.4f}', logger)

    # -- Post processings
    filename_base = f'{args.exp_ids[0]}_{exp_time}_{rmse_mean:.4}'

    # Save oofs
    with open('./oofs/' + filename_base + '_oofs.pkl', 'wb') as fout:
        pickle.dump([val_idxes, oofs], fout)

    # Save importances
    # save_importance(configs['features'], fold_importance_dict,
    save_importance(features, fold_importance_dict,
                    './importances/' + filename_base + '_importances',
                    topk=100, main_metric='split')

    # Save trained models
    with open('./trained_models/' + filename_base + '_models.pkl',
              'wb') as fout:
        pickle.dump(
            single_booster if configs['train']['single_model'] else cv_model,
            fout)

    # --- Make submission file
    if args.test:
        # # -- Prepare for test
        # test_base_dir = './inputs/test/'
        #
        # sel_log('loading test data ...', None)
        # test_features_df = load_features(
        #     features, test_base_dir, logger)
        # # label encoding
        # sel_log('encoding categorical features ...', None)
        # test_features_df = fill_unseens(features_df, test_features_df,
        #                                 configs['categorical_features'],
        #                                 args.nthread)
        # test_features_df, le_dict = label_encoding(test_features_df, le_dict)

        # -- Prediction
        sel_log('predicting for test ...', None)
        preds = []
        # for booster in tqdm(cv_model.boosters):
        for booster in tqdm(cv_model):
            pred = booster.predict(test_features_df.values,
                                   num_iteration=None)
            preds.append(pred)
        target_values = np.mean(preds, axis=0)

        # -- Make submission file
        sel_log('loading sample submission file ...', None)
        sub_df = pd.read_csv('./inputs/origin/sample_submission.csv.zip',
                             compression='zip')
        sub_df.target = target_values
        submission_filename = f'./submissions/{filename_base}_sub.csv.gz'
        sel_log(f'saving submission file to {submission_filename}', logger)
        sub_df.to_csv(submission_filename, compression='gzip', index=False)
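# ----------------------------------------------------------------------
# In the regression `train` above, the 'skf' branch stratifies the folds
# on the binary `outliers` flag instead of the continuous target, so the
# rare extreme targets (y_true < -30) end up evenly spread across folds
# and the per-fold OUT_RMSE numbers stay comparable. A hypothetical
# standalone illustration of that split (function name and parameters
# are assumptions):
# ----------------------------------------------------------------------
def outlier_stratified_folds(features_df, outlier_mask, n_splits=5, seed=71):
    """Yield (trn_idx, val_idx) pairs with outliers balanced per fold."""
    skf = StratifiedKFold(n_splits, shuffle=True, random_state=seed)
    # stratify on the 0/1 outlier flag, not on the regression target
    return skf.split(features_df, outlier_mask)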