def main():
    """Run LightGBM cross-validated prediction and write all result files.

    Loads the application frame, splits it into train (``target >= 0``) and
    test (``target == -1``), runs ``lgb_ex.cross_prediction`` and then writes:

    * the CV feature importances to ``../valid/``,
    * optionally (``xray`` flag) the per-fold X-RAY table to ``../output/``
      and exits,
    * optionally (non-empty ``stack_name``) the stacking result to
      ``../stack/``,
    * the submission CSV to ``../submit/``.

    Relies on names defined outside this function: ``win_path``, ``target``,
    ``key``, ``model_type``, ``ignore_list``, ``params``, ``num_boost_round``,
    ``early_stopping_rounds``, ``start_time``, ``fname``, ``learning_rate``,
    ``stack_name``, ``HOME`` and ``logger``.
    """
    #========================================================================
    # Data Load
    #========================================================================
    # NOTE: the path lists below are only consumed by the alternative
    # per-feature loader, which is currently commented out.
    win_path_list = glob.glob(win_path)
    train_path_list = []
    test_path_list = []
    for path in win_path_list:
        if 'train' in path:
            train_path_list.append(path)
        elif 'test' in path:
            test_path_list.append(path)

    # train_feature_list = utils.pararell_load_data(path_list=train_path_list, delimiter='gz')
    # test_feature_list = utils.pararell_load_data(path_list=test_path_list, delimiter='gz')
    # train = pd.concat(train_feature_list, axis=1)
    # test = pd.concat(test_feature_list, axis=1)
    df = utils.read_df_pkl('../input/appli*')
    # Test rows carry the sentinel target -1.
    train = df[df[target] >= 0]
    test = df[df[target] == -1]

    metric = 'auc'
    fold = 5
    fold_type = 'stratified'
    group_col_name = ''
    oof_flg = True
    LGBM = lgb_ex(logger=logger,
                  metric=metric,
                  model_type=model_type,
                  ignore_list=ignore_list)

    train, _ = LGBM.data_check(df=train)
    test, drop_list = LGBM.data_check(df=test, test_flg=True)
    # Columns rejected on the test side are removed from both frames so the
    # feature sets stay aligned.
    if len(drop_list):
        train.drop(drop_list, axis=1, inplace=True)
        test.drop(drop_list, axis=1, inplace=True)

    #========================================================================
    # Train & Prediction Start
    #========================================================================
    LGBM = LGBM.cross_prediction(train=train,
                                 test=test,
                                 key=key,
                                 target=target,
                                 fold_type=fold_type,
                                 fold=fold,
                                 group_col_name=group_col_name,
                                 params=params,
                                 num_boost_round=num_boost_round,
                                 early_stopping_rounds=early_stopping_rounds,
                                 oof_flg=oof_flg)

    #========================================================================
    # Result
    #========================================================================
    cv_score = LGBM.cv_score
    result = LGBM.prediction
    cv_feim = LGBM.cv_feim
    feature_num = len(LGBM.use_cols)

    cv_feim.to_csv(
        f'../valid/{start_time[4:12]}_{model_type}_{fname}_feat{feature_num}_CV{cv_score}_lr{learning_rate}.csv'
    )

    #========================================================================
    # Compute and output X-RAY
    # Args:
    #     model    : the trained model
    #     train    : the dataset that was used to train the model
    #     col_list : list of columns to compute X-RAY for. When omitted,
    #                every column of the dataset is processed, but about
    #                30 columns at most is recommended given the run time.
    #========================================================================
    xray = False
    if xray:
        train.reset_index(inplace=True)
        train = train[LGBM.use_cols]
        result_xray = pd.DataFrame()
        N_sample = 150000
        max_point = 30
        for fold_num in range(fold):
            model = LGBM.fold_model_list[fold_num]
            # NOTE(review): Xray_Cal is only constructed with the fold-0
            # model; whether get_xray picks up later fold models is not
            # visible from here -- confirm.
            if fold_num == 0:
                xray_obj = Xray_Cal(logger=logger,
                                    ignore_list=ignore_list,
                                    model=model)
            xray_obj, tmp_xray = xray_obj.get_xray(base_xray=train,
                                                   col_list=train.columns,
                                                   fold_num=fold_num,
                                                   N_sample=N_sample,
                                                   max_point=max_point)
            tmp_xray.rename(columns={'xray': f'xray_{fold_num}'},
                            inplace=True)

            if len(result_xray):
                # DataFrame.merge returns a new frame; the result must be
                # assigned back, otherwise every fold after the first is
                # silently dropped from the average below.
                result_xray = result_xray.merge(
                    tmp_xray.drop('N', axis=1),
                    on=['feature', 'value'],
                    how='inner')
            else:
                result_xray = tmp_xray.copy()
            del tmp_xray
            gc.collect()

        xray_col = [col for col in result_xray.columns if col.count('xray')]
        result_xray['xray_avg'] = result_xray[xray_col].mean(axis=1)
        result_xray.to_csv(
            f'../output/{start_time[4:10]}_xray_{model_type}_CV{LGBM.cv_score}.csv'
        )
        sys.exit()

    submit = pd.read_csv('../input/sample_submission.csv')
    # submit = []

    #========================================================================
    # STACKING
    #========================================================================
    if len(stack_name) > 0:
        logger.info(f'result_stack shape: {LGBM.result_stack.shape}')
        utils.to_pkl(
            path=
            f"../stack/{start_time[4:12]}_{stack_name}_{model_type}_CV{str(cv_score).replace('.', '-')}_{feature_num}features.fp",
            obj=LGBM.result_stack)
    logger.info(
        f'FEATURE IMPORTANCE PATH: {HOME}/kaggle/home-credit-default-risk/output/cv_feature{feature_num}_importances_auc_{cv_score}.csv'
    )

    #========================================================================
    # Submission
    #========================================================================
    if len(submit) > 0:
        if stack_name == 'add_nest':
            # Several test rows can share one key here: average the
            # predictions per key before joining onto the sample submission.
            test[target] = result
            test = test.reset_index()[[
                key, target
            ]].groupby(key)[target].mean().reset_index()
            submit = submit[key].to_frame().merge(test, on=key, how='left')
            # Assign the filled column back instead of a chained in-place
            # fillna, which does not update the frame under Copy-on-Write.
            submit[target] = submit[target].fillna(0)
            submit.to_csv(
                f'../submit/{start_time[4:12]}_submit_{fname}_{model_type}_rate{learning_rate}_{feature_num}features_CV{cv_score}_LB.csv',
                index=False)
        else:
            submit[target] = result
            submit.to_csv(
                f'../submit/{start_time[4:12]}_submit_{model_type}_rate{learning_rate}_{feature_num}features_CV{cv_score}_LB.csv',
                index=False)
# test = pd.concat([base_test, df_feat.iloc[len(base_train):, :].reset_index(drop=True)], axis=1)
# The test-frame construction above is disabled; an empty list stands in.
test = []

#========================================================================
# LGBM Setting
#========================================================================
seed = 1208
metric = 'rmse'
fold_type = 'self'
group_col_name = ''
dummie = 1
oof_flg = True

#========================================================================
# Preprocessing
#========================================================================
LGBM = lgb_ex(
    logger=logger,
    metric=metric,
    model_type=model_type,
    ignore_list=ignore_list,
)

# train, test, drop_list = LGBM.data_check(train=train, test=test, target=target)
train, test, drop_list = LGBM.data_check(train=train, test=[], target=target)
if len(drop_list) > 0:
    # Only the train frame is cleaned; the matching test-side drop is disabled.
    train.drop(drop_list, axis=1, inplace=True)
    # test.drop(drop_list, axis=1, inplace=True)

#========================================================================
# Increase Valid Features
#========================================================================
# The leading empty string is kept as the first entry, ahead of the files
# found on disk.
valid_feat_list = ['', *glob.glob('../features/1_first_valid/*.gz')]

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
def objective(trial):
    """Objective for a hyperparameter search over LightGBM parameters.

    Samples ``colsample_bytree``, ``num_leaves``, ``min_child_samples`` and
    ``lambda_l2`` from ``trial``, runs ``lgb_ex.cross_validation`` on the
    enclosing-scope ``train`` frame using the externally supplied ``kfold``
    splits, logs the resulting CV score and returns it.

    Relies on names defined outside this function: ``learning_rate``,
    ``seed``, ``logger``, ``metric``, ``model_type``, ``ignore_list``,
    ``train``, ``key``, ``target``, ``fold_type``, ``fold``,
    ``group_col_name``, ``num_boost_round``, ``early_stopping_rounds`` and
    ``kfold``.

    Args:
        trial: Object providing ``suggest_uniform`` and ``suggest_int``
            (presumably an ``optuna.trial.Trial`` -- confirm against the
            caller).

    Returns:
        The ``cv_score`` attribute of the object returned by
        ``cross_validation``.
    """
    # Search space. ``subsample`` and ``max_depth`` were tuned in earlier
    # experiments and are now fixed in ``params`` below.
    colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.20, 0.33)
    num_leaves = trial.suggest_int('num_leaves', 54, 73)
    min_child_samples = trial.suggest_int('min_child_samples', 30, 75)
    # suggest_int takes integer bounds; the float literals used previously
    # described the same 3..15 integer range.
    lambda_l2 = trial.suggest_int('lambda_l2', 3, 15)

    params = {
        'num_threads': 32,
        'num_leaves': num_leaves,
        'objective': 'regression',
        "boosting": "gbdt",
        'max_depth': -1,
        'learning_rate': learning_rate,
        "min_child_samples": min_child_samples,
        "bagging_freq": 1,
        "subsample": 0.9,
        "colsample_bytree": colsample_bytree,
        "metric": 'rmse',
        "lambda_l1": 0.1,
        "lambda_l2": lambda_l2,
        "verbosity": -1,
        # Every seed-like parameter is pinned to the same value.
        'random_seed': seed,
        'bagging_seed': seed,
        'feature_fraction_seed': seed,
        'data_random_seed': seed
    }

    LGBM = lgb_ex(logger=logger,
                  metric=metric,
                  model_type=model_type,
                  ignore_list=ignore_list)
    LGBM.seed = seed

    # NOTE: ``kfold`` is not built here. A disabled variant derived it from
    # StratifiedKFold(n_splits=5, shuffle=True, random_state=seed) on an
    # outlier flag (target < -30).

    #========================================================================
    # Train & Prediction Start
    #========================================================================
    LGBM = LGBM.cross_validation(
        train=train,
        key=key,
        target=target,
        fold_type=fold_type,
        fold=fold,
        group_col_name=group_col_name,
        params=params,
        num_boost_round=num_boost_round,
        early_stopping_rounds=early_stopping_rounds,
        self_kfold=kfold,
        params_tune=True)

    cv_score = LGBM.cv_score

    # NOTE: disabled diagnostics that used to live here computed separate
    # RMSE scores for outlier rows (target < -30) and for the remaining rows
    # from the per-key predictions, and kept a running log of scores.

    logger.info(f'''
    #========================================================================
    # CV SCORE: {cv_score}
    #========================================================================''')

    return cv_score