'layers': 3, # 샘플을 위해 빨리 끝나도록 설정 'nb_epoch': 5, # 1000 'patience': 10, 'dropout': 0.5, 'units': 512, } # 특징 지정 features = [f'feat_{i}' for i in range(1, 94)] # xgboost에 의한 학습 및 예측 runner = Runner('xgb1', ModelXGB, features, params_xgb) runner.run_train_cv() runner.run_predict_cv() Submission.create_submission('xgb1') # 신경망에 의한 학습 예측 runner = Runner('nn1', ModelNN, features, params_nn) runner.run_train_cv() runner.run_predict_cv() Submission.create_submission('nn1') ''' # (참고)xgboost를 통한 학습 및 예측 - 학습 데이터 전체를 사용하는 경우 runner = Runner('xgb1-train-all', ModelXGB, features, params_xgb_all) runner.run_train_all() runner.run_test_all() Submission.create_submission('xgb1-train-all') '''
use_feature_name = runner.get_feature_name() # 今回の学習で使用する特徴量名を取得 # モデルのconfigをjsonで保存 value_list = [features, use_feature_name, model_params, cv, setting] save_model_config(key_list, value_list, dir_name, run_name) # runner.visualize_corr() # 相関係数を可視化して保存 if cv.get('method') == 'None': runner.run_train_all() # 全データで学習 runner.run_predict_all() # 推論 else: runner.run_train_cv() # 学習 ModelLGB.calc_feature_importance(dir_name, run_name, use_feature_name) # feature_importanceを計算 runner.run_predict_cv() # 推論 Submission.create_submission(run_name, dir_name, setting.get('target')) # submit作成 # ###################################################### # 学習・推論 xgboost ################################### # run nameの設定 run_name = 'xgb' run_name = run_name + suffix dir_name = MODEL_DIR_NAME + run_name + '/' # exist_check(MODEL_DIR_NAME, run_name) # 実行可否確認 my_makedirs(dir_name) # runディレクトリの作成。ここにlogなどが吐かれる # 諸々の設定 setting = {
def main(mode='prd', create_features=True, model_type='lgb') -> str: # confirm('***** mode:{}, create_feature:{} '.format(str(mode), str(create_features))) now = datetime.datetime.now() suffix = now.strftime("_%m%d%H%M") # CVの設定.methodは[KFold, StratifiedKFold ,GroupKFold, StratifiedGroupKFold, CustomTimeSeriesSplitter, TrainTestSplit]から選択可能 # CVしない場合(全データで学習させる場合)はmethodに'None'を設定 # StratifiedKFold or GroupKFold or StratifiedGroupKFold の場合はcv_target_gr, cv_target_sfに対象カラム名を設定する # TrainTestSplitの場合はtest_sizeにtest setの割合を設定する cv = { 'method': 'StratifiedKFold', 'n_splits': 5, 'random_state': 42, 'shuffle': True, 'cv_target_gr': 'chip_id', 'cv_target_sf': 'target' } # run nameの設定 run_name = 'cat' run_name = run_name + '_' + cv.get('method') + suffix + 'depth5_pl' dir_name = MODEL_DIR_NAME + run_name + '/' exist_check(MODEL_DIR_NAME, run_name) my_makedirs(dir_name) # 諸々の設定 setting = { 'run_name': run_name, # run名 'feature_directory': FEATURE_DIR_NAME, # 特徴量の読み込み先ディレクトリ 'target': 'target', # 目的変数 'calc_shap': False, # shap値を計算するか否か 'save_train_pred': True # trainデータでの推論値を保存するか否か(trainデータでの推論値を特徴量として加えたい場合はTrueに設定する) } # モデルのパラメータ model_params = { 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'learning_rate': 0.05, 'iterations': 4000, 'early_stopping_rounds': 400, 'random_seed': 42, 'verbose_eval': 100, 'depth': 5 } features = [ # 'chip_id', 'exc_wl', # 'layout_a', # 'layout_x', # 'layout_y', 'distance', 'distance_x', 'distance_y', 'pos_x', 'params0', 'params1', 'params2', 'params3', 'params4', 'params5', 'params6', # --- TODO: paramsの交互作用特徴量 'params0_multi_rms', 'params0_divid_rms', # 'params0_plus_rms', # 'params0_minus_rms', 'params0_multi_beta', 'params0_divid_beta', # 'params0_plus_beta', # 'params0_minus_beta', 'params1_multi_rms', 'params1_divid_rms', # 'params1_plus_rms', # 'params1_minus_rms', 'params1_multi_beta', 'params1_divid_beta', # 'params1_plus_beta', # 'params1_minus_beta', 'params2_multi_rms', 'params2_divid_rms', # 'params2_plus_rms', # 
'params2_minus_rms', 'params2_multi_beta', 'params2_divid_beta', # 'params2_plus_beta', # 'params2_minus_beta', 'params3_multi_rms', 'params3_divid_rms', # 'params3_plus_rms', # 'params3_minus_rms', 'params3_multi_beta', 'params3_divid_beta', # 'params3_plus_beta', # 'params3_minus_beta', 'params4_multi_rms', 'params4_divid_rms', # 'params4_plus_rms', # 'params4_minus_rms', 'params4_multi_beta', 'params4_divid_beta', # 'params4_plus_beta', # 'params4_minus_beta', 'params5_multi_rms', 'params5_divid_rms', # 'params5_plus_rms', # 'params5_minus_rms', 'params5_multi_beta', 'params5_divid_beta', # 'params5_plus_beta', # 'params5_minus_beta', 'params6_multi_rms', 'params6_divid_rms', # 'params6_plus_rms', # 'params6_minus_rms', 'params6_multi_beta', 'params6_divid_beta', # 'params6_plus_beta', # 'params6_minus_beta', 'params0_multi_params1', 'params0_divid_params1', # 'params0_plus_params1', # 'params0_minus_params1', 'params0_multi_params2', 'params0_divid_params2', # 'params0_plus_params2', # 'params0_minus_params2', 'params0_multi_params3', 'params0_divid_params3', # 'params0_plus_params3', # 'params0_minus_params3', 'params0_multi_params4', 'params0_divid_params4', # 'params0_plus_params4', # 'params0_minus_params4', 'params0_multi_params5', 'params0_divid_params5', # 'params0_plus_params5', # 'params0_minus_params5', 'params0_multi_params6', 'params0_divid_params6', # 'params0_plus_params6', # 'params0_minus_params6', 'params1_multi_params0', 'params1_divid_params0', # 'params1_plus_params0', # 'params1_minus_params0', 'params1_multi_params2', 'params1_divid_params2', # 'params1_plus_params2', # 'params1_minus_params2', 'params1_multi_params3', 'params1_divid_params3', # 'params1_plus_params3', # 'params1_minus_params3', 'params1_multi_params4', 'params1_divid_params4', # 'params1_plus_params4', # 'params1_minus_params4', 'params1_multi_params5', 'params1_divid_params5', # 'params1_plus_params5', # 'params1_minus_params5', 'params1_multi_params6', 
'params1_divid_params6', # 'params1_plus_params6', # 'params1_minus_params6', 'params2_multi_params0', 'params2_divid_params0', # 'params2_plus_params0', # 'params2_minus_params0', 'params2_multi_params1', 'params2_divid_params1', # 'params2_plus_params1', # 'params2_minus_params1', 'params2_multi_params3', 'params2_divid_params3', # 'params2_plus_params3', # 'params2_minus_params3', 'params2_multi_params4', 'params2_divid_params4', # 'params2_plus_params4', # 'params2_minus_params4', 'params2_multi_params5', 'params2_divid_params5', # 'params2_plus_params5', # 'params2_minus_params5', # 'params2_absminus_params5', 'params2_multi_params6', 'params2_divid_params6', # 'params2_plus_params6', # 'params2_minus_params6', 'params3_multi_params0', 'params3_divid_params0', # 'params3_plus_params0', # 'params3_minus_params0', 'params3_multi_params1', 'params3_divid_params1', # 'params3_plus_params1', # 'params3_minus_params1', 'params3_multi_params2', 'params3_divid_params2', # 'params3_plus_params2', # 'params3_minus_params2', 'params3_multi_params4', 'params3_divid_params4', # 'params3_plus_params4', # 'params3_minus_params4', 'params3_multi_params5', 'params3_divid_params5', # 'params3_plus_params5', # 'params3_minus_params5', 'params3_multi_params6', 'params3_divid_params6', # 'params3_plus_params6', # 'params3_minus_params6', 'params4_multi_params0', 'params4_divid_params0', # 'params4_plus_params0', # 'params4_minus_params0', 'params4_multi_params1', 'params4_divid_params1', # 'params4_plus_params1', # 'params4_minus_params1', 'params4_multi_params2', 'params4_divid_params2', # 'params4_plus_params2', # 'params4_minus_params2', 'params4_multi_params3', 'params4_divid_params3', # 'params4_plus_params3', # 'params4_minus_params3', 'params4_multi_params5', 'params4_divid_params5', # 'params4_plus_params5', # 'params4_minus_params5', 'params4_multi_params6', 'params4_divid_params6', # 'params4_plus_params6', # 'params4_minus_params6', 'params5_multi_params0', 
'params5_divid_params0', # 'params5_plus_params0', # 'params5_minus_params0', 'params5_multi_params1', 'params5_divid_params1', # 'params5_plus_params1', # 'params5_minus_params1', 'params5_multi_params2', 'params5_divid_params2', # 'params5_plus_params2', # 'params5_minus_params2', 'params5_multi_params3', 'params5_divid_params3', # 'params5_plus_params3', # 'params5_minus_params3', 'params5_multi_params4', 'params5_divid_params4', # 'params5_plus_params4', # 'params5_minus_params4', 'params5_multi_params6', 'params5_divid_params6', # 'params5_plus_params6', # 'params5_minus_params6', 'params6_multi_params0', 'params6_divid_params0', # 'params6_plus_params0', # 'params6_minus_params0', 'params6_multi_params1', 'params6_divid_params1', # 'params6_plus_params1', # 'params6_minus_params1', 'params6_multi_params2', 'params6_divid_params2', # 'params6_plus_params2', # 'params6_minus_params2', 'params6_multi_params3', 'params6_divid_params3', # 'params6_plus_params3', # 'params6_minus_params3', 'params6_multi_params4', 'params6_divid_params4', # 'params6_plus_params4', # 'params6_minus_params4', 'params6_multi_params5', 'params6_divid_params5', # 'params6_plus_params5', # 'params6_minus_params5', 'rms', 'beta', # --- TODO: スペクトル波長データ 'intensity_max', 'intensity_min', 'intensity_mean', 'intensity_std', 'intensity_sum', 'intensity_median', 'intensity_amplitude_v', # 'intensity_max_log1p', # 'intensity_mean_log1p', # 'intensity_std_log1p', # 'intensity_sum_log1p', # 'intensity_median_log1p', # 'intensity_amplitude_v_log1p', 'intensity_q10', 'intensity_q25', 'intensity_q50', 'intensity_q75', 'intensity_q80', 'intensity_q85', 'intensity_q90', 'intensity_max_minus_q90', 'intensity_max_minus_q85', 'intensity_max_minus_q80', 'intensity_max_minus_q75', 'intensity_max_minus_q50', 'intensity_max_multi_q90', 'intensity_max_multi_q85', 'intensity_max_multi_q80', 'intensity_max_multi_q75', 'intensity_max_multi_q50', 'intensity_max_divid_q90', 'intensity_max_divid_q85', 
'intensity_max_divid_q80', 'intensity_max_divid_q75', 'intensity_max_divid_q50', 'intensity_q85_divid_q90', 'intensity_q80_divid_q90', 'intensity_q75_divid_q90', 'intensity_q50_divid_q90', # --- ピーク&半値幅のデータ 'fwhm_09', # 'num_peak_09', 'fwhm_088', # 'num_peak_088', 'fwhm_086', # 'num_peak_086', 'fwhm_084', # 'num_peak_084', 'fwhm_082', # 'num_peak_082', 'fwhm_08', # 'num_peak_08', 'fwhm_078', # 'num_peak_078', 'fwhm_076', # 'num_peak_076', 'fwhm_074', # 'num_peak_074', 'fwhm_072', # 'num_peak_072', 'fwhm_07', # 'num_peak_07', 'fwhm_068', # 'num_peak_068', 'fwhm_066', # 'num_peak_066', 'fwhm_064', # 'num_peak_064', 'fwhm_062', # 'num_peak_062', 'fwhm_06', # 'num_peak_06', 'fwhm_058', # 'num_peak_058', 'fwhm_056', # 'num_peak_056', 'fwhm_054', # 'num_peak_054', 'fwhm_052', # 'num_peak_052', 'fwhm_05', # 'num_peak_05', 'fwhm_divid_num_peak_09', 'fwhm_mult_num_peak_09', 'fwhm_divid_num_peak_088', 'fwhm_mult_num_peak_088', 'fwhm_divid_num_peak_086', 'fwhm_mult_num_peak_086', 'fwhm_divid_num_peak_084', 'fwhm_mult_num_peak_084', 'fwhm_divid_num_peak_082', 'fwhm_mult_num_peak_082', 'fwhm_divid_num_peak_08', 'fwhm_mult_num_peak_08', 'fwhm_divid_num_peak_078', 'fwhm_mult_num_peak_078', 'fwhm_divid_num_peak_076', 'fwhm_mult_num_peak_076', 'fwhm_divid_num_peak_074', 'fwhm_mult_num_peak_074', 'fwhm_divid_num_peak_072', 'fwhm_mult_num_peak_072', 'fwhm_divid_num_peak_07', 'fwhm_mult_num_peak_07', # 'fwhm_divid_num_peak_068', # 'fwhm_mult_num_peak_068', # 'fwhm_divid_num_peak_066', # 'fwhm_mult_num_peak_066', # 'fwhm_divid_num_peak_064', # 'fwhm_mult_num_peak_064', # 'fwhm_divid_num_peak_062', # 'fwhm_mult_num_peak_062', # 'fwhm_divid_num_peak_06', # 'fwhm_mult_num_peak_06', # 'fwhm_divid_num_peak_058', # 'fwhm_mult_num_peak_058', # 'fwhm_divid_num_peak_056', # 'fwhm_mult_num_peak_056', # 'fwhm_divid_num_peak_054', # 'fwhm_mult_num_peak_054', # 'fwhm_divid_num_peak_052', # 'fwhm_mult_num_peak_052', # 'fwhm_divid_num_peak_05', # 'fwhm_mult_num_peak_05', # --- TODO: 波形の圧縮データ 
'dc_umap1', 'dc_umap2', 'dc_tsne1', 'dc_tsne2', # --- TODO: カテゴリごとの平均値と自身との差分 # 'diff_chip_id_mean_params0', # 'diff_chip_id_mean_params1', # 'diff_chip_id_mean_params2', # 'diff_chip_id_mean_params3', # 'diff_chip_id_mean_params4', # 'diff_chip_id_mean_params5', # 'diff_chip_id_mean_params6', # 'diff_chip_id_mean_rms', # 'diff_chip_id_mean_beta', # 'diff_chip_id_mean_intensity_amplitude_v', # 'diff_chip_id_mean_fwhm_09', # # 'diff_chip_id_mean_num_peak_09', # 'diff_chip_id_mean_fwhm_088', # # 'diff_chip_id_mean_num_peak_088', # 'diff_chip_id_mean_fwhm_086', # # 'diff_chip_id_mean_num_peak_086', # 'diff_chip_id_mean_fwhm_084', # # 'diff_chip_id_mean_num_peak_084', # 'diff_chip_id_mean_fwhm_082', # # 'diff_chip_id_mean_num_peak_082', # 'diff_chip_id_mean_fwhm_08', # # 'diff_chip_id_mean_num_peak_08', # 'diff_chip_id_mean_fwhm_078', # # 'diff_chip_id_mean_num_peak_078', # 'diff_chip_id_mean_fwhm_076', # # 'diff_chip_id_mean_num_peak_076', # 'diff_chip_id_mean_fwhm_074', # # 'diff_chip_id_mean_num_peak_074', # 'diff_chip_id_mean_fwhm_072', # # 'diff_chip_id_mean_num_peak_072', # 'diff_chip_id_mean_fwhm_07', # # 'diff_chip_id_mean_num_peak_07', # 'diff_chip_id_mean_fwhm_068', # # 'diff_chip_id_mean_num_peak_068', # 'diff_chip_id_mean_fwhm_066', # # 'diff_chip_id_mean_num_peak_066', # 'diff_chip_id_mean_fwhm_064', # # 'diff_chip_id_mean_num_peak_064', # 'diff_chip_id_mean_fwhm_062', # # 'diff_chip_id_mean_num_peak_062', # 'diff_chip_id_mean_fwhm_06', # # 'diff_chip_id_mean_num_peak_06', # 'diff_chip_id_mean_fwhm_058', # # 'diff_chip_id_mean_num_peak_058', # 'diff_chip_id_mean_fwhm_056', # # 'diff_chip_id_mean_num_peak_056', # 'diff_chip_id_mean_fwhm_054', # # 'diff_chip_id_mean_num_peak_054', # 'diff_chip_id_mean_fwhm_052', # # 'diff_chip_id_mean_num_peak_052', # 'diff_chip_id_mean_fwhm_05', # # 'diff_chip_id_mean_num_peak_05', # # 'diff_chip_id_mean_intensity_max', # # 'diff_chip_id_mean_intensity_min', # # 'diff_chip_id_mean_intensity_q90', # # 
'diff_chip_id_mean_intensity_q75', 'diff_exc_wl_mean_params0', 'diff_exc_wl_mean_params1', 'diff_exc_wl_mean_params2', 'diff_exc_wl_mean_params3', 'diff_exc_wl_mean_params4', 'diff_exc_wl_mean_params5', 'diff_exc_wl_mean_params6', 'diff_exc_wl_mean_rms', 'diff_exc_wl_mean_beta', 'diff_exc_wl_mean_intensity_amplitude_v', 'diff_exc_wl_mean_fwhm_09', # 'diff_exc_wl_mean_num_peak_09', 'diff_exc_wl_mean_fwhm_088', # 'diff_exc_wl_mean_num_peak_088', 'diff_exc_wl_mean_fwhm_086', # 'diff_exc_wl_mean_num_peak_086', 'diff_exc_wl_mean_fwhm_084', # 'diff_exc_wl_mean_num_peak_084', 'diff_exc_wl_mean_fwhm_082', # 'diff_exc_wl_mean_num_peak_082', 'diff_exc_wl_mean_fwhm_08', # 'diff_exc_wl_mean_num_peak_08', 'diff_exc_wl_mean_fwhm_078', # 'diff_exc_wl_mean_num_peak_078', 'diff_exc_wl_mean_fwhm_076', # 'diff_exc_wl_mean_num_peak_076', 'diff_exc_wl_mean_fwhm_074', # 'diff_exc_wl_mean_num_peak_074', 'diff_exc_wl_mean_fwhm_072', # 'diff_exc_wl_mean_num_peak_072', 'diff_exc_wl_mean_fwhm_07', # 'diff_exc_wl_mean_num_peak_07', 'diff_exc_wl_mean_fwhm_068', # 'diff_exc_wl_mean_num_peak_068', 'diff_exc_wl_mean_fwhm_066', # 'diff_exc_wl_mean_num_peak_066', 'diff_exc_wl_mean_fwhm_064', # 'diff_exc_wl_mean_num_peak_064', 'diff_exc_wl_mean_fwhm_062', # 'diff_exc_wl_mean_num_peak_062', 'diff_exc_wl_mean_fwhm_06', # 'diff_exc_wl_mean_num_peak_06', 'diff_exc_wl_mean_fwhm_058', # 'diff_exc_wl_mean_num_peak_058', 'diff_exc_wl_mean_fwhm_056', # 'diff_exc_wl_mean_num_peak_056', 'diff_exc_wl_mean_fwhm_054', # 'diff_exc_wl_mean_num_peak_054', 'diff_exc_wl_mean_fwhm_052', # 'diff_exc_wl_mean_num_peak_052', 'diff_exc_wl_mean_fwhm_05', # # 'diff_exc_wl_mean_num_peak_05', # # 'diff_exc_wl_mean_intensity_max', # # 'diff_exc_wl_mean_intensity_min', # # 'diff_exc_wl_mean_intensity_q90', # # 'diff_exc_wl_mean_intensity_q75', # 'diff_layout_a_mean_params0', # 'diff_layout_a_mean_params1', # 'diff_layout_a_mean_params2', # 'diff_layout_a_mean_params3', # 'diff_layout_a_mean_params4', # 
'diff_layout_a_mean_params5', # 'diff_layout_a_mean_params6', # 'diff_layout_a_mean_rms', # 'diff_layout_a_mean_beta', # 'diff_layout_a_mean_intensity_amplitude_v', # 'diff_layout_a_mean_fwhm_09', # # 'diff_layout_a_mean_num_peak_09', # 'diff_layout_a_mean_fwhm_088', # # 'diff_layout_a_mean_num_peak_088', # 'diff_layout_a_mean_fwhm_086', # # 'diff_layout_a_mean_num_peak_086', # 'diff_layout_a_mean_fwhm_084', # # 'diff_layout_a_mean_num_peak_084', # 'diff_layout_a_mean_fwhm_082', # # 'diff_layout_a_mean_num_peak_082', # 'diff_layout_a_mean_fwhm_08', # # 'diff_layout_a_mean_num_peak_08', # 'diff_layout_a_mean_fwhm_078', # # 'diff_layout_a_mean_num_peak_078', # 'diff_layout_a_mean_fwhm_076', # # 'diff_layout_a_mean_num_peak_076', # 'diff_layout_a_mean_fwhm_074', # # 'diff_layout_a_mean_num_peak_074', # 'diff_layout_a_mean_fwhm_072', # # 'diff_layout_a_mean_num_peak_072', # 'diff_layout_a_mean_fwhm_07', # # 'diff_layout_a_mean_num_peak_07', # 'diff_layout_a_mean_fwhm_068', # # 'diff_layout_a_mean_num_peak_068', # 'diff_layout_a_mean_fwhm_066', # # 'diff_layout_a_mean_num_peak_066', # 'diff_layout_a_mean_fwhm_064', # # 'diff_layout_a_mean_num_peak_064', # 'diff_layout_a_mean_fwhm_062', # # 'diff_layout_a_mean_num_peak_062', # 'diff_layout_a_mean_fwhm_06', # # 'diff_layout_a_mean_num_peak_06', # 'diff_layout_a_mean_fwhm_058', # # 'diff_layout_a_mean_num_peak_058', # 'diff_layout_a_mean_fwhm_056', # # 'diff_layout_a_mean_num_peak_056', # 'diff_layout_a_mean_fwhm_054', # # 'diff_layout_a_mean_num_peak_054', # 'diff_layout_a_mean_fwhm_052', # # 'diff_layout_a_mean_num_peak_052', # 'diff_layout_a_mean_fwhm_05', # # 'diff_layout_a_mean_num_peak_05', # # 'diff_layout_a_mean_intensity_max', # # 'diff_layout_a_mean_intensity_min', # # 'diff_layout_a_mean_intensity_q90', # # 'diff_layout_a_mean_intensity_q75', # --- TODO: tsfreshで生成した特徴 'intensity__ratio_beyond_r_sigma__r_10', 'intensity__ar_coefficient__coeff_2__k_10', 
'intensity__change_quantiles__f_agg_"var"__isabs_True__qh_1.0__ql_0.8', 'intensity__change_quantiles__f_agg_"var"__isabs_True__qh_1.0__ql_0.6', 'intensity__change_quantiles__f_agg_"var"__isabs_True__qh_1.0__ql_0.4', 'intensity__change_quantiles__f_agg_"var"__isabs_False__qh_1.0__ql_0.8', 'intensity__change_quantiles__f_agg_"var"__isabs_True__qh_1.0__ql_0.2', 'intensity__change_quantiles__f_agg_"var"__isabs_True__qh_1.0__ql_0.0', 'intensity__change_quantiles__f_agg_"var"__isabs_False__qh_1.0__ql_0.6', 'intensity__change_quantiles__f_agg_"var"__isabs_False__qh_1.0__ql_0.4', 'intensity__change_quantiles__f_agg_"var"__isabs_False__qh_1.0__ql_0.0', 'intensity__cid_ce__normalize_False', 'intensity__change_quantiles__f_agg_"var"__isabs_False__qh_1.0__ql_0.2', 'intensity__fft_coefficient__attr_"abs"__coeff_55', 'intensity__fft_coefficient__attr_"abs"__coeff_56', 'intensity__large_standard_deviation__r_0.1', 'intensity__agg_linear_trend__attr_"stderr"__chunk_len_5__f_agg_"var"', 'intensity__fft_coefficient__attr_"abs"__coeff_64', 'intensity__fft_coefficient__attr_"abs"__coeff_57', 'intensity__fft_coefficient__attr_"abs"__coeff_51', 'intensity__fft_coefficient__attr_"abs"__coeff_52', 'intensity__fft_coefficient__attr_"abs"__coeff_66', 'intensity__fft_coefficient__attr_"abs"__coeff_68', 'intensity__fft_coefficient__attr_"abs"__coeff_59', 'intensity__fft_coefficient__attr_"abs"__coeff_61', 'intensity__fft_coefficient__attr_"abs"__coeff_53', 'intensity__change_quantiles__f_agg_"mean"__isabs_True__qh_1.0__ql_0.8', 'intensity__fft_coefficient__attr_"abs"__coeff_54', 'intensity__fft_coefficient__attr_"abs"__coeff_65', 'intensity__partial_autocorrelation__lag_2', 'intensity__fft_coefficient__attr_"abs"__coeff_63', 'intensity__fft_coefficient__attr_"abs"__coeff_62', 'intensity__fft_coefficient__attr_"abs"__coeff_71', 'intensity__ar_coefficient__coeff_1__k_10', 'intensity__fft_coefficient__attr_"abs"__coeff_50', 'intensity__fft_coefficient__attr_"abs"__coeff_49', 
'intensity__fft_coefficient__attr_"abs"__coeff_48', 'intensity__fft_coefficient__attr_"abs"__coeff_58', 'intensity__fft_coefficient__attr_"abs"__coeff_46', 'intensity__fft_coefficient__attr_"abs"__coeff_69', 'intensity__fft_coefficient__attr_"abs"__coeff_67', 'intensity__change_quantiles__f_agg_"mean"__isabs_True__qh_1.0__ql_0.6', 'intensity__fft_coefficient__attr_"abs"__coeff_47', 'intensity__agg_linear_trend__attr_"stderr"__chunk_len_50__f_agg_"max"', 'intensity__fft_coefficient__attr_"abs"__coeff_60', 'intensity__fft_coefficient__attr_"abs"__coeff_80', 'intensity__fft_coefficient__attr_"abs"__coeff_72', 'intensity__fft_coefficient__attr_"abs"__coeff_76', 'intensity__binned_entropy__max_bins_10', 'intensity__fft_coefficient__attr_"abs"__coeff_70', 'intensity__fft_coefficient__attr_"abs"__coeff_79', 'intensity__ar_coefficient__coeff_3__k_10', 'intensity__fft_coefficient__attr_"abs"__coeff_44', 'intensity__fft_coefficient__attr_"abs"__coeff_78', 'intensity__fft_coefficient__attr_"abs"__coeff_74', 'intensity__fft_coefficient__attr_"abs"__coeff_77', 'intensity__fft_coefficient__attr_"abs"__coeff_45', 'intensity__maximum', 'intensity__fft_coefficient__attr_"abs"__coeff_81', 'intensity__fft_coefficient__attr_"abs"__coeff_75', 'intensity__agg_linear_trend__attr_"stderr"__chunk_len_10__f_agg_"var"', 'intensity__fft_coefficient__attr_"abs"__coeff_43', 'intensity__fft_coefficient__attr_"abs"__coeff_83', 'intensity__fft_coefficient__attr_"abs"__coeff_40', 'intensity__fft_coefficient__attr_"abs"__coeff_82', 'intensity__fft_coefficient__attr_"abs"__coeff_73', 'intensity__fft_coefficient__attr_"abs"__coeff_42', 'intensity__fft_coefficient__attr_"abs"__coeff_41', 'intensity__change_quantiles__f_agg_"mean"__isabs_True__qh_1.0__ql_0.4', 'intensity__agg_linear_trend__attr_"stderr"__chunk_len_10__f_agg_"max"', 'intensity__fft_coefficient__attr_"abs"__coeff_36', 'intensity__fft_coefficient__attr_"abs"__coeff_86', 'intensity__fft_coefficient__attr_"abs"__coeff_39', 
'intensity__fft_coefficient__attr_"abs"__coeff_87', 'intensity__index_mass_quantile__q_0.2', 'intensity__fft_coefficient__attr_"abs"__coeff_84', 'intensity__energy_ratio_by_chunks__num_segments_10__segment_focus_2', 'intensity__mean_abs_change', 'intensity__change_quantiles__f_agg_"mean"__isabs_True__qh_1.0__ql_0.0', 'intensity__absolute_sum_of_changes', 'intensity__fft_aggregated__aggtype_"kurtosis"', 'intensity__fft_aggregated__aggtype_"variance"', 'intensity__energy_ratio_by_chunks__num_segments_10__segment_focus_1', 'intensity__fft_coefficient__attr_"abs"__coeff_38', 'intensity__index_mass_quantile__q_0.3', 'intensity__fft_coefficient__attr_"abs"__coeff_37', 'intensity__fft_coefficient__attr_"abs"__coeff_85', 'intensity__agg_linear_trend__attr_"stderr"__chunk_len_5__f_agg_"max"', 'intensity__fft_coefficient__attr_"abs"__coeff_90', 'intensity__fft_coefficient__attr_"abs"__coeff_88', 'intensity__fft_coefficient__attr_"abs"__coeff_92', 'intensity__change_quantiles__f_agg_"mean"__isabs_True__qh_1.0__ql_0.2', 'intensity__fft_coefficient__attr_"abs"__coeff_94', 'intensity__fft_coefficient__attr_"abs"__coeff_89', 'intensity__fft_coefficient__attr_"abs"__coeff_93', 'intensity__fft_coefficient__attr_"abs"__coeff_35', 'intensity__kurtosis', 'intensity__skewness', 'intensity__agg_linear_trend__attr_"slope"__chunk_len_5__f_agg_"var"', 'intensity__fft_coefficient__attr_"abs"__coeff_97', 'intensity__energy_ratio_by_chunks__num_segments_10__segment_focus_3', 'intensity__energy_ratio_by_chunks__num_segments_10__segment_focus_0', 'intensity__spkt_welch_density__coeff_2', 'intensity__fft_coefficient__attr_"abs"__coeff_34', 'intensity__index_mass_quantile__q_0.4', 'intensity__partial_autocorrelation__lag_5', 'intensity__index_mass_quantile__q_0.1', 'intensity__agg_linear_trend__attr_"stderr"__chunk_len_50__f_agg_"var"', 'intensity__ratio_beyond_r_sigma__r_0.5', 'intensity__agg_linear_trend__attr_"rvalue"__chunk_len_50__f_agg_"max"', 
'intensity__fft_coefficient__attr_"abs"__coeff_95', 'intensity__partial_autocorrelation__lag_6', 'intensity__approximate_entropy__m_2__r_0.9', 'intensity__count_below_mean', 'intensity__count_above_mean', 'intensity__fft_coefficient__attr_"abs"__coeff_91', 'intensity__approximate_entropy__m_2__r_0.7', 'intensity__agg_linear_trend__attr_"slope"__chunk_len_10__f_agg_"max"', 'intensity__agg_linear_trend__attr_"slope"__chunk_len_10__f_agg_"var"', 'intensity__approximate_entropy__m_2__r_0.5', 'intensity__agg_linear_trend__attr_"slope"__chunk_len_5__f_agg_"max"', 'intensity__sample_entropy', 'intensity__approximate_entropy__m_2__r_0.3', 'intensity__fft_coefficient__attr_"abs"__coeff_96', 'intensity__fft_coefficient__attr_"imag"__coeff_1', 'intensity__agg_linear_trend__attr_"rvalue"__chunk_len_50__f_agg_"mean"', 'intensity__linear_trend__attr_"stderr"', 'intensity__linear_trend__attr_"slope"', 'intensity__fft_coefficient__attr_"abs"__coeff_4', 'intensity__agg_linear_trend__attr_"rvalue"__chunk_len_10__f_agg_"max"', 'intensity__agg_linear_trend__attr_"slope"__chunk_len_5__f_agg_"mean"', 'intensity__fft_coefficient__attr_"abs"__coeff_33', 'intensity__first_location_of_maximum', 'intensity__last_location_of_maximum', 'intensity__agg_linear_trend__attr_"rvalue"__chunk_len_50__f_agg_"var"', 'intensity__agg_linear_trend__attr_"rvalue"__chunk_len_10__f_agg_"mean"', 'intensity__fft_coefficient__attr_"abs"__coeff_3', 'intensity__agg_linear_trend__attr_"stderr"__chunk_len_50__f_agg_"mean"', 'intensity__fft_coefficient__attr_"abs"__coeff_32', 'intensity__agg_linear_trend__attr_"slope"__chunk_len_10__f_agg_"mean"', 'intensity__agg_linear_trend__attr_"rvalue"__chunk_len_5__f_agg_"mean"', 'intensity__standard_deviation', 'intensity__variance', 'intensity__agg_linear_trend__attr_"rvalue"__chunk_len_5__f_agg_"max"', 'intensity__agg_linear_trend__attr_"rvalue"__chunk_len_5__f_agg_"min"', 'intensity__linear_trend__attr_"rvalue"', 'intensity__fft_coefficient__attr_"abs"__coeff_6', 
'intensity__agg_linear_trend__attr_"slope"__chunk_len_50__f_agg_"max"', 'intensity__energy_ratio_by_chunks__num_segments_10__segment_focus_9', 'intensity__agg_linear_trend__attr_"slope"__chunk_len_5__f_agg_"min"', 'intensity__agg_linear_trend__attr_"rvalue"__chunk_len_10__f_agg_"var"', 'intensity__agg_linear_trend__attr_"stderr"__chunk_len_5__f_agg_"mean"', 'intensity__fft_coefficient__attr_"abs"__coeff_2', 'intensity__fft_coefficient__attr_"abs"__coeff_99', 'intensity__augmented_dickey_fuller__attr_"usedlag"__autolag_"AIC"', 'intensity__spkt_welch_density__coeff_5', 'intensity__agg_linear_trend__attr_"slope"__chunk_len_10__f_agg_"min"', 'intensity__agg_linear_trend__attr_"rvalue"__chunk_len_10__f_agg_"min"', 'intensity__agg_linear_trend__attr_"stderr"__chunk_len_10__f_agg_"mean"', 'intensity__agg_linear_trend__attr_"rvalue"__chunk_len_5__f_agg_"var"', 'intensity__fft_coefficient__attr_"abs"__coeff_31', 'intensity__fft_coefficient__attr_"abs"__coeff_5', 'intensity__agg_linear_trend__attr_"slope"__chunk_len_50__f_agg_"mean"', 'intensity__c3__lag_1', 'intensity__fft_coefficient__attr_"abs"__coeff_7', 'intensity__fft_coefficient__attr_"abs"__coeff_8', 'intensity__fft_coefficient__attr_"abs"__coeff_30', 'intensity__fft_coefficient__attr_"abs"__coeff_9', 'intensity__fft_coefficient__attr_"abs"__coeff_98', 'intensity__ar_coefficient__coeff_7__k_10', 'intensity__ratio_beyond_r_sigma__r_7', 'intensity__ar_coefficient__coeff_6__k_10', 'intensity__fft_coefficient__attr_"abs"__coeff_29', 'intensity__agg_linear_trend__attr_"slope"__chunk_len_50__f_agg_"var"', 'intensity__fft_coefficient__attr_"abs"__coeff_1', 'intensity__variation_coefficient', 'intensity__fft_coefficient__attr_"abs"__coeff_10', 'intensity__max_langevin_fixed_point__m_3__r_30', 'intensity__c3__lag_2', 'intensity__spkt_welch_density__coeff_8', 'intensity__energy_ratio_by_chunks__num_segments_10__segment_focus_8', 'intensity__abs_energy', 'intensity__agg_linear_trend__attr_"stderr"__chunk_len_10__f_agg_"min"', 
'intensity__c3__lag_3', 'intensity__fft_coefficient__attr_"abs"__coeff_11', 'intensity__ratio_beyond_r_sigma__r_1', 'intensity__fft_coefficient__attr_"abs"__coeff_13', 'intensity__agg_linear_trend__attr_"stderr"__chunk_len_5__f_agg_"min"', 'intensity__fft_coefficient__attr_"real"__coeff_1', 'intensity__index_mass_quantile__q_0.6', 'intensity__fft_coefficient__attr_"abs"__coeff_27', 'intensity__partial_autocorrelation__lag_7', 'intensity__fft_coefficient__attr_"abs"__coeff_12', 'intensity__agg_linear_trend__attr_"slope"__chunk_len_50__f_agg_"min"', 'intensity__linear_trend__attr_"pvalue"', 'intensity__fft_coefficient__attr_"abs"__coeff_28', 'intensity__agg_linear_trend__attr_"rvalue"__chunk_len_50__f_agg_"min"', 'intensity__agg_linear_trend__attr_"intercept"__chunk_len_5__f_agg_"min"', 'intensity__fft_coefficient__attr_"abs"__coeff_15', 'intensity__fft_coefficient__attr_"abs"__coeff_25', 'intensity__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"min"', 'intensity__agg_linear_trend__attr_"intercept"__chunk_len_50__f_agg_"min"', 'intensity__linear_trend__attr_"intercept"', 'intensity__fft_coefficient__attr_"abs"__coeff_14', 'intensity__agg_linear_trend__attr_"intercept"__chunk_len_5__f_agg_"mean"', 'intensity__fft_aggregated__aggtype_"skew"', 'intensity__fft_coefficient__attr_"abs"__coeff_16', 'intensity__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"mean"', 'intensity__partial_autocorrelation__lag_4', 'intensity__fft_coefficient__attr_"abs"__coeff_26', 'intensity__fft_coefficient__attr_"angle"__coeff_1', 'intensity__agg_linear_trend__attr_"intercept"__chunk_len_5__f_agg_"max"', 'intensity__fft_coefficient__attr_"abs"__coeff_17', 'intensity__index_mass_quantile__q_0.9', 'intensity__fft_coefficient__attr_"abs"__coeff_24', 'intensity__fft_coefficient__attr_"abs"__coeff_18', 'intensity__longest_strike_above_mean', 'intensity__fft_aggregated__aggtype_"centroid"', 'intensity__longest_strike_below_mean', 'intensity__approximate_entropy__m_2__r_0.1', 
'intensity__fft_coefficient__attr_"abs"__coeff_22', 'intensity__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"max"', 'intensity__agg_linear_trend__attr_"intercept"__chunk_len_50__f_agg_"mean"', 'intensity__fft_coefficient__attr_"abs"__coeff_23', 'intensity__cwt_coefficients__coeff_5__w_5__widths_(2, 5, 10, 20)', 'intensity__cwt_coefficients__coeff_4__w_5__widths_(2, 5, 10, 20)', 'intensity__cwt_coefficients__coeff_6__w_5__widths_(2, 5, 10, 20)', 'intensity__sum_of_reoccurring_data_points', 'intensity__partial_autocorrelation__lag_1', 'intensity__autocorrelation__lag_1', 'intensity__ar_coefficient__coeff_8__k_10', 'intensity__fft_coefficient__attr_"imag"__coeff_7', 'intensity__cwt_coefficients__coeff_3__w_5__widths_(2, 5, 10, 20)', 'intensity__sum_of_reoccurring_values', 'intensity__fft_coefficient__attr_"abs"__coeff_20', 'intensity__fft_coefficient__attr_"abs"__coeff_19', 'intensity__ar_coefficient__coeff_4__k_10', 'intensity__cid_ce__normalize_True', 'intensity__fft_coefficient__attr_"abs"__coeff_21', 'intensity__cwt_coefficients__coeff_7__w_5__widths_(2, 5, 10, 20)', 'intensity__energy_ratio_by_chunks__num_segments_10__segment_focus_4', 'intensity__friedrich_coefficients__coeff_3__m_3__r_30', 'intensity__ar_coefficient__coeff_10__k_10', 'intensity__index_mass_quantile__q_0.7', 'intensity__minimum', 'intensity__large_standard_deviation__r_0.15000000000000002', 'intensity__quantile__q_0.1', 'intensity__quantile__q_0.2', 'intensity__fft_coefficient__attr_"real"__coeff_9', 'intensity__cwt_coefficients__coeff_7__w_10__widths_(2, 5, 10, 20)', 'intensity__cwt_coefficients__coeff_8__w_10__widths_(2, 5, 10, 20)', 'intensity__cwt_coefficients__coeff_2__w_5__widths_(2, 5, 10, 20)', 'intensity__quantile__q_0.3', 'intensity__cwt_coefficients__coeff_9__w_10__widths_(2, 5, 10, 20)', 'intensity__quantile__q_0.4', 'intensity__cwt_coefficients__coeff_6__w_10__widths_(2, 5, 10, 20)', 'intensity__cwt_coefficients__coeff_2__w_2__widths_(2, 5, 10, 20)', 
'intensity__range_count__max_1000000000000.0__min_0', 'intensity__count_above__t_0', 'intensity__count_below__t_0', 'intensity__cwt_coefficients__coeff_8__w_5__widths_(2, 5, 10, 20)', 'intensity__cwt_coefficients__coeff_10__w_10__widths_(2, 5, 10, 20)', 'intensity__ratio_beyond_r_sigma__r_5', 'intensity__median', 'intensity__cwt_coefficients__coeff_5__w_10__widths_(2, 5, 10, 20)', 'intensity__number_crossing_m__m_0', 'intensity__number_crossing_m__m_1', 'intensity__number_crossing_m__m_-1', 'intensity__fft_coefficient__attr_"imag"__coeff_21', 'intensity__cwt_coefficients__coeff_11__w_10__widths_(2, 5, 10, 20)', 'intensity__quantile__q_0.6', 'intensity__cwt_coefficients__coeff_4__w_10__widths_(2, 5, 10, 20)', 'intensity__cwt_coefficients__coeff_3__w_2__widths_(2, 5, 10, 20)', 'intensity__ar_coefficient__coeff_5__k_10', 'intensity__cwt_coefficients__coeff_12__w_10__widths_(2, 5, 10, 20)', 'intensity__fft_coefficient__attr_"angle"__coeff_11', 'intensity__first_location_of_minimum', 'intensity__range_count__max_1__min_-1', 'intensity__last_location_of_minimum', 'intensity__quantile__q_0.7', 'intensity__fft_coefficient__attr_"angle"__coeff_21', 'intensity__cwt_coefficients__coeff_9__w_5__widths_(2, 5, 10, 20)', 'intensity__number_peaks__n_50', 'intensity__cwt_coefficients__coeff_3__w_10__widths_(2, 5, 10, 20)', 'intensity__cwt_coefficients__coeff_13__w_10__widths_(2, 5, 10, 20)', 'intensity__fft_coefficient__attr_"real"__coeff_61', 'intensity__change_quantiles__f_agg_"mean"__isabs_True__qh_0.4__ql_0.2', 'intensity__value_count__value_1', 'intensity__partial_autocorrelation__lag_9', 'intensity__fft_coefficient__attr_"real"__coeff_51', 'intensity__fft_coefficient__attr_"imag"__coeff_18', 'intensity__cwt_coefficients__coeff_0__w_20__widths_(2, 5, 10, 20)', 'intensity__ratio_beyond_r_sigma__r_1.5', 'intensity__cwt_coefficients__coeff_14__w_10__widths_(2, 5, 10, 20)', 'intensity__change_quantiles__f_agg_"var"__isabs_False__qh_0.4__ql_0.2', 
'intensity__fft_coefficient__attr_"angle"__coeff_18', 'intensity__cwt_coefficients__coeff_1__w_5__widths_(2, 5, 10, 20)', 'intensity__fft_coefficient__attr_"real"__coeff_8', 'intensity__fft_coefficient__attr_"real"__coeff_22', 'intensity__fft_coefficient__attr_"imag"__coeff_2', 'intensity__fft_coefficient__attr_"imag"__coeff_22', 'intensity__friedrich_coefficients__coeff_2__m_3__r_30', 'intensity__cwt_coefficients__coeff_1__w_20__widths_(2, 5, 10, 20)', 'intensity__cwt_coefficients__coeff_1__w_2__widths_(2, 5, 10, 20)', 'intensity__value_count__value_-1', 'intensity__agg_autocorrelation__f_agg_"median"__maxlag_40', 'intensity__fft_coefficient__attr_"real"__coeff_50', 'intensity__cwt_coefficients__coeff_2__w_20__widths_(2, 5, 10, 20)', 'intensity__fft_coefficient__attr_"imag"__coeff_62', 'intensity__cwt_coefficients__coeff_3__w_20__widths_(2, 5, 10, 20)', 'intensity__fft_coefficient__attr_"real"__coeff_4', 'intensity__fft_coefficient__attr_"angle"__coeff_13', 'intensity__friedrich_coefficients__coeff_0__m_3__r_30', 'intensity__agg_linear_trend__attr_"intercept"__chunk_len_50__f_agg_"max"', 'intensity__cwt_coefficients__coeff_2__w_10__widths_(2, 5, 10, 20)', 'intensity__fft_coefficient__attr_"real"__coeff_2', 'intensity__quantile__q_0.8', 'intensity__autocorrelation__lag_9', 'intensity__fft_coefficient__attr_"real"__coeff_46', 'intensity__number_cwt_peaks__n_1', # --- TODO: camaroさんの特徴量 # 'peak_wave_length', # 'peak_intensity', # 'mean', # 'std', # 'skew', # 'kurtosis', # 'skew_10', # 'kurtosis_10', # 'mean_10', # 'std_10', # 'skew_15', # 'kurtosis_15', # 'mean_15', # 'std_15', # 'skew_20', # 'kurtosis_20', # 'mean_20', # 'std_20', # 'skew_25', # 'kurtosis_25', # 'mean_25', # 'std_25', # 'skew_30', # 'kurtosis_30', # 'mean_30', # 'std_30', # 'skew_50', # 'kurtosis_50', # 'mean_50', # 'std_50', # 'skew_100', # 'kurtosis_100', # 'mean_100', # 'std_100', # 'skew_150', # 'kurtosis_150', # 'mean_150', # 'std_150', # 'skew_200', # 'kurtosis_200', # 'mean_200', # 'std_200', 
# 'em_ev', # 'ex_ev', # 'ev_diff', # 'ev_ratio', # 'num_peak', # 'num_strong_peak', # 'peak_wave_length2', # 'peak_intensity2', # 'peak_ratio', # 'peak_ratio2' ] try: # インスタンス生成 runner = atm5_Runner(run_name, ModelCB, features, setting, model_params, cv, FEATURE_DIR_NAME, MODEL_DIR_NAME) use_feature_name = runner.get_feature_name() # 今回の学習で使用する特徴量名を取得 # モデルのconfigをjsonで保存 value_list = [use_feature_name, model_params, cv, setting] save_model_config(key_list, value_list, dir_name, run_name) runner.run_train_cv() # 学習 runner.run_predict_cv() # 推論 Submission.create_submission(run_name, dir_name, setting.get('target')) # submit作成 print('Done!') except Exception as e: print(traceback.format_exc()) print(f'ERROR:{e}')
save_model_config(key_list, value_list, dir_name, run_name) # 学習 if cv_setting.get('method') == 'None': runner.run_train_all() # 全データで学習 runner.run_predict_all() # 予測 else: runner.run_train_cv() # 学習 ModelLGB.calc_feature_importance( dir_name, run_name, use_feature_name) # feature_importanceを計算 ModelLGB.plot_learning_curve(run_name) # learning curveを描画 runner.run_predict_cv() # 予測 # submissionファイルの作成 lgb_preds = Util.load_df_pickle(dir_name + f'{run_name}-pred.pkl') Submission.create_submission(run_name, dir_name, lgb_preds) # submit作成 # ##### ニューラルネットワーク ########################################################### # features = features # # CV設定の読み込み # cv_setting = get_cv_info(random_state=53) # # run nameの設定 # run_name = get_run_name(cv_setting, model_type="nn") # dir_name = MODEL_DIR_NAME + run_name + '/' # my_makedirs(dir_name) # runディレクトリの作成。ここにlogなどが吐かれる # # ファイルの設定を読み込む
def main(model_type='lgb') -> str:
    """Train the selected model with CV, predict, and write a submission.

    The model family is chosen by ``model_type``. Parameters for ``'lgb'``
    and ``'cb'`` are read from the module-level ``yml`` config. ``'xgb'`` and
    ``'nn'`` are recognised names but have no implementation in this function
    yet, so they are rejected up front instead of failing later on an unbound
    ``model_params``.

    Any exception raised while building the runner, training, predicting or
    writing the submission is caught, printed with its traceback, and not
    re-raised.

    Args:
        model_type (str, optional): Model to train, ``'lgb'`` or ``'cb'``.
            Defaults to ``'lgb'``.

    Returns:
        Nothing is returned explicitly (the call evaluates to ``None``); the
        ``-> str`` annotation is kept so the signature stays unchanged.

    Raises:
        SystemExit: With status 1 when ``model_type`` is unknown or not yet
            implemented. This is checked before any run directory is created.

    Examples:
        $ python hoge.py --model_type="lgb"
        $ python hoge.py lgb
    """
    # Resolve the model first: an unusable model_type must not leave an empty
    # run directory behind, and must not be reported as a successful exit.
    if model_type == 'lgb':
        model_params = yml['MODEL_LGB']['PARAM']
        model_cls = atm5_ModelLGB
    elif model_type == 'cb':
        model_params = yml['MODEL_CB']['PARAM']
        model_cls = ModelCB
    elif model_type in ('xgb', 'nn'):
        print(f'model_type={model_type}は未実装のため終了します')
        sys.exit(1)
    else:
        print('model_typeが不正なため終了します')
        sys.exit(1)

    cv = get_cv_info()  # CV configuration dict
    run_name = get_run_name(cv, model_type)  # name of this run
    dir_name = MODEL_DIR_NAME + run_name + '/'  # directory used by this run
    setting = get_setting_info()  # miscellaneous settings dict

    # Check that this run name has not been used yet, then create its directory.
    exist_check(MODEL_DIR_NAME, run_name)
    my_makedirs(dir_name)

    features = [
        'exc_wl',
        'distance', 'distance_x', 'distance_y', 'pos_x',
        'params0', 'params1', 'params2', 'params3', 'params4', 'params5', 'params6',
        'params0_multi_rms', 'params0_divid_rms', 'params0_multi_beta', 'params0_divid_beta',
        'params1_multi_rms', 'params1_divid_rms', 'params1_multi_beta', 'params1_divid_beta',
        'params2_multi_rms', 'params2_divid_rms', 'params2_multi_beta', 'params2_divid_beta',
        'params3_multi_rms', 'params3_divid_rms', 'params3_multi_beta', 'params3_divid_beta',
        'params4_multi_rms', 'params4_divid_rms', 'params4_multi_beta', 'params4_divid_beta',
        'params5_multi_rms', 'params5_divid_rms', 'params5_multi_beta', 'params5_divid_beta',
        'params6_multi_rms', 'params6_divid_rms', 'params6_multi_beta', 'params6_divid_beta',
        'params0_multi_params1', 'params0_divid_params1',
        'params0_multi_params2', 'params0_divid_params2',
        'params0_multi_params3', 'params0_divid_params3',
        'params0_multi_params4', 'params0_divid_params4',
        'params0_multi_params5', 'params0_divid_params5',
        'params0_multi_params6', 'params0_divid_params6',
        'params1_multi_params0', 'params1_divid_params0',
        'params1_multi_params2', 'params1_divid_params2',
        'params1_multi_params3', 'params1_divid_params3',
        'params1_multi_params4', 'params1_divid_params4',
        'params1_multi_params5', 'params1_divid_params5',
        'params1_multi_params6', 'params1_divid_params6',
        'params2_multi_params0', 'params2_divid_params0',
        'params2_multi_params1', 'params2_divid_params1',
        'params2_multi_params3', 'params2_divid_params3',
        'params2_multi_params4', 'params2_divid_params4',
        'params2_multi_params5', 'params2_divid_params5',
        'params2_multi_params6', 'params2_divid_params6',
        'params3_multi_params0', 'params3_divid_params0',
        'params3_multi_params1', 'params3_divid_params1',
        'params3_multi_params2', 'params3_divid_params2',
        'params3_multi_params4', 'params3_divid_params4',
        'params3_multi_params5', 'params3_divid_params5',
        'params3_multi_params6', 'params3_divid_params6',
        'params4_multi_params0', 'params4_divid_params0',
        'params4_multi_params1', 'params4_divid_params1',
        'params4_multi_params2', 'params4_divid_params2',
        'params4_multi_params3', 'params4_divid_params3',
        'params4_multi_params5', 'params4_divid_params5',
        'params4_multi_params6', 'params4_divid_params6',
        'params5_multi_params0', 'params5_divid_params0',
        'params5_multi_params1', 'params5_divid_params1',
        'params5_multi_params2', 'params5_divid_params2',
        'params5_multi_params3', 'params5_divid_params3',
        'params5_multi_params4', 'params5_divid_params4',
        'params5_multi_params6', 'params5_divid_params6',
        'params6_multi_params0', 'params6_divid_params0',
        'params6_multi_params1', 'params6_divid_params1',
        'params6_multi_params2', 'params6_divid_params2',
        'params6_multi_params3', 'params6_divid_params3',
        'params6_multi_params4', 'params6_divid_params4',
        'params6_multi_params5', 'params6_divid_params5',
        'rms', 'beta',
        # --- TODO: spectral wavelength data
        'intensity_max', 'intensity_min', 'intensity_mean', 'intensity_std',
        'intensity_sum', 'intensity_median', 'intensity_amplitude_v',
        'intensity_q10', 'intensity_q25', 'intensity_q50', 'intensity_q75',
        'intensity_q80', 'intensity_q85', 'intensity_q90',
        'intensity_max_minus_q90', 'intensity_max_minus_q85', 'intensity_max_minus_q80',
        'intensity_max_minus_q75', 'intensity_max_minus_q50',
        'intensity_max_multi_q90', 'intensity_max_multi_q85', 'intensity_max_multi_q80',
        'intensity_max_multi_q75', 'intensity_max_multi_q50',
        'intensity_max_divid_q90', 'intensity_max_divid_q85', 'intensity_max_divid_q80',
        'intensity_max_divid_q75', 'intensity_max_divid_q50',
        'intensity_q85_divid_q90', 'intensity_q80_divid_q90',
        'intensity_q75_divid_q90', 'intensity_q50_divid_q90',
        # --- TODO: compressed waveform data
        'dc_umap1', 'dc_umap2', 'dc_tsne1', 'dc_tsne2',
        'diff_exc_wl_mean_params0', 'diff_exc_wl_mean_params1', 'diff_exc_wl_mean_params2',
        'diff_exc_wl_mean_params3', 'diff_exc_wl_mean_params4', 'diff_exc_wl_mean_params5',
        'diff_exc_wl_mean_params6',
        'diff_exc_wl_mean_rms', 'diff_exc_wl_mean_beta',
        'diff_exc_wl_mean_intensity_amplitude_v', 'diff_exc_wl_mean_fwhm_09',
    ]

    try:
        # Build the runner for this run.
        runner = atm5_Runner(run_name, model_cls, features, setting, model_params, cv)
        use_feature_name = runner.get_feature_name()  # feature names used in this training

        # Save the model configuration as JSON.
        value_list = [use_feature_name, model_params, cv, setting]
        save_model_config(key_list, value_list, dir_name, run_name)

        # Train and predict with cross-validation.
        runner.run_train_cv()
        runner.run_predict_cv()

        # Create the submission file.
        Submission.create_submission(run_name, dir_name, setting.get('target'))

        if model_type == 'lgb':
            # Compute gain-based feature importance over the CV folds.
            atm5_ModelLGB.calc_feature_importance(
                dir_name, run_name, use_feature_name, cv.get('n_splits'), type='gain')

    except Exception as e:
        # Top-level boundary: report the failure instead of propagating it.
        print(traceback.format_exc())
        print(f'ERROR:{e}')
# Neural-network hyper-parameters; kept small so the sample finishes quickly.
params_nn = dict(
    layers=3,
    nb_epoch=5,  # 1000
    patience=10,
    dropout=0.5,
    units=512,
)

# Feature columns to use: feat_1 ... feat_93.
features = ["feat_{}".format(idx) for idx in range(1, 94)]

# Train with CV, predict, and write a submission: first xgboost, then the
# neural network.
for run_label, model_class, hyper_params in (
    ("xgb1", ModelXGB, params_xgb),
    ("nn1", ModelNN, params_nn),
):
    runner = Runner(run_label, model_class, features, hyper_params)
    runner.run_train_cv()
    runner.run_predict_cv()
    Submission.create_submission(run_label)

"""
# (参考)xgboostによる学習・予測 - 学習データ全体を使う場合
runner = Runner('xgb1-train-all', ModelXGB, features, params_xgb_all)
runner.run_train_all()
runner.run_test_all()
Submission.create_submission('xgb1-train-all')
"""
def _sanitize_column_names(columns):
    """Return column names with every non-alphanumeric character replaced by '_'."""
    return ["".join(c if c.isalnum() else "_" for c in str(x)) for x in columns]


def _quantize_by_train_distribution(blend_pred, train_labels):
    """Map continuous predictions to classes 0-3 so the class shares follow the train labels."""
    dist = Counter(train_labels)
    for k in dist:
        dist[k] /= len(train_labels)

    # Cumulative share of classes 0..2 gives the percentile cut points.
    acum = 0
    bound = {}
    for i in range(3):
        acum += dist[i]
        bound[i] = np.percentile(blend_pred, acum * 100)

    def classify(x):
        if x <= bound[0]:
            return 0
        elif x <= bound[1]:
            return 1
        elif x <= bound[2]:
            return 2
        else:
            return 3

    return np.array(list(map(classify, blend_pred)))


def _run_model(tag, model_cls, model_params, cv, model_type, is_kernel,
               kernel_data=None, submission=None,
               importance_cls=None, fit_one_hot=False):
    """Train one model family, predict, and emit its submission.

    Returns the per-model submission frame when running in a kernel as part
    of a blend (``model_type`` differs from ``tag``); otherwise ``None``.
    """
    if is_kernel and (kernel_data is None or submission is None):
        # Kernel runs feed in-memory frames to the runner; those only exist
        # when the features were built in this call.
        raise ValueError('is_kernel=True requires create_features=True')

    # Run name and run directory (logs etc. are written there).
    # `suffix` is not defined in this block; presumably a module-level
    # run-name suffix -- TODO confirm.
    run_name = tag + suffix
    dir_name = MODEL_DIR_NAME + run_name + '/'
    exist_check(MODEL_DIR_NAME, run_name)
    my_makedirs(dir_name)

    # Miscellaneous settings.
    setting = {
        'run_name': run_name,  # run name
        'feature_directory': FEATURE_DIR_NAME,  # directory the features are read from
        'target': 'accuracy_group',  # target variable
        'calc_shap': False,  # whether to compute SHAP values
        'save_train_pred': False,  # whether to save predictions on the train data
    }
    target = setting.get('target')

    if is_kernel:
        X_train, y_train, X_test = kernel_data
        runner = Runner(run_name, model_cls, setting, model_params, cv,
                        FEATURE_DIR_NAME, MODEL_DIR_NAME, X_train, y_train, X_test)
    else:
        runner = Runner(run_name, model_cls, setting, model_params, cv,
                        FEATURE_DIR_NAME, MODEL_DIR_NAME)
    use_feature_name = runner.get_feature_name()  # feature names used in this training

    # Save the model configuration as JSON.
    value_list = [use_feature_name, model_params, cv, setting]
    save_model_config(key_list, value_list, dir_name, run_name)

    if fit_one_hot and len(runner.categoricals) > 0:
        # Fit and persist a one-hot encoder for the categorical columns.
        one_hot_encoder = ce.OneHotEncoder(cols=runner.categoricals, drop_invariant=True)
        one_hot_encoder.fit(runner.train_x[runner.categoricals])
        ohe_path = os.path.join('.', 'one-hot-enc.pkl')
        Util.dump(one_hot_encoder, ohe_path)

    if cv.get('method') == 'None':
        # TODO: make this path work as well; it yields no `_pred`, so the
        # kernel branch below cannot be used with it.
        runner.run_train_all()  # train on all data
        runner.run_predict_all()  # predict
    else:
        runner.run_train_cv()  # train
        if importance_cls is not None:
            importance_cls.calc_feature_importance(dir_name, run_name, use_feature_name)
        _pred = runner.run_predict_cv(is_kernel)  # predict

    if not is_kernel:
        # Local run: build the submission from the run directory.
        Submission.create_submission(run_name, dir_name, target)
        return None

    if model_type == tag:
        # Single-model kernel run: write the final csv directly.
        submission[target] = _pred.astype(int)
        submission.to_csv('submission.csv', index=False)
        return None

    # Kernel run as part of a blend: write and return a per-model csv frame.
    model_submission = submission.copy()
    model_submission[target] = _pred.astype(int)
    model_submission.to_csv('submission_' + tag + '.csv', index=False)
    return model_submission


def main(mode='prd', create_features=True, model_type='lgb', is_kernel=False) -> str:
    """Build features, train the selected model(s), and write submissions.

    Args:
        mode: Passed to ``read_data_all`` when features are created.
        create_features: When true, raw data is read, features are built and
            pickled to ``FEATURE_DIR_NAME``.
        model_type: ``'lgb'``, ``'cb'``, ``'nn'``, ``'xgb'`` or ``'all'``.
            Any other value trains nothing.
        is_kernel: When true, the in-memory frames are passed to the runner
            and csv files are written to the working directory; with
            ``model_type='all'`` the per-model results are blended as well.

    Returns:
        The string ``'Success!'``.

    Raises:
        ValueError: If a model is trained with ``is_kernel=True`` while
            ``create_features`` is false, because the in-memory frames and the
            sample submission do not exist.
    """
    confirm('mode:{}, create_feature:{} '.format(str(mode), str(create_features)))

    kernel_data = None
    submission = None
    reduce_train = None

    if create_features:
        # Data generation.
        train, test, specs, train_labels, submission = read_data_all(mode)

        features_train, features_test, win_code, list_of_user_activities, list_of_event_code, \
            activities_labels, assess_titles, list_of_event_id, all_title_event_code, \
            all_type_world = encode_title(train, test)
        del train, test
        gc.collect()

        features_train = features_train.merge(specs, how='left', on='event_id', suffixes=('', '_y'))
        features_test = features_test.merge(specs, how='left', on='event_id', suffixes=('', '_y'))

        features_train, features_test = get_train_and_test(
            features_train, features_test, win_code, list_of_user_activities, list_of_event_code,
            activities_labels, assess_titles, list_of_event_id, all_title_event_code,
            all_type_world, is_kernel)
        reduce_train, reduce_test, _ = preprocess(features_train, features_test, assess_titles)

        # User-profile generation and merge: disabled for now because it
        # lowered the score.
        # train, train_session_master = add_session_order_to_train(train, train_labels)
        # user_profiles_train = create_user_profile_train(train)
        # user_profiles_test = create_user_profile_test(test)
        # train_session_master = train_session_master.merge(user_profiles_train, how='left', on=['installation_id', 'session_order'])
        # reduce_train = reduce_train.merge(train_session_master, how='left', on=['installation_id', 'game_session'])
        # reduce_test = reduce_test.merge(user_profiles_test, how='left', on='installation_id')

        del features_train, features_test, _
        gc.collect()

        # Drop columns that must not be used as features.
        # Use this list instead when not doing GroupKFold on installation_id:
        # drop_candidates = ['game_session', 'installation_id', 'timestamp', 'session_order', 'accuracy_group', 'timestampDate'] + ['acc_' + title for title in assess_titles]
        drop_candidates = ['game_session', 'timestamp', 'session_order', 'accuracy_group', 'timestampDate'] \
            + ['acc_' + title for title in assess_titles]

        train_drop = [col for col in drop_candidates if col in reduce_train.columns]
        X_train = reduce_train.drop(train_drop, axis=1)
        # Column names contain commas etc., which makes lightgbm raise an error.
        X_train.columns = _sanitize_column_names(X_train.columns)
        y_train = reduce_train['accuracy_group']

        # Filter the full candidate list again for test, so a drop column that
        # exists only in test is still removed.
        test_drop = [col for col in drop_candidates if col in reduce_test.columns]
        X_test = reduce_test.drop(test_drop, axis=1)
        X_test.columns = _sanitize_column_names(X_test.columns)

        # Feature selection: disabled for now because it lowered the score.
        # to_exclude, X_test = select_ajusted(X_train, X_test)
        # X_train = X_train.drop(to_exclude, axis=1)
        # X_test = X_test.drop(to_exclude, axis=1)

        X_train.to_pickle(FEATURE_DIR_NAME + 'X_train.pkl')
        y_train.to_pickle(FEATURE_DIR_NAME + 'y_train.pkl')
        X_test.to_pickle(FEATURE_DIR_NAME + 'X_test.pkl')

        kernel_data = (X_train, y_train, X_test)

    # CV configuration. `method` is one of [KFold, StratifiedKFold, GroupKFold];
    # set it to 'None' to train on all data without CV. For StratifiedKFold or
    # GroupKFold, `cv_target` names the column to split on.
    cv = {
        'method': 'GroupKFold',
        'n_splits': 5,
        'random_state': 42,
        'shuffle': True,
        'cv_target': 'installation_id'
    }

    shared = dict(cv=cv, model_type=model_type, is_kernel=is_kernel,
                  kernel_data=kernel_data, submission=submission)
    blend_parts = {}

    if model_type == 'lgb' or model_type == 'all':
        # Train / predict: LightGBM.
        model_params = {
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metric': 'rmse',
            'learning_rate': 0.01,
            'subsample': 0.75,
            'subsample_freq': 1,
            'feature_fraction': 0.9,
            'max_depth': 15,
            'lambda_l1': 1,
            'lambda_l2': 1,
            'num_round': 50000,
            'early_stopping_rounds': 300,
            'verbose': -1,
            'verbose_eval': 300,
            'random_state': 999
        }
        blend_parts['lgb'] = _run_model('lgb', ModelLGB, model_params,
                                        importance_cls=ModelLGB, **shared)

    if model_type == 'cb' or model_type == 'all':
        # Train / predict: CatBoost.
        model_params = {
            'loss_function': 'RMSE',
            'task_type': "CPU",
            'iterations': 50000,
            'od_type': "Iter",
            'depth': 10,
            'colsample_bylevel': 0.5,
            'early_stopping_rounds': 300,
            'l2_leaf_reg': 18,
            'random_seed': 42,
            'verbose_eval': 300,
            'use_best_model': True
        }
        blend_parts['cb'] = _run_model('cb', ModelCB, model_params, **shared)

    if model_type == 'nn' or model_type == 'all':
        # Train / predict: NN (MLP).
        model_params = {
            'layers': 4,
            'nb_epoch': 500,
            'patience': 20,
            'dropout': 0.3,
            'units': 512,
            'classes': 1
        }
        blend_parts['nn'] = _run_model('nn', ModelNN, model_params,
                                       fit_one_hot=True, **shared)

    if model_type == 'xgb' or model_type == 'all':
        # Train / predict: xgboost.
        model_params = {
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse',
            'colsample_bytree': 0.8,
            'learning_rate': 0.01,
            'max_depth': 10,
            'subsample': 1,
            'min_child_weight': 3,
            'gamma': 0.25,
            'num_round': 50000,
            'early_stopping_rounds': 300,
            'verbose': 300,
            'random_state': 999
        }
        blend_parts['xgb'] = _run_model('xgb', ModelXGB, model_params,
                                        importance_cls=ModelXGB, **shared)

    # Blend the per-model predictions (kernel runs with every model only).
    if model_type == 'all' and is_kernel:
        target = 'accuracy_group'
        weights = {'lgb': 0.30, 'cb': 0.40, 'nn': 0.00, 'xgb': 0.30}
        # NOTE(review): the per-model columns were already cast to int before
        # blending -- confirm that this is intended.
        blend_pred = (blend_parts['lgb'][target] * weights['lgb']) \
            + (blend_parts['cb'][target] * weights['cb']) \
            + (blend_parts['nn'][target] * weights['nn']) \
            + (blend_parts['xgb'][target] * weights['xgb'])

        blend_pred = _quantize_by_train_distribution(blend_pred, reduce_train[target])

        submission[target] = blend_pred.astype(int)
        submission.to_csv('submission.csv', index=False)

    return 'Success!'