Example #1
def get_input_data(add_new_label=True):
    # Load training/testing ids and depths
    train_df = pd.read_csv("../input/train.csv", index_col="id", usecols=[0])
    depths_df = pd.read_csv("../input/depths.csv", index_col="id")
    train_df = train_df.join(depths_df)
    test_df = depths_df[~depths_df.index.isin(train_df.index)]
    print("train shape: {}, test shape: {}".format(train_df.shape,
                                                   test_df.shape))

    # Read images and masks
    print("loading images....")
    train_df["images"] = [
        np.array(
            load_img("../input/train/images/{}.png".format(idx),
                     color_mode="grayscale")) / 255
        for idx in tqdm(train_df.index)
    ]

    print("loading masks....")
    train_df["masks"] = [
        np.array(
            load_img("../input/train/masks/{}.png".format(idx),
                     color_mode="grayscale")) / 255
        for idx in tqdm(train_df.index)
    ]

    # Calculating the salt coverage and salt coverage classes
    train_df["coverage"] = train_df.masks.map(np.sum) / pow(IMG_SIZE_ORI, 2)
    train_df["coverage_class"] = train_df.coverage.map(cov_to_class)

    # add new labels
    if add_new_label:
        train_df, test_df = get_binary_labels(train_df, test_df, NUM_FOLDS)

    # save train_df and test_df
    save2pkl('../output/train_df.pkl', train_df)
    save2pkl('../output/test_df.pkl', test_df)

    # send a LINE notification on completion
    line_notify(
        'Preprocessing.py finished. train shape: {}, test shape: {}'.format(
            train_df.shape, test_df.shape))

    return train_df, test_df
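The function above relies on project helpers that are not shown in this example. A minimal sketch of plausible implementations, assuming the common TGS Salt convention of 11 coverage classes and plain pickle serialization (the definitions in the actual repository may differ):

import pickle

import numpy as np


def cov_to_class(val):
    # Assumed binning: map salt coverage in [0, 1] to one of 11 classes (0-10).
    return int(np.ceil(val * 10))


def save2pkl(path, obj):
    # Assumed helper: serialize an object with plain pickle.
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


def loadpkl(path):
    # Assumed inverse helper, used by the later examples.
    with open(path, 'rb') as f:
        return pickle.load(f)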
Example #2
def main():
    # reg for bayesian optimization
    reg_bo = BayesianOptimization(
        xgb_eval, {
            'gamma': (0, 1),
            'max_depth': (6, 6),
            'min_child_weight': (0, 45),
            'subsample': (0.001, 1),
            'colsample_bytree': (0.001, 1),
            'colsample_bylevel': (0.001, 1),
            'alpha': (9, 20),
            '_lambda': (0, 10)
        })

    reg_bo.maximize(init_points=15, n_iter=25)

    res = pd.DataFrame(reg_bo.res['max']['max_params'], index=['max_params'])

    res.to_csv('../output/max_params_xgb_session.csv')

    line_notify('xgb session finished.')
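BayesianOptimization treats xgb_eval as a black-box function of the bounded parameters above and maximizes its return value. A minimal sketch of what such an evaluation function could look like, assuming a prebuilt training DMatrix (TRAIN_DMATRIX is hypothetical) and xgb.cv with RMSE, negated so that higher is better:

import xgboost as xgb


def xgb_eval(gamma, max_depth, min_child_weight, subsample,
             colsample_bytree, colsample_bylevel, alpha, _lambda):
    # Hypothetical objective for bayes_opt: cross-validate XGBoost with the
    # sampled hyperparameters and return the negated validation RMSE.
    # TRAIN_DMATRIX is an assumed, prebuilt xgb.DMatrix of the training data.
    params = {
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'eta': 0.1,
        'gamma': gamma,
        'max_depth': int(max_depth),  # bayes_opt samples floats
        'min_child_weight': min_child_weight,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'colsample_bylevel': colsample_bylevel,
        'alpha': alpha,
        'lambda': _lambda,  # '_lambda' dodges the Python keyword
    }
    cv_result = xgb.cv(params, TRAIN_DMATRIX, num_boost_round=1000,
                       nfold=5, early_stopping_rounds=50)
    return -cv_result['test-rmse-mean'].iloc[-1]

Note that the (6, 6) bound effectively pins max_depth at 6, and reg_bo.res['max']['max_params'] matches the pre-1.0 bayes_opt API; newer versions expose the same information as reg_bo.max['params'].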
def main():

    # clf for bayesian optimization
    clf_bo = BayesianOptimization(
        lgbm_eval, {
            'num_leaves': (16, 64),
            'colsample_bytree': (0.001, 1),
            'subsample': (0.001, 1),
            'max_depth': (3, 8),
            'reg_alpha': (0, 10),
            'reg_lambda': (0, 10),
            'min_split_gain': (0, 1),
            'min_child_weight': (0, 45),
            'min_data_in_leaf': (0, 500),
        })

    clf_bo.maximize(init_points=15, n_iter=25)

    res = pd.DataFrame(clf_bo.res['max']['max_params'], index=['max_params'])

    res.to_csv('../output/max_params_lgbm_user.csv')

    line_notify('lgbm user finished.')
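The tuned values are written out as a one-row CSV. A hypothetical follow-up for consuming them; since bayes_opt returns floats, the integer-valued parameters need casting before being handed to lgb.train:

# Hypothetical: reload the tuned parameters and cast the integer-valued ones.
max_params = pd.read_csv('../output/max_params_lgbm_user.csv',
                         index_col=0).iloc[0].to_dict()
for key in ['num_leaves', 'max_depth', 'min_data_in_leaf']:
    max_params[key] = int(max_params[key])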
def kfold_xgboost(df, num_folds, stratified=False, debug=False, use_pkl=False):

    # Divide in training/validation and test data
    train_df = df[~df['IS_TEST']]
    test_df = df[df['IS_TEST']]

    print("Starting XGBoost. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))
    del df
    gc.collect()

    ############################################################################
    # Session Level predictions
    ############################################################################

    print('Starting Session Level predictions...')

    # Cross validation model
    folds_session = get_folds(df=train_df, n_splits=num_folds)

    # Create arrays and dataframes to store results
    oof_preds_session = np.zeros(train_df.shape[0])
    sub_preds_session = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [
        f for f in train_df.columns
        if f not in EXCLUDED_FEATURES + ['totals.transactionRevenue']
    ]

    # build a DMatrix for the test set up front for the final predictions
    # (no label is needed at prediction time)
    test_df_dmtrx = xgb.DMatrix(test_df[feats])

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds_session):
        train_x, train_y = train_df[feats].iloc[train_idx], np.log1p(
            train_df['totals.transactionRevenue'].iloc[train_idx])
        valid_x, valid_y = train_df[feats].iloc[valid_idx], np.log1p(
            train_df['totals.transactionRevenue'].iloc[valid_idx])

        # set data structure
        xgb_train = xgb.DMatrix(train_x, label=train_y)
        xgb_test = xgb.DMatrix(valid_x, label=valid_y)

        # params
        params = {
            'objective': 'gpu:reg:linear',  # GPU parameter
            'booster': 'gbtree',
            'eval_metric': 'rmse',
            'silent': 1,
            'eta': 0.01,
            'max_depth': 6,
            'min_child_weight': 19,
            'gamma': 0.479411416192221,
            'subsample': 0.976329169063721,
            'colsample_bytree': 0.921410871323335,
            'colsample_bylevel': 0.603858358771505,
            'alpha': 9.86942860885701,
            'lambda': 9.63581598065735,
            'tree_method': 'gpu_hist',  # GPU parameter
            'predictor': 'gpu_predictor',  # GPU parameter
            'seed': int(2**n_fold)
        }

        reg = xgb.train(params,
                        xgb_train,
                        num_boost_round=10000,
                        evals=[(xgb_train, 'train'), (xgb_test, 'test')],
                        early_stopping_rounds=200,
                        verbose_eval=100)

        # save model
        reg.save_model('../output/models/xgb_session_' + str(n_fold) + '.txt')

        oof_preds_session[valid_idx] = np.expm1(reg.predict(xgb_test))
        sub_preds_session += np.expm1(reg.predict(test_df_dmtrx)) / num_folds

        fold_importance_df = pd.DataFrame.from_dict(
            reg.get_score(importance_type='gain'),
            orient='index',
            columns=['importance'])
        fold_importance_df["feature"] = fold_importance_df.index.tolist()
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d RMSE : %.6f' %
              (n_fold + 1, rmse(valid_y, np.log1p(
                  oof_preds_session[valid_idx]))))
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()

    del test_df_dmtrx
    gc.collect()

    # print the full RMSE score and send a LINE notification
    full_rmse_session = rmse(np.log1p(train_df['totals.transactionRevenue']),
                             np.log1p(oof_preds_session))
    line_notify('XGBoost Session Level Full RMSE score %.6f' %
                full_rmse_session)

    # session level feature importance
    display_importances(feature_importance_df,
                        '../output/xgb_importances_session.png',
                        '../output/feature_importance_xgb_session.csv')

    # save the predictions
    train_df.loc[:, 'predictions'] = oof_preds_session
    test_df.loc[:, 'predictions'] = sub_preds_session

    del oof_preds_session, sub_preds_session
    gc.collect()

    # save as CSV
    train_df['predictions'].to_csv("../output/oof_xgb_session.csv")
    test_df['predictions'].to_csv("../output/sub_xgb_session.csv")

    ############################################################################
    # User Level predictions
    ############################################################################

    print('Starting User Level predictions...')

    if use_pkl:

        del train_df, test_df
        gc.collect()

        # load pkl
        train_df_agg = read_pickles('../output/train_df_agg_xgb')
        test_df_agg = read_pickles('../output/test_df_agg_xgb')
    else:
        # Aggregate data at User level
        aggregations = {'totals.transactionRevenue': ['sum']}
        for col in feats + ['predictions']:
            aggregations[col] = ['sum', 'max', 'min', 'mean']

        train_df_agg = train_df[
            feats +
            ['fullVisitorId', 'totals.transactionRevenue', 'predictions'
             ]].groupby('fullVisitorId').agg(aggregations)
        del train_df
        gc.collect()

        test_df_agg = test_df[
            feats +
            ['fullVisitorId', 'totals.transactionRevenue', 'predictions'
             ]].groupby('fullVisitorId').agg(aggregations)
        del test_df
        gc.collect()

        # reshape header
        train_df_agg.columns = pd.Index(
            [e[0] + "_" + e[1].upper() for e in train_df_agg.columns.tolist()])
        test_df_agg.columns = pd.Index(
            [e[0] + "_" + e[1].upper() for e in test_df_agg.columns.tolist()])

        # to float32
        train_df_agg = train_df_agg.astype('float32')
        test_df_agg = test_df_agg.astype('float32')

        # save pkl
        to_pickles(train_df_agg,
                   '../output/train_df_agg_xgb',
                   split_size=50,
                   inplace=False)
        to_pickles(test_df_agg,
                   '../output/test_df_agg_xgb',
                   split_size=5,
                   inplace=False)

    # Cross validation model
    folds_agg = get_folds(df=train_df_agg[['totals.pageviews_MEAN'
                                           ]].reset_index(),
                          n_splits=num_folds)

    # Create arrays and dataframes to store results
    oof_preds_agg = np.zeros(train_df_agg.shape[0])
    sub_preds_agg = np.zeros(test_df_agg.shape[0])
    feature_importance_df_agg = pd.DataFrame()
    feats_agg = [
        f for f in train_df_agg.columns
        if f not in EXCLUDED_FEATURES + ['totals.transactionRevenue_SUM']
    ]

    # keep the index for building the submission file later
    test_df_agg_index = test_df_agg.index

    # build a DMatrix for the test set up front for the final predictions
    test_df_agg_dmtrx = xgb.DMatrix(
        test_df_agg[feats_agg],
        label=test_df_agg['totals.transactionRevenue_SUM'])

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds_agg):
        train_x, train_y = train_df_agg[feats_agg].iloc[train_idx], np.log1p(
            train_df_agg['totals.transactionRevenue_SUM'].iloc[train_idx])
        valid_x, valid_y = train_df_agg[feats_agg].iloc[valid_idx], np.log1p(
            train_df_agg['totals.transactionRevenue_SUM'].iloc[valid_idx])

        # set data structure
        xgb_train = xgb.DMatrix(train_x, label=train_y)
        xgb_test = xgb.DMatrix(valid_x, label=valid_y)

        # grid search is not feasible here, so reuse the params tuned for LightGBM
        params = {
            'objective': 'gpu:reg:linear',  # GPU parameter
            'booster': 'gbtree',
            'eval_metric': 'rmse',
            'silent': 1,
            'eta': 0.01,
            'max_depth': 7,
            'min_child_weight': 0.14207610657307,
            'gamma': 0.46299516643071,
            'subsample': 0.740095188787127,
            'colsample_bytree': 0.698723156053225,
            'colsample_bylevel': 0.306359150497576,
            'alpha': 14.3019796761524,
            'lambda': 9.48248448679231,
            'tree_method': 'gpu_hist',  # GPU parameter
            'predictor': 'gpu_predictor',  # GPU parameter
            'seed': int(2**n_fold)
        }

        reg = xgb.train(params,
                        xgb_train,
                        num_boost_round=10000,
                        evals=[(xgb_train, 'train'), (xgb_test, 'test')],
                        early_stopping_rounds=200,
                        verbose_eval=100)

        # save model
        reg.save_model('../output/models/xgb_user_' + str(n_fold) + '.txt')

        oof_preds_agg[valid_idx] = np.expm1(reg.predict(xgb_test))
        sub_preds_agg += np.expm1(reg.predict(test_df_agg_dmtrx)) / num_folds

        fold_importance_df = pd.DataFrame.from_dict(
            reg.get_score(importance_type='gain'),
            orient='index',
            columns=['importance'])
        fold_importance_df["feature"] = fold_importance_df.index.tolist()
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df_agg = pd.concat(
            [feature_importance_df_agg, fold_importance_df], axis=0)

        print('Fold %2d RMSE : %.6f' %
              (n_fold + 1, rmse(valid_y, np.log1p(oof_preds_agg[valid_idx]))))
        del reg, train_x, train_y, valid_x, valid_y, fold_importance_df
        gc.collect()

    # print the full RMSE score and send a LINE notification
    full_rmse_agg = rmse(
        np.log1p(train_df_agg['totals.transactionRevenue_SUM']),
        np.log1p(oof_preds_agg))
    line_notify('Visitor Level Full RMSE score %.6f' % full_rmse_agg)

    # user level feature importance
    display_importances(feature_importance_df_agg,
                        '../output/xgb_importances_agg.png',
                        '../output/feature_importance_xgb_agg.csv')

    if not debug:
        # save the predictions for the submission data
        submission = pd.DataFrame()
        submission['PredictedLogRevenue'] = sub_preds_agg
        submission.index = test_df_agg_index
        submission['PredictedLogRevenue'] = np.log1p(
            submission['PredictedLogRevenue'])
        submission['PredictedLogRevenue'] = submission[
            'PredictedLogRevenue'].apply(lambda x: 0.0 if x < 0 else x)
        submission['PredictedLogRevenue'] = submission[
            'PredictedLogRevenue'].fillna(0)
        submission.to_csv(submission_file_name, index=True)

        # save the out-of-fold predictions
        train_df_agg['OOF_PRED'] = oof_preds_agg
        train_df_agg[['OOF_PRED',
                      'totals.transactionRevenue_SUM']].to_csv(oof_file_name,
                                                               index=True)

        # submit via the API
        submit(submission_file_name, comment='cv: %.6f' % full_rmse_agg)
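kfold_xgboost assumes several shared helpers. Minimal sketches, under the assumptions that get_folds builds visitor-grouped folds (so no fullVisitorId straddles train and validation), rmse is the usual root mean squared error, and display_importances persists and plots the fold importances; the repository versions may differ:

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GroupKFold


def get_folds(df, n_splits):
    # Assumed helper: fold index pairs grouped by visitor.
    unique_vis = np.array(sorted(df['fullVisitorId'].unique()))
    fold_ids = []
    for trn_vis, val_vis in GroupKFold(n_splits).split(unique_vis,
                                                       groups=unique_vis):
        fold_ids.append([
            np.where(df['fullVisitorId'].isin(unique_vis[trn_vis]))[0],
            np.where(df['fullVisitorId'].isin(unique_vis[val_vis]))[0],
        ])
    return fold_ids


def rmse(y_true, y_pred):
    # Assumed helper: root mean squared error.
    return np.sqrt(mean_squared_error(y_true, y_pred))


def display_importances(importance_df, png_path, csv_path):
    # Assumed helper: save fold-level importances and plot the mean
    # importance of the top features.
    importance_df.to_csv(csv_path)
    mean_imp = (importance_df.groupby('feature')['importance']
                .mean().sort_values(ascending=False).head(40))
    plt.figure(figsize=(8, 10))
    sns.barplot(x=mean_imp.values, y=mean_imp.index)
    plt.tight_layout()
    plt.savefig(png_path)
    plt.close()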
Example #5
def main():
    # load datasets
    print('Loading Datasets...')

    test_df = loadpkl('../output/test_df.pkl')
    train_df = loadpkl('../output/train_df.pkl')
    oof_preds = loadpkl('../output/oof_preds.pkl')
    oof_preds = np.array([downsample(img) for img in oof_preds])

    # exclude samples where is_salt is 0
    #    train_df = train_df[train_df['is_salt']==1]

    x_test = np.array([(upsample(
        np.array(
            load_img("../input/test/images/{}.png".format(idx),
                     color_mode="grayscale")))) / 255
                       for idx in tqdm(test_df.index)
                       ]).reshape(-1, IMG_SIZE_TARGET, IMG_SIZE_TARGET, 1)
    y_train = np.array(train_df.masks.tolist()).reshape(
        -1, IMG_SIZE_ORI, IMG_SIZE_ORI, 1)

    # array to store the results
    sub_preds = np.zeros((x_test.shape[0], IMG_SIZE_ORI, IMG_SIZE_ORI))

    # convert to (128, 128, 3) by repeating the grayscale channel
    x_test = np.repeat(x_test, 3, axis=3)

    print('Generating submission file...')

    # load each fold's model and compute the predictions for the submission
    for n_fold in range(NUM_FOLDS):

        # load model
        model = load_model(
            '../output/UnetResNet34_pretrained_bce_dice_' + str(n_fold) +
            '.model',
            custom_objects={
                'my_iou_metric': my_iou_metric,
                'bce_dice_loss': bce_dice_loss
                # 'keras_lovasz_softmax': keras_lovasz_softmax
            })

        # test-time augmentation (currently disabled)
        # tta_model = tta_segmentation(model, h_flip=True, v_flip=True,
        #                              rotation_angles=(90, 180, 270),
        #                              h_shifts=(-5, 5), merge='mean')

        # save the predictions for the test data
        sub_preds_single = np.array([
            downsample(x) for x in tqdm(
                predict_result(model, x_test, IMG_SIZE_TARGET).reshape(
                    -1, IMG_SIZE_TARGET, IMG_SIZE_TARGET))
        ])
        sub_preds += sub_preds_single / NUM_FOLDS

        # save the single-model submission file (threshold=0.5)
        pred_dict_single = {
            idx: RLenc(np.round(sub_preds_single[i] > 0.5))
            for i, idx in enumerate(tqdm(test_df.index.values))
        }
        sub_single = pd.DataFrame.from_dict(pred_dict_single, orient='index')
        sub_single.index.names = ['id']
        sub_single.columns = ['rle_mask']
        # sub_single.loc[~test_df['is_salt'], 'rle_mask'] = np.nan  # blank out rows where is_salt is 0
        sub_single.to_csv('../output/submission_single_bce_dice_' +
                          str(n_fold) + '.csv')

        print('fold {} finished'.format(n_fold + 1))

        del model, sub_preds_single, pred_dict_single, sub_single  #, tta_model
        gc.collect()

    # compute the best threshold and IoU
    threshold_best, iou_best = getBestThreshold(y_train, oof_preds,
                                                '../output/threshold.png')

    t1 = time.time()
    pred_dict = {
        idx: RLenc((np.round(sub_preds[i] > threshold_best)))
        for i, idx in enumerate(tqdm(test_df.index.values))
    }
    t2 = time.time()

    print("Usedtime = " + str(t2 - t1) + " s")

    sub = pd.DataFrame.from_dict(pred_dict, orient='index')
    sub.index.names = ['id']
    sub.columns = ['rle_mask']

    # blank out rows where is_salt is 0
    #    sub.loc[~test_df['is_salt'],'rle_mask'] = np.nan

    sub.to_csv('../output/submission_bce_dice_7fold.csv')

    # send a LINE notification on completion
    line_notify('Predicting.py finished. Best IoU score is %.6f' % iou_best)
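This prediction script also leans on helpers defined elsewhere. Minimal sketches, assuming 101x101 originals resized to the 128x128 model input (IMG_SIZE_ORI and IMG_SIZE_TARGET as used above), horizontal-flip averaging for predict_result, and the standard Kaggle column-major run-length encoding for RLenc:

import numpy as np
from skimage.transform import resize


def upsample(img):
    # Assumed helper: resize an original image up to the model input size.
    return resize(img, (IMG_SIZE_TARGET, IMG_SIZE_TARGET),
                  mode='constant', preserve_range=True)


def downsample(img):
    # Assumed helper: resize a prediction back to the original size.
    return resize(img, (IMG_SIZE_ORI, IMG_SIZE_ORI),
                  mode='constant', preserve_range=True)


def predict_result(model, x_test, img_size_target):
    # Assumed helper: simple test-time augmentation that averages the raw
    # prediction with a horizontally flipped prediction.
    preds = model.predict(x_test).reshape(-1, img_size_target, img_size_target)
    x_flip = np.array([np.fliplr(x) for x in x_test])
    preds_flip = model.predict(x_flip).reshape(-1, img_size_target,
                                               img_size_target)
    preds += np.array([np.fliplr(p) for p in preds_flip])
    return preds / 2


def RLenc(img, order='F'):
    # Assumed helper: run-length encode a binary mask in column-major order,
    # as the TGS Salt submission format requires.
    pixels = np.concatenate([[0], img.flatten(order=order), [0]])
    runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
    runs[1::2] -= runs[::2]
    return ' '.join(str(x) for x in runs)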
Example #6
def kfold_lightgbm(df, num_folds, stratified=False, debug=False, use_pkl=False):

    # Divide in training/validation and test data
    train_df = df[~df['IS_TEST']]
    test_df = df[df['IS_TEST']]

    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()

    ############################################################################
    # Session Level predictions
    ############################################################################

    print('Starting Session Level predictions...')

    # Cross validation model
    folds_session = get_folds(df=train_df, n_splits=num_folds)

    # Create arrays and dataframes to store results
    oof_preds_session = np.zeros(train_df.shape[0])
    sub_preds_session = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in EXCLUDED_FEATURES+['totals.transactionRevenue']]

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds_session):
        train_x, train_y = train_df[feats].iloc[train_idx], np.log1p(train_df['totals.transactionRevenue'].iloc[train_idx])
        valid_x, valid_y = train_df[feats].iloc[valid_idx], np.log1p(train_df['totals.transactionRevenue'].iloc[valid_idx])

        # set data structure
        lgb_train = lgb.Dataset(train_x,
                                label=train_y,
                                free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x,
                               label=valid_y,
                               free_raw_data=False)

        # the parameters here are rough guesses
        params = {
            'device': 'gpu',
            # 'gpu_use_dp': True,
            'task': 'train',
            'boosting': 'gbdt',
            'objective': 'regression',
            'metric': 'rmse',
            'learning_rate': 0.01,
            'num_leaves': 64,
            'colsample_bytree': 0.769143040610826,
            'subsample': 0.295302403483027,
            'max_depth': 8,
            'reg_alpha': 9.37961252311552,
            'reg_lambda': 2.82500347706399,
            'min_split_gain': 0.153268455490808,
            'min_child_weight': 44,
            'min_data_in_leaf': 45,
            'verbose': -1,
            'seed': int(2**n_fold),
            'bagging_seed': int(2**n_fold),
            'drop_seed': int(2**n_fold)
        }

        reg = lgb.train(params,
                        lgb_train,
                        valid_sets=[lgb_train, lgb_test],
                        valid_names=['train', 'test'],
                        num_boost_round=10000,
                        early_stopping_rounds=200,
                        verbose_eval=100)

        # save model
        reg.save_model('../output/models/lgbm_session_'+str(n_fold)+'.txt')

        oof_preds_session[valid_idx] = np.expm1(reg.predict(valid_x, num_iteration=reg.best_iteration))
        sub_preds_session += np.expm1(reg.predict(test_df[feats], num_iteration=reg.best_iteration)) / num_folds

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = np.log1p(reg.feature_importance(importance_type='gain', iteration=reg.best_iteration))
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d RMSE : %.6f' % (n_fold + 1, rmse(valid_y, np.log1p(oof_preds_session[valid_idx]))))
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()

    # print the full RMSE score and send a LINE notification
    full_rmse_session = rmse(np.log1p(train_df['totals.transactionRevenue']), np.log1p(oof_preds_session))
    line_notify('Session Level Full RMSE score %.6f' % full_rmse_session)

    # session level feature importance
    display_importances(feature_importance_df ,
                        '../output/lgbm_importances_session.png',
                        '../output/feature_importance_lgbm_session.csv')

    # save the predictions
    train_df.loc[:, 'predictions'] = oof_preds_session
    test_df.loc[:, 'predictions'] = sub_preds_session

    del oof_preds_session, sub_preds_session
    gc.collect()

    # save as CSV
    train_df['predictions'].to_csv("../output/oof_lgbm_session.csv")
    test_df['predictions'].to_csv("../output/sub_lgbm_session.csv")

    ############################################################################
    # User Level predictions
    ############################################################################

    print('Starting User Level predictions...')

    if use_pkl:

        del train_df, test_df
        gc.collect()

        # load pkl
        train_df_agg = read_pickles('../output/train_df_agg')
        test_df_agg = read_pickles('../output/test_df_agg')
    else:
        # Aggregate data at User level
        aggregations = {'totals.transactionRevenue': ['sum']}
        for col in feats+['predictions']:
            aggregations[col] = ['sum', 'max', 'min', 'mean']

        train_df_agg = train_df[feats+['fullVisitorId','totals.transactionRevenue', 'predictions']].groupby('fullVisitorId').agg(aggregations)
        del train_df
        gc.collect()

        test_df_agg = test_df[feats + ['fullVisitorId','totals.transactionRevenue', 'predictions']].groupby('fullVisitorId').agg(aggregations)
        del test_df
        gc.collect()

        # reshape header
        train_df_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in train_df_agg.columns.tolist()])
        test_df_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in test_df_agg.columns.tolist()])

        # to float32
        train_df_agg = train_df_agg.astype('float32')
        test_df_agg = test_df_agg.astype('float32')

        # save pkl
        to_pickles(train_df_agg, '../output/train_df_agg', split_size=50, inplace=False)
        to_pickles(test_df_agg, '../output/test_df_agg', split_size=5, inplace=False)

    # Cross validation model
    folds_agg = get_folds(df=train_df_agg[['totals.pageviews_MEAN']].reset_index(), n_splits=num_folds)

    # Create arrays and dataframes to store results
    oof_preds_agg = np.zeros(train_df_agg.shape[0])
    sub_preds_agg = np.zeros(test_df_agg.shape[0])
    feature_importance_df_agg = pd.DataFrame()
    feats_agg = [f for f in train_df_agg.columns if f not in EXCLUDED_FEATURES+['totals.transactionRevenue_SUM']]

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds_agg):
        train_x, train_y = train_df_agg[feats_agg].iloc[train_idx], np.log1p(train_df_agg['totals.transactionRevenue_SUM'].iloc[train_idx])
        valid_x, valid_y = train_df_agg[feats_agg].iloc[valid_idx], np.log1p(train_df_agg['totals.transactionRevenue_SUM'].iloc[valid_idx])

        # set data structure
        lgb_train = lgb.Dataset(train_x,
                                label=train_y,
                                free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x,
                               label=valid_y,
                               free_raw_data=False)

        # params estimated by bayesian opt
        params = {
            'device': 'gpu',
            # 'gpu_use_dp': True,
            'task': 'train',
            'boosting': 'gbdt',
            'objective': 'regression',
            'metric': 'rmse',
            'learning_rate': 0.01,
            'num_leaves': 36,
            'colsample_bytree': 0.174047605805866,
            'subsample': 0.702214902667035,
            'max_depth': 8,
            'reg_alpha': 9.91242460129322,
            'reg_lambda': 0.357672819483952,
            'min_split_gain': 0.631115489088361,
            'min_child_weight': 15,
            'min_data_in_leaf': 9,
            'verbose': -1,
            'seed': int(2**n_fold),
            'bagging_seed': int(2**n_fold),
            'drop_seed': int(2**n_fold)
        }

        reg = lgb.train(params,
                        lgb_train,
                        valid_sets=[lgb_train, lgb_test],
                        valid_names=['train', 'test'],
                        num_boost_round=10000,
                        early_stopping_rounds=200,
                        verbose_eval=100)

        # save model
        reg.save_model('../output/models/lgbm_user_'+str(n_fold)+'.txt')

        oof_preds_agg[valid_idx] = np.expm1(reg.predict(valid_x, num_iteration=reg.best_iteration))
        sub_preds_agg += np.expm1(reg.predict(test_df_agg[feats_agg], num_iteration=reg.best_iteration)) / num_folds

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats_agg
        fold_importance_df["importance"] = np.log1p(reg.feature_importance(importance_type='gain', iteration=reg.best_iteration))
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df_agg = pd.concat([feature_importance_df_agg, fold_importance_df], axis=0)
        print('Fold %2d RMSE : %.6f' % (n_fold + 1, rmse(valid_y, np.log1p(oof_preds_agg[valid_idx]))))
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()

    # print the full RMSE score and send a LINE notification
    full_rmse_agg = rmse(np.log1p(train_df_agg['totals.transactionRevenue_SUM']), np.log1p(oof_preds_agg))
    line_notify('Visitor Level Full RMSE score %.6f' % full_rmse_agg)

    # user level feature importance
    display_importances(feature_importance_df_agg ,
                        '../output/lgbm_importances_agg.png',
                        '../output/feature_importance_lgbm_agg.csv')

    if not debug:
        # save the predictions for the submission data
        test_df_agg.loc[:, 'PredictedLogRevenue'] = sub_preds_agg
        submission = test_df_agg[['PredictedLogRevenue']].copy()
        submission['PredictedLogRevenue'] = np.log1p(submission['PredictedLogRevenue'])
        submission['PredictedLogRevenue'] = submission['PredictedLogRevenue'].apply(
            lambda x: 0.0 if x < 0 else x)
        submission['PredictedLogRevenue'] = submission['PredictedLogRevenue'].fillna(0)
        submission.to_csv(submission_file_name, index=True)

        # save the out-of-fold predictions
        train_df_agg['OOF_PRED'] = oof_preds_agg
        train_df_agg[['OOF_PRED', 'totals.transactionRevenue_SUM']].to_csv(
            oof_file_name, index=True)

        # submit via the API
        submit(submission_file_name, comment='cv: %.6f' % full_rmse_agg)
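Every example reports progress through line_notify. A minimal sketch, assuming the LINE Notify REST API and a token stored in a local file (the token path is hypothetical):

import requests


def line_notify(message):
    # Assumed helper: push a progress message via the LINE Notify API.
    # '../input/line_token.txt' is a hypothetical token location.
    with open('../input/line_token.txt') as f:
        token = f.read().strip()
    requests.post('https://notify-api.line.me/api/notify',
                  headers={'Authorization': 'Bearer ' + token},
                  data={'message': message})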