Пример #1
0
def ieee_cv(logger, df_train, Y, df_test, COLUMN_GROUP, use_cols, params=None, cols_categorical=None, is_adv=False, is_valid=False):
    """Run K-fold cross-validation with ``Classifier`` and persist the results.

    For every fold the model is fitted on the training split, scored on the
    validation split and used to predict the test set.  Out-of-fold
    predictions, per-fold test predictions and feature importances are
    collected; the per-fold test predictions (and the feature importances, if
    any) are written to ``../output`` via ``to_pkl_gzip``.

    Args:
        logger: Logger used to report the per-fold score.
        df_train: Training frame.  Must contain ``use_cols`` and the
            ``COLUMN_GROUP`` column; ``COLUMN_ID`` is read from it when a
            prediction result is built.
        Y: Target values, positionally aligned with ``df_train``.
        df_test: Test frame.  May be empty (``len(df_test) == 0``), in which
            case an empty list is handed to ``Classifier`` as ``x_test``.
        COLUMN_GROUP: Name of the column used for ``GroupKFold`` grouping and
            for the per-fold log label.
        use_cols: Feature columns passed to the model.
        params: Settings dict.  Must contain ``'seed'``, ``'model_type'``,
            ``'n_splits'``, ``'fold'`` (``'stratified'`` or ``'group'``) and
            ``'early_stopping_rounds'``.  The first four are stripped from a
            private copy before the rest is forwarded to ``Classifier``; the
            caller's dict is left untouched.
        cols_categorical: Categorical column names forwarded to
            ``Classifier``.  Defaults to an empty list.
        is_adv: When True, return right after the cross-validation with the
            out-of-fold predictions as a ``pd.Series``.
        is_valid: When True, skip building and saving the prediction /
            submission files.

    Returns:
        A 6-tuple ``(best_iteration, cv_score, df_feim, pred_result,
        score_list, test_preds)``.  With ``is_adv=True`` the tuple is
        ``(0, cv_score, df_feim, pred_result, [], [])``.  ``df_feim`` is an
        empty list when no fold produced feature importances, and
        ``pred_result`` is an empty list when ``is_valid`` is True.

    Raises:
        KeyError: If a required key is missing from ``params``.
        ValueError: If ``params['fold']`` is not a supported strategy.
    """
    start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())[:13]

    # Work on a private copy so the caller's dict survives the call and can be
    # reused for another run.
    params = dict(params) if params is not None else {}
    if cols_categorical is None:
        cols_categorical = []

    seed = params.pop('seed')
    model_type = params.pop('model_type')
    n_splits = params.pop('n_splits')
    validation = params.pop('fold')
    # Read but intentionally left inside params, as before.
    early_stopping_rounds = params['early_stopping_rounds']

    if validation == "stratified":
        # random_state only takes effect together with shuffle=True; recent
        # scikit-learn versions reject a seed on an unshuffled splitter.
        splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
        kfold = list(splitter.split(df_train, Y))
    elif validation == 'group':
        kfold = list(GroupKFold(n_splits=n_splits).split(df_train, Y, df_train[COLUMN_GROUP]))
    else:
        raise ValueError(
            f"Unsupported fold strategy {validation!r}; expected 'stratified' or 'group'."
        )

    score_list = []
    feim_list = []
    best_iteration = 0
    y_pred = np.zeros(len(df_train))
    test_preds = []

    has_test = len(df_test) > 0
    x_test = df_test if has_test else []

    for n_fold, (trn_idx, val_idx) in enumerate(kfold):
        fold_valid = df_train.iloc[val_idx]

        x_train = df_train.iloc[trn_idx][use_cols]
        y_train = Y.iloc[trn_idx]
        x_valid = fold_valid[use_cols]
        y_valid = Y.iloc[val_idx]

        # Label the fold with the most frequent group value in its validation split.
        val_gr = fold_valid[COLUMN_GROUP].value_counts()
        dtm = val_gr.index.tolist()[0]
        print("="*20)
        with timer(f"  * Fold{n_fold} Validation-{COLUMN_GROUP} {dtm}: {val_gr.values[0]}"):
            score, oof_pred, test_pred, feim, best_iter, _ = Classifier(
                model_type=model_type,
                x_train=x_train,
                y_train=y_train,
                x_valid=x_valid,
                y_valid=y_valid,
                # An empty test set is passed through as-is; it cannot be column-indexed.
                x_test=x_test[use_cols] if has_test else x_test,
                params=params,
                early_stopping_rounds=early_stopping_rounds,
                cols_categorical=cols_categorical,
            )

        pb, pv, al = bear_validation(test_pred)

        logger.info(f"  * Fold{n_fold} {dtm}: {score} | Bear's...PB:{pb} PV:{pv} All:{al}")
        print("="*20)

        score_list.append(score)
        # Accumulate the mean best iteration across folds.
        best_iteration += best_iter/n_splits
        y_pred[val_idx] = oof_pred
        test_preds.append(test_pred)

        if len(feim):
            feim.rename(columns={'importance': f'imp_fold{n_fold+1}'}, inplace=True)
            feim.set_index('feature', inplace=True)
            feim_list.append(feim)

    cv_score = np.mean(score_list)
    # Dots are replaced so the score can be embedded in file names.
    cvs = str(cv_score).replace('.', '-')

    # Decide on the collected importances rather than on the last fold only.
    if feim_list:
        df_feim = pd.concat(feim_list, axis=1)
        df_feim['imp_avg'] = df_feim.mean(axis=1)
        df_feim.sort_values(by='imp_avg', ascending=False, inplace=True)
    else:
        df_feim = []

    ## Save
    # Each Fold Test Pred
    to_pkl_gzip(obj=test_preds, path=f'../output/fold_test_pred/{start_time}_Each_Fold__CV{cvs}__feature{len(use_cols)}')
    # Feature Importance
    if feim_list:
        to_pkl_gzip(obj=df_feim, path=f"../output/feature_importances/{start_time}__CV{cvs}__feature{len(use_cols)}")

    #========================================================================
    # Adversarial validation also goes through this function; in that case
    # the work ends here.
    #========================================================================
    if is_adv:
        pred_result = pd.Series(y_pred, index=df_train[COLUMN_ID].values, name='adv_pred_' + start_time)
        return 0, cv_score, df_feim, pred_result, [], []

    with timer("  * Make Prediction Result File."):
        if is_valid:
            pred_result = []
        else:
            # Out-of-fold predictions for train rows followed by the
            # fold-averaged predictions for test rows.
            test_pred_avg = np.mean(test_preds, axis=0)
            all_pred = np.append(y_pred, test_pred_avg)
            all_ids = np.append(df_train[COLUMN_ID].values, df_test[COLUMN_ID].values)
            pred_result = pd.DataFrame([all_ids, all_pred], index=[COLUMN_ID, 'pred_' + start_time]).T
            pred_result[COLUMN_ID] = pred_result[COLUMN_ID].astype('int')

            #========================================================================
            # Save
            #========================================================================
            # Prediction
            to_pkl_gzip(obj=pred_result, path=f"../output/pred_result/{start_time}__CV{cvs}__all_preds")
            # Submit File (test rows only)
            pred_result.columns = [COLUMN_ID, COLUMN_TARGET]
            pred_result.iloc[len(df_train):].to_csv(f"../submit/tmp/{start_time}__CV{cvs}__feature{len(use_cols)}.csv", index=False)

    return best_iteration, cv_score, df_feim, pred_result, score_list, test_preds
Пример #2
0
 
 
 # Build the working training frame: the base frame alone when is_base is set
 # or no feature paths were given, otherwise the base frame joined with the
 # features loaded from valid_path.
 if is_base or len(valid_path)==0:
     tmp_train = df_train.copy()
     feature_name = 'base'
 else:
     df_feat_train = parallel_load_data(valid_path)
     # NOTE(review): assuming pandas, join() aligns on the index, so the loaded
     # features presumably share df_train's index - confirm.
     tmp_train = df_train.join(df_feat_train)
     # Only the first path is used to derive the feature name.
     feature_name = get_filename(valid_path[0])
 
 # Every column not listed in COLUMNS_IGNORE is treated as a usable column.
 use_cols = [col for col in tmp_train.columns if col not in COLUMNS_IGNORE]
 
 # Initialised before the fold loop; how they are updated is not visible here.
 cnt = 0    
 cv = 0
 for fold in range(3):
     with timer('  * Make Dataset'):
         if fold==0:
             train = tmp_train[
                 (tmp_train[COLUMN_GROUP] == '2017-12') | 
                 (tmp_train[COLUMN_GROUP] == '2018-1') | 
                 (tmp_train[COLUMN_GROUP] == '2018-2') | 
                 (tmp_train[COLUMN_GROUP] == '2018-3') | 
                 (tmp_train[COLUMN_GROUP] == '2018-4')
                 ]
             test  = tmp_train[tmp_train[COLUMN_GROUP] == '2018-5']
         elif fold==1:
             train = tmp_train[
                 (tmp_train[COLUMN_GROUP] == '2017-12') | 
                 (tmp_train[COLUMN_GROUP] == '2018-1') | 
                 (tmp_train[COLUMN_GROUP] == '2018-2') | 
                 (tmp_train[COLUMN_GROUP] == '2018-3') |
Пример #3
0
    # Validation targets for this fold, selected by position.
    y_valid = Y.iloc[val_idx]

    # Keep only rows whose COLUMN_TARGET is present, then narrow to the
    # feature columns. x_train / x_valid must still carry COLUMN_TARGET here.
    x_train = x_train[~x_train[COLUMN_TARGET].isnull()][use_cols]
    x_trn_idx = x_train.index
    x_valid = x_valid[~x_valid[COLUMN_TARGET].isnull()][use_cols]
    x_val_idx = x_valid.index
    # Re-align the targets to the surviving rows by index label.
    y_train = y_train.loc[x_trn_idx]
    y_valid = y_valid.loc[x_val_idx]

    # Unfiltered validation rows (including rows with a missing target),
    # passed to Regressor as its first positional argument.
    base_valid = tmp_train.iloc[val_idx][use_cols]

    # Label the fold with the most frequent COLUMN_GROUP value in the
    # validation split and its row count.
    val_gr = tmp_train.iloc[val_idx][COLUMN_GROUP].value_counts()
    dtm = val_gr.index.tolist()[0]
    print("=" * 20)
    with timer(
            f"  * Fold{n_fold} Validation-{COLUMN_GROUP} {dtm}: {val_gr.values[0]}"
    ):
        # The fifth returned value is discarded.
        score, oof_pred, test_pred, feim, _ = Regressor(
            base_valid,
            model_type=model_type,
            x_train=x_train,
            y_train=y_train,
            x_valid=x_valid,
            y_valid=y_valid,
            x_test=x_test,
            params=params,
            early_stopping_rounds=early_stopping_rounds,
        )

    score_list.append(score)
    # NOTE(review): this assignment needs oof_pred to hold one value per entry
    # of val_idx - presumably Regressor predicts on base_valid rather than on
    # the filtered x_valid; confirm against Regressor.
    y_pred[val_idx] = oof_pred