Example #1
import os
import sys
from pathlib import Path

import numpy as np

import utils  # project helper module providing to_pkl_gzip


def save_feature(df_feat,
                 prefix,
                 dir_save,
                 is_train,
                 auto_type=True,
                 list_ignore=[],
                 is_check=False,
                 is_viz=True):
    """Save each column of df_feat as a gzip-pickled array under ../feature/<dir_save>."""

    DIR_FEATURE = Path('../feature') / dir_save
    length = len(df_feat)
    if is_check:
        for col in df_feat.columns:
            if col in list_ignore:
                continue
            # Check whether the column contains nulls
            n_notnull = df_feat[col].dropna().shape[0]
            if length - n_notnull > 0:
                print(f"{col}  | null count: {length - n_notnull}")
            # Check whether the column contains +/- inf
            max_val = df_feat[col].max()
            min_val = df_feat[col].min()
            if max_val == np.inf or min_val == -np.inf:
                print(f"{col} | max: {max_val} | min: {min_val}")
        print("  * Finish Feature Check.")
        sys.exit()

    for col in df_feat.columns:
        if col in list_ignore:
            continue
        if auto_type:
            feature = df_feat[col].values.astype('float32')
        else:
            feature = df_feat[col].values
        if is_train:
            feat_path = DIR_FEATURE / f'{prefix}__{col}_train'
        else:
            feat_path = DIR_FEATURE / f'{prefix}__{col}_test'

        # Skip features that have already been saved
        if os.path.exists(str(feat_path) + '.gz'):
            continue
        if is_viz:
            print(f"{feature.shape} | {col}")
        utils.to_pkl_gzip(path=str(feat_path), obj=feature)
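
A minimal call sketch (assumptions: pandas/numpy are available, the target directory ../feature/baseline already exists, and utils.to_pkl_gzip appends the '.gz' suffix):

import numpy as np
import pandas as pd

# Toy feature frame; the column names, prefix and dir_save below are purely illustrative.
df_toy = pd.DataFrame({'TransactionAmt_mean': [1.0, 2.5, np.nan],
                       'card1_count': [3, 7, 2]})
save_feature(df_toy,
             prefix='agg',
             dir_save='baseline',
             is_train=True,
             is_check=False)   # pass is_check=True to only audit nulls/inf and exit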
Example #2
import datetime

import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold, StratifiedKFold

# Project-level helpers and constants (Classifier, bear_validation, timer,
# to_pkl_gzip, COLUMN_ID, COLUMN_TARGET) are assumed to be imported elsewhere.


def ieee_cv(logger, df_train, Y, df_test, COLUMN_GROUP, use_cols,
            params={}, cols_categorical=[], is_adv=False, is_valid=False):
    start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())[:13]
    seed = params['seed']
    model_type = params['model_type']
    n_splits = params['n_splits']
    validation = params['fold']
    early_stopping_rounds = params['early_stopping_rounds']
    
    # Work on a copy so the caller's params dict is not mutated across calls
    params = params.copy()
    del params['seed']
    del params['model_type']
    del params['n_splits']
    del params['fold']
#     del params['model_type'], params['n_splits'], params['fold']
    
    if validation == "stratified":
        # shuffle=True is required for random_state to take effect in recent scikit-learn
        kfold = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed).split(df_train, Y))
    elif validation == 'group':
#         tmp_kfold = list(GroupKFold(n_splits=n_splits).split(df_train, Y, df_train[COLUMN_GROUP]))
#         kfold = [tmp_kfold[3], tmp_kfold[5], tmp_kfold[1], tmp_kfold[4], tmp_kfold[2], tmp_kfold[0]]
        kfold = list(GroupKFold(n_splits=n_splits).split(df_train, Y, df_train[COLUMN_GROUP]))
    else:
        raise ValueError(f"Unknown fold type: {validation}")
        
    score_list = []
    feim_list = []
    best_iteration = 0
    y_pred = np.zeros(len(df_train))
    test_preds = []
    
    if len(df_test):
        x_test = df_test
    else:
        # Empty frame keeps the x_test[use_cols] indexing below from failing
        x_test = pd.DataFrame(columns=use_cols)
    
    for n_fold, (trn_idx, val_idx) in enumerate(kfold):
        
#         if n_fold!=3:
#             continue
        
        x_train = df_train.iloc[trn_idx]
        y_train = Y.iloc[trn_idx]
        x_valid = df_train.iloc[val_idx]
        y_valid = Y.iloc[val_idx]
        
#         if n_fold != 0:
#             probing = pd.read_csv('../input/20190929_probing.csv')
#             probing = probing[probing['Probing_isFraud']==1]
#             probing_ids = probing[COLUMN_ID].values
#             y_probing = probing['Probing_isFraud']
#             y_probing.name = COLUMN_TARGET
            
#             probing_train = x_test[x_test[COLUMN_ID].isin(probing_ids)]
#             print(x_train.shape, y_train.shape)
#             x_train = pd.concat([x_train, probing_train], axis=0)
#             y_train = pd.concat([y_train, y_probing], axis=0)
#             print(x_train.shape, y_train.shape)
            
        x_train = x_train[use_cols]
        x_valid = x_valid[use_cols]
        
        # Most frequent COLUMN_GROUP value in this validation fold (used for logging only)
        val_gr = df_train.iloc[val_idx][COLUMN_GROUP].value_counts()
        dtm = val_gr.index.tolist()[0]
        print("="*20)
        with timer(f"  * Fold{n_fold} Validation-{COLUMN_GROUP} {dtm}: {val_gr.values[0]}"):
            score, oof_pred, test_pred, feim, best_iter, _ = Classifier(
                model_type=model_type,
                x_train=x_train,
                y_train=y_train,
                x_valid=x_valid,
                y_valid=y_valid,
                x_test=x_test[use_cols],
                params=params,
                early_stopping_rounds=early_stopping_rounds,
                cols_categorical=cols_categorical
            )
            
#         if not is_adv:
        pb, pv, al = bear_validation(test_pred)
        
        logger.info(f"  * Fold{n_fold} {dtm}: {score} | Bear's...PB:{pb} PV:{pv} All:{al}")
        print("="*20)

        score_list.append(score)
        best_iteration += best_iter/n_splits
        y_pred[val_idx] = oof_pred
        test_preds.append(test_pred)
        
        if len(feim):
            feim.rename(columns={'importance': f'imp_fold{n_fold+1}'}, inplace=True)
            feim.set_index('feature', inplace=True)
            feim_list.append(feim)
        
    cv_score = np.mean(score_list)
    cvs = str(cv_score).replace('.', '-')
    
    if len(feim_list):
        df_feim = pd.concat(feim_list, axis=1)
        df_feim['imp_avg'] = df_feim.mean(axis=1)
        df_feim.sort_values(by='imp_avg', ascending=False, inplace=True)
    else:
        df_feim = []
    
    ## Save
    # Each Fold Test Pred
    to_pkl_gzip(obj=test_preds, path=f'../output/fold_test_pred/{start_time}_Each_Fold__CV{cvs}__feature{len(use_cols)}')
    # Feature Importance
    if len(feim_list):
        to_pkl_gzip(obj=df_feim, path=f"../output/feature_importances/{start_time}__CV{cvs}__feature{len(use_cols)}")
    
    
    #========================================================================
    # Adversarial Validation also goes through this function, so stop here in the adversarial case
    #========================================================================
    if is_adv:
        pred_result = pd.Series(y_pred, index=df_train[COLUMN_ID].values, name='adv_pred_' + start_time)
        return 0, cv_score, df_feim, pred_result, [], []
    
    with timer("  * Make Prediction Result File."):
        if is_valid:
            pred_result = []
        else:
            test_pred_avg = np.mean(test_preds, axis=0)
            all_pred = np.append(y_pred, test_pred_avg)
            all_ids = np.append(df_train[COLUMN_ID].values, df_test[COLUMN_ID].values)
            pred_result = pd.DataFrame([all_ids, all_pred], index=[COLUMN_ID, 'pred_' + start_time]).T
            pred_result[COLUMN_ID] = pred_result[COLUMN_ID].astype('int')
            
            #========================================================================
            # Save
            #========================================================================
            # Prediction
            to_pkl_gzip(obj=pred_result, path=f"../output/pred_result/{start_time}__CV{cvs}__all_preds")
            # Submit File
            pred_result.columns = [COLUMN_ID, COLUMN_TARGET]
            pred_result.iloc[len(df_train):].to_csv(f"../submit/tmp/{start_time}__CV{cvs}__feature{len(use_cols)}.csv", index=False)
    
    return best_iteration, cv_score, df_feim, pred_result, score_list, test_preds
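
A hedged call sketch: logger, df_train, df_test, use_cols and cols_categorical come from the surrounding pipeline, and the target column ('isFraud'), group column ('DT-M') and booster hyperparameters below are illustrative assumptions rather than values taken from this snippet.

params = {
    'seed': 328,
    'model_type': 'lgb',
    'n_splits': 6,
    'fold': 'group',
    'early_stopping_rounds': 200,
    # remaining keys are forwarded to Classifier as booster hyperparameters
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.05,
}
best_iteration, cv_score, df_feim, pred_result, score_list, test_preds = ieee_cv(
    logger, df_train, df_train['isFraud'], df_test,
    COLUMN_GROUP='DT-M', use_cols=use_cols, params=params,
    cols_categorical=cols_categorical)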
Example #3
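        # Fragment from a feature-selection routine: the lines omitted by this excerpt are
        # assumed to build the LightGBM Datasets and open the lgb.train(params, lgb_train, ...)
        # call whose remaining keyword arguments follow. start_time, num_boost_round, use_cols,
        # valid_map, valid_paths_train and to_dir are likewise defined in the omitted code.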
                              valid_sets=lgb_valid,
                              early_stopping_rounds=early_stopping_rounds,
                              num_boost_round=num_boost_round,
                              verbose_eval=200)
        best_iter = estimator.best_iteration

        oof_pred = estimator.predict(x_valid)
        score = roc_auc_score(y_valid, oof_pred)
        cvs = str(score).replace('.', '-')
        feim = get_tree_importance(estimator=estimator,
                                   use_cols=x_train.columns)
        feim.sort_values(by='importance', ascending=False, inplace=True)
        feim['is_valid'] = feim['feature'].map(valid_map)

    #========================================================================
    # PostProcess
    #========================================================================

    with timer("  * PostProcess"):
        to_pkl_gzip(
            obj=feim,
            path=f"../output/selection_feature/{start_time}__CV{cvs}__feature{len(use_cols)}"
        )
        for path in valid_paths_train:
            try:
                shutil.move(path, to_dir)
                shutil.move(path.replace('train', 'test'), to_dir)
            except FileNotFoundError:
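                # feature_name is defined in the part of this function omitted by the excerpt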
                print(feature_name)
Example #4
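    # Fragment: tail of a per-feature validation loop. start_time, feim_list, score_list,
    # use_cols, cnt, cv, feature_name and check_score_path are defined in the omitted earlier lines.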
#             # A feature that could not beat the per-fold baseline score in all three folds was rejected
#             if score < base_fold_score[fold]:
#                 break
#             else:
#                 cnt +=1
#                 cv += score/3

    df_feim = pd.concat(feim_list, axis=1)
    df_feim['imp_avg'] = df_feim.mean(axis=1)
    df_feim.sort_values(by='imp_avg', ascending=False, inplace=True)
    avg_score = str(np.mean(score_list))[:9].replace('.', '-')

    to_pkl_gzip(
        obj=df_feim,
        path=f"../output/feature_importances/{start_time}__bear_valid__CV{avg_score}__feature{len(use_cols)}"
    )

    if cnt == 3:
        with open(check_score_path, 'a') as f:
            line = f'{feature_name},{cv}\n'
            f.write(line)

        df_score = pd.read_csv(check_score_path, header=None)
        if len(df_score) > 2:
            from_dir = 'valid'
            to_dir = 'sub_use'
            df_score.columns = ['feature', 'score']
            df_score.sort_values(by='score', ascending=False, inplace=True)
            best_feature = df_score['feature'].values[0]
Example #5
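    # Fragment: closing lines of a CV loop. feim, feim_list, score_list, test_preds, y_pred,
    # start_time, use_cols, tmp_train, df_test and the COLUMN_* globals come from the omitted
    # surrounding code.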
    feim.rename(columns={'importance': f'imp_fold{n_fold+1}'}, inplace=True)
    feim.set_index('feature', inplace=True)
    feim_list.append(feim)

cv_score = np.mean(score_list)
cvs = str(cv_score).replace('.', '-')
df_feim = pd.concat(feim_list, axis=1)
df_feim['imp_avg'] = df_feim.mean(axis=1)
df_feim.sort_values(by='imp_avg', ascending=False, inplace=True)

## Save
# Feature Importance
to_pkl_gzip(
    obj=df_feim,
    path=f"../output/feature_importances/{start_time}__CV{cvs}__{COLUMN_TARGET}__feature{len(use_cols)}"
)

with timer("  * Make Prediction Result File."):
    test_pred_avg = np.mean(test_preds, axis=0)
    all_pred = np.append(y_pred, test_pred_avg)
    all_ids = np.append(tmp_train[COLUMN_ID].values, df_test[COLUMN_ID].values)
    pred_result = pd.DataFrame([all_ids, all_pred],
                               index=[COLUMN_ID, 'pred_' + start_time]).T
    pred_result[COLUMN_ID] = pred_result[COLUMN_ID].astype('int')

    #========================================================================
    # Save
    #========================================================================
    # Prediction