예제 #1
0
def get_oof_feature(oof_path='../oof_feature/*.gz',
                    key='',
                    pred_col='prediction'):
    """Collect one prediction column from every file matching a glob pattern.

    Each matched file is loaded with ``utils.read_pkl_gzip``. The loaded
    frame is indexed by ``key``, its ``pred_col`` column is extracted as a
    Series, and that Series is named ``"oof_"`` plus the name of the frame's
    second column. The Series are then concatenated column-wise, aligned on
    their index.

    Args:
        oof_path: Glob pattern selecting the files to load.
        key: Column used as the index of every loaded frame. The default
            ``''`` only works when a column is literally named ``''``, so
            callers will normally pass the real key column.
        pred_col: Name of the column holding the values to extract.

    Returns:
        DataFrame with one ``oof_*`` column per matched file, in sorted
        path order.

    Raises:
        ValueError: If no file matches ``oof_path``.
    """
    # glob yields paths in arbitrary, filesystem-dependent order; sorting
    # keeps the column order of the result reproducible between runs.
    feat_path_list = sorted(glob.glob(oof_path))
    if not feat_path_list:
        # pd.concat would raise a ValueError here anyway; fail earlier with
        # a message that says which pattern found nothing.
        raise ValueError(
            "No files match oof_path: {}".format(oof_path))

    oof_list = []
    for path in feat_path_list:
        frame = utils.read_pkl_gzip(path)
        # The second column's name labels this file's predictions.
        # NOTE(review): presumably that column is named after the
        # model/feature that produced the predictions -- confirm.
        oof_name = frame.columns.tolist()[1]
        series = frame.set_index(key)[pred_col]
        series.name = "oof_" + oof_name
        oof_list.append(series)

    return pd.concat(oof_list, axis=1)
예제 #2
0
# Column names used throughout this script.
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMN_GROUP = 'DT-M'
# NOTE(review): presumably the columns to exclude from the feature set
# downstream; the list is only defined, not used, in this section.
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, COLUMN_GROUP, 'is_train', 'date']

# Collect the training feature files. Every glob result is sorted because
# glob returns paths in arbitrary, filesystem-dependent order, which would
# make the load order differ between runs and machines.
paths_train = sorted(glob('../feature/raw_use/*_train.gz'))
paths_train += sorted(glob('../feature/org_use/*_train.gz'))
# Optional extra feature sources, currently disabled:
#  paths_train += sorted(glob('../feature/sub_use/*_train.gz'))
#  paths_train += sorted(glob('../feature/valid_use/*_train.gz'))

df_train = parallel_load_data(paths_train)

# Attach the precomputed group labels as the COLUMN_GROUP column.
# NOTE(review): assumes the loaded object lines up with df_train's rows
# (pandas aligns on index if it is a Series) -- confirm.
group_kfold_path = '../input/0908_ieee__DT-M_GroupKFold.gz'
group = read_pkl_gzip(group_kfold_path)
df_train[COLUMN_GROUP] = group


#========================================================================
# Negative Down Sampling
#   Keep every positive row (target == 1) and a random `frac` share of
#   the remaining rows.
#========================================================================
frac = 0.2
# `seed` is not defined in this section; it must already exist in scope.
np.random.seed(seed)
df_pos = df_train[df_train[COLUMN_TARGET] == 1]
df_neg = df_train[df_train[COLUMN_TARGET] != 1]
# Drop the full frame before sampling to lower peak memory usage.
del df_train
gc.collect()
# sample() is called without random_state, so it draws from NumPy's global
# RNG seeded above; int() truncates the requested row count.
df_neg = df_neg.sample(int(df_neg.shape[0] * frac))
df_train = pd.concat([df_pos, df_neg], axis=0)