def get_train_test(feat_path_list, base=None, target='target'):
    """Assemble a feature table and split it into train and test rows.

    The feature files in ``feat_path_list`` are loaded with
    ``utils.parallel_load_data`` and concatenated column-wise, then placed
    to the right of ``base`` (when given). Rows whose ``target`` value is
    non-null form the train set; rows whose ``target`` is null form the
    test set.

    Args:
        feat_path_list: Paths of the feature files to load. May be empty,
            in which case only ``base`` is used and the loader is not called.
        base: Optional DataFrame placed to the left of the loaded features.
            Defaults to ``None`` (no base frame).
        target: Name of the column whose null-ness separates train from test.

    Returns:
        A ``(train, test)`` tuple of DataFrames, each with a fresh 0..n-1
        index.

    Raises:
        ValueError: If neither ``base`` nor any feature path is supplied.
        KeyError: If ``target`` is not a column of the assembled table.
    """
    parts = []

    if base is not None:
        print(base.shape)
        parts.append(base)

    if feat_path_list:
        feature_list = utils.parallel_load_data(path_list=feat_path_list)
        # Join the loaded features first, then attach them to `base` below,
        # mirroring the original two-step concatenation.
        parts.append(pd.concat(feature_list, axis=1))

    if not parts:
        raise ValueError(
            'get_train_test needs a base frame or at least one feature path'
        )

    df_feat = pd.concat(parts, axis=1)

    # A null target marks a test row.
    is_test = df_feat[target].isnull()
    train = df_feat[~is_test].reset_index(drop=True)
    test = df_feat[is_test].reset_index(drop=True)
    return train, test
# Output locations used by this script.
save_file_path = '../output/valid_single_feature.csv'
check_score_path = 'check_score.csv'

# Column names shared across the script.
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMN_GROUP = 'DT-M'
# Presumably the columns that are not treated as model features -- the
# consumer of this list is outside this span; confirm before relying on it.
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, COLUMN_GROUP, 'is_train', 'date']

# Training feature files. glob() returns matches in an arbitrary,
# filesystem-dependent order, so every pattern is sorted to keep the list
# handed to parallel_load_data reproducible between runs and machines.
paths_train = sorted(glob('../feature/raw_use/*_train.gz'))
paths_train += sorted(glob('../feature/org_use/*_train.gz'))
# Optional feature sets, currently disabled:
# paths_train += sorted(glob('../feature/sub_use/*_train.gz'))
# paths_train += sorted(glob('../feature/valid_use/*_train.gz'))

df_train = parallel_load_data(paths_train)

# Attach the stored group assignment as the COLUMN_GROUP column.
group_kfold_path = '../input/0908_ieee__DT-M_GroupKFold.gz'
group = read_pkl_gzip(group_kfold_path)
df_train[COLUMN_GROUP] = group

#========================================================================
# Negative Down Sampling
#========================================================================
# `frac` is not consumed within this span; presumably it is the share of
# negative rows kept by the sampling step that follows -- confirm.
frac = 0.2
# `seed` is not defined in this span and must already be in scope.
np.random.seed(seed)

# Split on the target column: rows equal to 1 versus everything else.
df_pos = df_train[df_train[COLUMN_TARGET] == 1]
df_neg = df_train[df_train[COLUMN_TARGET] != 1]

# Drop the combined frame now that it has been split, then collect garbage.
del df_train
gc.collect()
# NOTE(review): the four lines below are the tail of a function whose `def`
# line lies outside this view; they are kept unchanged and laid out as a
# function body.
# NOTE(review): str.count('') returns len(s) + 1, so if `path` is a str this
# condition is always truthy and the else branch is unreachable -- confirm
# which substring was meant to be counted.
    if path.count(''):
        return True
    else:
        return False

# Collect the train and test files found under ../submit/re_sub.
# NOTE(review): glob() does not guarantee any ordering and the two lists are
# globbed independently -- confirm that parallel_load_data does not depend on
# the train and test paths arriving in matching order.
paths_train = glob('../submit/re_sub/*_train.gz')
paths_test = glob('../submit/re_sub/*_test.gz')
# Narrower file selections, currently disabled:
# paths_train += glob('../submit/re_sub/Tran*_train.gz')
# paths_test += glob('../submit/re_sub/Tran*_test.gz')
# paths_train += glob('../submit/re_sub/is*_train.gz')
# paths_test += glob('../submit/re_sub/is*_test.gz')
# Report how many train files matched.
print(len(paths_train))

df_train = parallel_load_data(paths_train)
df_test = parallel_load_data(paths_test)

### DT-M
# Attach the stored group assignment as the COLUMN_GROUP column of the train
# frame. COLUMN_GROUP and COLUMN_TARGET are not defined in this span and must
# already be in scope.
group_kfold_path = '../input/0908_ieee__DT-M_GroupKFold.gz'
group = read_pkl_gzip(group_kfold_path)
df_train[COLUMN_GROUP] = group
Y = df_train[COLUMN_TARGET]

# Index 0 selects True; switching the index to 1 selects False.
is_submit = [True, False][0]
n_splits = 6
set_type = 'new_set'
# Plain aliases, not copies: tmp_train / tmp_test refer to the same objects as
# df_train / df_test, so in-place changes through one name show up in the other.
tmp_train = df_train
tmp_test = df_test