def eval_check_feature(df_train, df_test, is_corr=False):
    """Drop features that carry no information or duplicate another feature.

    Runs ``drop_unique_feature()`` over the train/test frames and moves every
    flagged feature file from the 'valid' directory to 'valid_trush'.
    (Original comment: "remove features that carry no information or are
    duplicates".)

    Args:
        df_train: training-set DataFrame.
        df_test: test-set DataFrame.
        is_corr: unused in this body; kept for caller compatibility.

    Returns:
        list: feature names reported by ``drop_unique_feature()``.

    Raises:
        FileNotFoundError: propagated from ``move_feature()`` when a feature
            file is missing.  NOTE(review): the original wrapped the move in
            a ``try/except FileNotFoundError`` whose handler re-assigned the
            identical ('valid', 'valid_trush') pair and retried the same
            call — the retry could only re-raise the same error, so it has
            been removed; the observable exception behavior is unchanged.
    """
    print("* Check Unique Feature.")
    list_unique_drop = drop_unique_feature(df_train, df_test)
    if len(list_unique_drop):
        print(f" * {len(list_unique_drop)}feature unique drop and move trush")
        print(list_unique_drop)
        # Deduplicate before moving; every flagged feature goes to the
        # same trash directory.
        for col in set(list_unique_drop):
            move_feature([col], 'valid', 'valid_trush')
    return list_unique_drop
# NOTE(review): fragment of a larger feature-evaluation loop; indentation was
# reconstructed from a whitespace-collapsed source line — confirm the nesting
# against the original file.  Assumes in scope: cnt, check_score_path,
# feature_name, cv, valid_path, move_feature(), timer(), pd, os, shutil.

# Every time the fold counter reaches 3, record this feature's CV score,
# then promote the current best-scoring feature.
if cnt==3:
    # Append "feature,score" to the running score file.
    with open(check_score_path, 'a') as f:
        line = f'{feature_name},{cv}\n'
        f.write(line)
    df_score = pd.read_csv(check_score_path, header=None)
    if len(df_score)>2:
        from_dir = 'valid'
        to_dir = 'sub_use'
        df_score.columns = ['feature', 'score']
        # Highest CV score first.
        df_score.sort_values(by='score', ascending=False, inplace=True)
        best_feature = df_score['feature'].values[0]
        # Strip the '_train' suffix so move_feature() receives the base name.
        if best_feature.count('_train'):
            best_feature = best_feature.replace('_train', '')
        move_feature([best_feature], from_dir, to_dir)
        # Reset the score file for the next evaluation round.
        os.system(f'rm {check_score_path}')
        os.system(f'touch {check_score_path}')

#========================================================================
# PostProcess
#========================================================================
to_dir = '../feature/check_trush/'
with timer(" * PostProcess"):
    # Move each checked train feature file (and its matching test file)
    # into the trash directory.
    for path in valid_path:
        try:
            shutil.move(path, to_dir)
            shutil.move(path.replace('_train', '_test'), to_dir)
        except FileNotFoundError:
            # Pair file missing — log the feature name and continue.
            print(feature_name)
# NOTE(review): fragment cut at both edges (the enclosing function header and
# the end of the `params` dict are outside this excerpt); indentation was
# reconstructed from a whitespace-collapsed source line — verify nesting.
tmp_train = df_train.join(df_feat_train)
tmp_test = df_test.join(df_feat_test)

#========================================================================
# Exclude features that exist in only one of Train / Test
#========================================================================
diff_cols = list(set(tmp_train.columns) - set(tmp_test.columns))
for col in list(set(diff_cols)):
    # 'raw' features and 'org' features live in different directory pairs.
    if col.count('raw'):
        from_dir = 'raw_use'
        to_dir = 'raw_trush'
    else:
        from_dir = 'org_use'
        to_dir = 'org_trush'
    move_feature([col], from_dir, to_dir)
tmp_train.drop(diff_cols, axis=1, inplace=True)

#========================================================================
# GroupKFold
#========================================================================
group_kfold_path = '../input/0908_ieee__DT-M_GroupKFold.gz'
group = read_pkl_gzip(group_kfold_path)
# Attach the precomputed group labels used for grouped CV splitting.
tmp_train[COLUMN_GROUP] = group

# same_user_path = '../output/same_user_pattern/20190901_user_ids_share.csv'
# same_user_path = '../output/same_user_pattern/0902__same_user_id__card_addr_pemail_M.csv'

model_type = "lgb"
params = {
    'n_jobs': 64,
    # 'n_jobs': 48,
# NOTE(review): fragment — the `if` matching the `else:` below is above this
# excerpt; indentation was reconstructed from a whitespace-collapsed source
# line — verify nesting against the original file.
    tmp_train = df_train.join(df_feat_train)
    tmp_test = df_test.join(df_feat_test)
else:
    # No extra feature frames to join; use the base frames as-is.
    tmp_train = df_train
    tmp_test = df_test

#========================================================================
# Exclude features that exist in only one of Train / Test
#========================================================================
diff_cols = list(set(tmp_train.columns) - set(tmp_test.columns))
for col in list(set(diff_cols)):
    from_dir = 'valid'
    to_dir = 'valid_trush'
    move_feature([col], from_dir, to_dir)
tmp_train.drop(diff_cols, axis=1, inplace=True)
print(f" * Diff Features: {len(diff_cols)}")

# same_user_path = '../output/same_user_pattern/20190901_user_ids_share.csv'
# same_user_path = '../output/same_user_pattern/0902__same_user_id__card_addr_pemail_M.csv'

group_kfold_path = '../input/0908_ieee__DT-M_GroupKFold.gz'
group = read_pkl_gzip(group_kfold_path)
# Attach the precomputed group labels used for grouped CV splitting.
tmp_train[COLUMN_GROUP] = group

# Try excluding 2017-12
# if not has_dec:
#     tmp_train = tmp_train[tmp_train[COLUMN_GROUP]!='2017-12']
#     Y = Y.loc[tmp_train.index]

# n_splits = 5