def corr_check(df=None):
    corr = df.corr(method='pearson')
    print(corr)
    # NOTE: execution stops here; the embedding experiment below only runs
    # if this debug exit is removed.
    sys.exit()
    # corr = corr.sort_index(axis=1)
    # corr = corr.unstack().reset_index().rename(
    #     columns={'level_0': 'feature', 'level_1': 'feature_2', 0: 'corr'})

    importance = pd.read_csv(
        '../output/cv_feature1099_importances_auc_0.8072030486159842.csv'
    )[['feature', 'rank']]
    df = importance.query("rank<=200")
    # df = corr.merge(importance, on='feature', how='inner')
    # df = df.query("rank<=200")
    # importance.rename(columns={'feature': 'feature_2', 'rank': 'rank_2'}, inplace=True)
    # df = df.merge(importance, on='feature_2', how='inner')
    # df['corr'] = np.abs(df['corr'])

    feature_list = df['feature'].drop_duplicates().values
    base = pd.read_csv('../data/base.csv')
    path_list = glob.glob('../features/3_winner/*.npy')

    for i in range(20):
        start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())
        # tmp_feature_list = df.query(f'''feature=="{feat}"''')['feature_2'].values

        ' Sample 10 of the top-ranked features with a fresh seed each round '
        seed = np.random.randint(0, 100000) + 605
        np.random.seed(seed=seed)
        emb_list = np.random.choice(feature_list, 10, replace=False)

        use_paths = []
        for elem in emb_list:
            for path in path_list:
                if path.count(elem):
                    use_paths.append(path)
                    break
        logger.info(f'SELECT PATH: {len(use_paths)}')

        data = make_feature_set(base[unique_id].to_frame(), path='',
                                use_feature=use_paths).set_index(unique_id)

        ' Replace inf and fill NaN with the median before embedding '
        for col in data.columns:
            data[col] = data[col].replace(np.inf, np.nan)
            data[col] = data[col].replace(-1 * np.inf, np.nan)
            data[col] = data[col].fillna(data[col].median())

        # df_emb = UMAP(data=data, D=2)
        df_emb = t_SNE(data=data, D=2)
        df_emb = pd.DataFrame(data=df_emb, columns=['x', 'y'])
        df_emb[unique_id] = base[unique_id].values
        ' The output filename keeps the original "umap" label even when t-SNE is used '
        df_emb.to_csv(f'../output/{start_time}_umap_seed{seed}.csv', index=False)
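
' Reference sketch (assumption): the t_SNE / UMAP helpers called above are '
' defined elsewhere in this repo. A hypothetical minimal equivalent of '
' t_SNE(data, D=2), built on sklearn.manifold.TSNE, would look like: '
def _tsne_sketch(data, D=2):
    """Hypothetical stand-in for t_SNE: embed a numeric DataFrame into D
    dimensions and return an (n_samples, D) ndarray of coordinates."""
    from sklearn.manifold import TSNE
    return TSNE(n_components=D).fit_transform(data.values)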
def check_feature_detail(path):
    base = pd.read_csv('../data/base.csv')
    df = make_feature_set(base[unique_id].to_frame(), path)
    for col in df.columns:
        if col in ignore_features:
            continue
        print(df[col].drop_duplicates().sort_values())
def make_feature_manage_table():
    app_cat_list = application_cat()
    prev_cat_list = previous_cat()
    prev_num_list = previous_num()
    prev_num = pd.Series(prev_num_list, name='prev_num').to_frame()

    ' Build the Cartesian product of app_cat x prev_cat x prev_num '
    table = pd.DataFrame([])
    for app_cat in app_cat_list:
        tmp_table = pd.DataFrame([])
        for prev_cat in prev_cat_list:
            prev_num['prev_cat'] = prev_cat
            if len(tmp_table) == 0:
                tmp_table = prev_num.copy()
            else:
                tmp_table = pd.concat([tmp_table, prev_num], axis=0)
        tmp_table['app_cat'] = app_cat
        if len(table) == 0:
            table = tmp_table
        else:
            table = pd.concat([table, tmp_table], axis=0)
    logger.info(f'table shape: {table.shape}')
    table['make_flg'] = 0

    base = pd.read_csv('../data/base.csv')

    ' Check the feature set '
    path = '../features/f_previous_feature/*.npy'
    path_list = glob.glob(path)
    dataset = make_feature_set(base, path)
    dataset = dataset.set_index(unique_id)

    key_list = ['prev_cat', 'app_cat', 'prev_num']
    key_dict = {}
    key_dict = check_loop(key_dict)

    ' For every existing feature column, flag the (app_cat, prev_cat, prev_num) '
    ' combinations it covers '
    for col in dataset.columns:
        for app_cat in app_cat_list:
            for prev_cat in prev_cat_list:
                for prev_num in prev_num_list:
                    if col.count(app_cat) and col.count(
                            prev_cat) and col.count(prev_num):
                        key = f'{app_cat}_{prev_cat}_{prev_num}'
                        if key_dict[key] == 1:
                            continue
                        elif key_dict[key] == 0:
                            tmp = table.query(f"app_cat=='{app_cat}'").query(
                                f"prev_cat=='{prev_cat}'").query(
                                    f"prev_num=='{prev_num}'")
                            tmp['make_flg'] = 1
                            tmp_2 = table[key_list].merge(tmp, on=key_list, how='left')
                            tmp_2.fillna(0, inplace=True)
                            table['make_flg'] += tmp_2['make_flg'].values
                            logger.info(col)

    logger.info(table.columns)
    table.to_csv(f'../output/{start_time[:12]}prev_cat_num_table.csv', index=False)
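
' Sketch: the nested concat loop above is just a Cartesian product; an '
' equivalent construction (hypothetical helper, columns in a different '
' order, not used by the script) would be: '
def _make_manage_table_sketch(app_cat_list, prev_cat_list, prev_num_list):
    import itertools
    rows = list(itertools.product(app_cat_list, prev_cat_list, prev_num_list))
    table = pd.DataFrame(rows, columns=['app_cat', 'prev_cat', 'prev_num'])
    table['make_flg'] = 0
    return table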
def main():
    # data = pd.read_csv('../data/FULL_OLD_BURO_MMM.csv')
    # path = '../features/3_winner/*.npy'
    path = '../features/1_third_valid/*.npy'
    path = '../features/history/*.npy'
    base = pd.read_csv('../data/base.csv')
    data = make_feature_set(base[unique_id].to_frame(), path)
    # data = make_feature_set(base['is_train'].to_frame(), path)
    # data = make_feature_set(base[[unique_id, target]], path)
    data = make_feature_set(
        base[[unique_id, target, 'is_train', 'is_test', 'valid_no_4']], path)
    logger.info(data.shape)
    # for col in data.columns:
    #     logger.info(f'\n{col}: {len(data[col][data[col]==np.inf])}')

    ' Build a normalized version of the feature set (for NN / LR / EXT) '
    data = data_regulize(df=data,
                         na_flg=1,
                         inf_flg=1,
                         mm_flg=1,
                         float16_flg=1,
                         ignore_feature_list=ignore_features,
                         logger=logger)
    data.to_csv('../data/regular_no_app_2.csv', index=False)
    logger.info(data.shape)
    # logger.info(data.head())

    for col in data.columns:
        logger.info(data[col].drop_duplicates().sort_values())

    data.to_csv('../data/nn_history.csv', index=False)
    # NOTE: the script exits here; everything below is kept for reference.
    sys.exit()

    ' Normalization '
    ' Replace inf '
    ' Fill NaN '
    # logger.info(f'\n{col}: {len(data[col][data[col]==np.inf])}')

    # check_feature_detail(path)
    # sys.exit()

    ' Make a table of each data name and its column names '
    # make_data_columns_table()
    # sys.exit()

    ' Verify the composition of the feature set -> count usage in the created table '
    # dcols = pd.read_csv('../data/data_columns_table.csv')
    # check_feature_elems(dcols, path)
    # sys.exit()

    # make_feature_manage_table()
    # make_individual_feature_set()

    ' Check the correlation between three submission files '
    pred_1 = pd.read_csv(
        '../submit/20180825_204_submit_lgb_rate0.02_1099features_CV0.8082070133827914_LB0.806_early150_iter20000_regular_dima_params.csv'
    ).set_index(unique_id).rename(columns={target: f'{target}_cv8082'})
    pred_2 = pd.read_csv(
        '../submit/20180827_072_submit_lgb_rate0.02_1099features_CV0.80606353200866_LB_early150_iter20000_dart.csv'
    ).set_index(unique_id).rename(columns={target: f'{target}_cv8060_dart'})
    pred_3 = pd.read_csv(
        '../submit/20180825_224_submit_lgb_rate0.02_1099features_CV0.8072030486159842_LB0.808_early150_iter20000_no_regular_dima_params.csv'
    ).set_index(unique_id).rename(columns={target: f'{target}_cv8072'})

    pred = pred_1.join(pred_2).join(pred_3)
    corr_check(df=pred)
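
' Sketch (assumption): data_regulize is defined elsewhere in this repo; '
' judging from the flags passed in main() above, it replaces inf, fills NaN, '
' min-max scales, and downcasts to float16. A hypothetical minimal version: '
def _data_regulize_sketch(df, ignore_feature_list=()):
    for col in df.columns:
        if col in ignore_feature_list:
            continue
        s = df[col].replace([np.inf, -np.inf], np.nan)  # inf_flg
        s = s.fillna(s.median())                        # na_flg (median fill is an assumption)
        if s.max() != s.min():
            s = (s - s.min()) / (s.max() - s.min())     # mm_flg: min-max scaling
        df[col] = s.astype('float16')                   # float16_flg
    return df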
def check_feature_elems(dcols, path):
    prefix_list = ['a_', 'b_', 'ccb_', 'p_', 'is_', 'pos_', 'abp_', 'ap']
    data_list = ['app', 'bureau', 'prev', 'ccb', 'pos', 'is']
    base = pd.read_csv('../data/base.csv')
    df = make_feature_set(base[unique_id].to_frame(),
                          path).set_index(unique_id)
    feature_arr = df.columns
    dcols.sort_values(by=['dname', 'length'], ascending=False, inplace=True)

    ' Keep a dict so the usage count of each column can be tallied '
    col_dict = {}
    for dname in dcols['dname'].drop_duplicates():
        tmp = {}
        for col in dcols.query(f"dname=='{dname}'")['column']:
            tmp[col] = 0
        col_dict[dname] = tmp

    for f in feature_arr:
        ' Map the feature prefix to its source dataset '
        ' (a feature with an unknown prefix reuses the previous dname/dcolumns) '
        if f[:2] == 'a_' or f[:4] == 'abp_' or f[:3] == 'ap_':
            dcolumns = dcols.query("dname=='app'")['column'].values
            dname = 'app'
        elif f[:2] == 'b_':
            dcolumns = dcols.query("dname=='bureau'")['column'].values
            dname = 'bureau'
        elif f[:2] == 'p_':
            dcolumns = dcols.query("dname=='prev'")['column'].values
            dname = 'prev'
        elif f[:4] == 'ccb_':
            dcolumns = dcols.query("dname=='ccb'")['column'].values
            dname = 'ccb'
        elif f[:4] == 'pos_':
            dcolumns = dcols.query("dname=='pos'")['column'].values
            dname = 'pos'
        elif f[:3] == 'is_':
            dcolumns = dcols.query("dname=='is'")['column'].values
            dname = 'is'

        cnt_col_list = []
        ' First, check the columns of the source dataset '
        for col in dcolumns:
            if f.count(col):
                logger.info(f'f:{f} dname:{dname} col:{col}')
                col_dict[dname][col] += 1
                cnt_col_list.append(col)

        ' Then check the columns of the remaining datasets '
        tmp_data_list = data_list.copy()
        tmp_data_list.remove(dname)
        for dname in tmp_data_list:
            for col in dcols.query(f"dname=='{dname}'")['column'].values:
                ' Do not count the same column name twice '
                if col in cnt_col_list:
                    continue
                if f.count(col):
                    logger.info(f'f:{f} dname:{dname} col:{col}')
                    col_dict[dname][col] += 1
                    cnt_col_list.append(col)

    result = pd.DataFrame(col_dict).T.stack().reset_index().rename(columns={
        'level_0': 'dname',
        'level_1': 'feature',
        0: 'cnt'
    })
    result.to_csv(f'../eda/{start_time[:11]}_feature_set_elems.csv', index=False)
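
' Sketch: the prefix -> dname branching above could equivalently be written '
' as a lookup table (hypothetical helper, not used by the script): '
_PREFIX_TO_DNAME = {
    'a_': 'app', 'abp_': 'app', 'ap_': 'app',
    'b_': 'bureau', 'p_': 'prev', 'ccb_': 'ccb',
    'pos_': 'pos', 'is_': 'is',
}


def _dname_of(feature_name):
    ' Longest-prefix match so that e.g. "abp_" is not shadowed by "a_" '
    for prefix in sorted(_PREFIX_TO_DNAME, key=len, reverse=True):
        if feature_name.startswith(prefix):
            return _PREFIX_TO_DNAME[prefix]
    return None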
def main():
    prefix = ''
    level = [unique_id]
    base = pd.read_csv('../data/base.csv')

    ' Merge the feature set used for training '
    path = '../features/3_winner/*.npy'
    dataset = make_feature_set(base, path)
    dataset_columns = list(dataset.columns)
    # dataset.set_index(level, inplace=True)
    # logger.info(f'\nconcat end\ndataset shape: {dataset.shape}')

    ' List of continuous-value columns to impute '
    value_path_list = glob.glob('../features/*.csv')

    ' Impute missing values in each feature and save it as npy '
    score_list = []
    impute_list = []
    null_len_list = []

    ' Paths of already created features (used to skip duplicates) '
    path_list = glob.glob('../features/1_first_valid/*.npy')
    extract_list = []
    for path in path_list:
        if path.count('impute'):
            ' feature_name with the trailing _impute removed '
            filename = re.search(r'/([^/.]*).npy', path).group(1)[:-7]
            extract_list.append(filename)

    for value_path in value_path_list:
        if value_path.count('npy'):
            value = re.search(r'/([^/.]*).npy', value_path).group(1)
        elif value_path.count('csv'):
            value = re.search(r'/([^/.]*).csv', value_path).group(1)

        ' Skip features that have already been created '
        if value in extract_list:
            logger.info(f'{value} already exists.')
            continue

        ' Add features that are not yet in the dataset '
        if value not in dataset_columns:
            if value_path.count('npy'):
                dataset[value] = np.load(value_path)
            elif value_path.count('csv'):
                base = pd.read_csv('../data/base.csv')
                tmp = pd.read_csv(value_path)
                dataset[value] = base[unique_id].to_frame().merge(
                    tmp, on=unique_id, how='left')['TARGET']

        null_len = len(dataset[value]) - len(dataset[value].dropna())
        cv_score = impute_regression(base, level, dataset, value, prefix)

        ' Drop the temporarily added column again '
        if value not in dataset_columns:
            dataset.drop(value, axis=1, inplace=True)

        impute_list.append(value)
        score_list.append(cv_score)
        null_len_list.append(null_len)

        ' Checkpoint the scores once a few features have been processed '
        if len(impute_list) > 2:
            result = pd.Series(impute_list, name='feature').to_frame()
            result['r2_score'] = score_list
            result['null_len'] = null_len_list
            result.to_csv(
                f'../output/{start_time[:11]}_impute_reg_feature_score.csv',
                index=False)
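
' Sketch (assumption): impute_regression is defined elsewhere in this repo. '
' From its use above it is expected to train a regressor on the rows where '
' `value` is observed and return a CV r2 score; the real signature '
' impute_regression(base, level, dataset, value, prefix) differs from this '
' hypothetical minimal version: '
def _impute_regression_sketch(dataset, value):
    from sklearn.linear_model import Ridge
    from sklearn.model_selection import cross_val_score

    feats = [col for col in dataset.columns if col != value]
    X = dataset[feats].fillna(0)
    known = dataset[value].notnull()

    ' CV r2 on the rows where the target column is observed '
    model = Ridge()
    cv_score = cross_val_score(model, X[known], dataset.loc[known, value],
                               scoring='r2', cv=5).mean()

    ' Fill the missing rows with a model trained on the observed rows '
    imputed = dataset[value].copy()
    if (~known).any():
        model.fit(X[known], dataset.loc[known, value])
        imputed[~known] = model.predict(X[~known])
    return imputed, cv_score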