def gen_fea(base_tr_path=None, base_te_path=None):
    # extract features on train+test together, then join them back onto the sampled candidates
    tr = loader.load_df('../input/train.ftr')
    te = loader.load_df('../input/test.ftr')
    df_base = pd.concat([tr, te])
    #df_base = filter_acts_after_last_clk(df_base)
    df_feat = feat_extract(df_base)
    loader.save_df(df_feat, '../feature/df_feat.ftr')

    tr_sample = loader.load_df('../feature/tr_s0_0.ftr')
    te_sample = loader.load_df('../feature/te_s0_0.ftr')
    tr = tr_sample[ID_NAMES].merge(df_feat, on=ID_NAMES, how='left')
    te = te_sample[ID_NAMES].merge(df_feat, on=ID_NAMES, how='left')
    print(tr.shape, te.shape)
    print(tr.head())
    print(te.head())
    print(tr.columns)
    #tr = df_base[pd.notnull(df_base['target'])].reset_index(drop=True)
    #te = df_base[pd.isnull(df_base['target'])].reset_index(drop=True)
    output_fea(tr, te)
def gen_sample(ori, des):
    # keep only clickout rows and expand each impression/price list into one row per candidate item
    df = loader.load_df(ori)
    print(df.shape)
    df = df[df.action_type == 'clickout item']
    print(df.shape)
    df_out = explode(df, ['impressions', 'prices'])
    print(df_out.shape)
    loader.save_df(df_out, des)
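# `explode` above is defined elsewhere in the repo. A minimal sketch of what it is assumed
# to do, given that 'impressions' and 'prices' in the raw logs are '|'-separated strings:
# split the listed columns and emit one row per candidate item. The name `explode_sketch`
# and the details below are assumptions, not the repo's actual implementation.
import numpy as np
import pandas as pd

def explode_sketch(df, cols, sep='|'):
    df = df.reset_index(drop=True).copy()
    for col in cols:
        df[col] = df[col].str.split(sep)
    # repeat every non-list column once per list element, then flatten the list columns
    lens = df[cols[0]].apply(len)
    out = df.drop(columns=cols).loc[df.index.repeat(lens)].reset_index(drop=True)
    for col in cols:
        out[col] = np.concatenate(df[col].to_numpy())
    return out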
def gen_tr_feat():
    df = loader.load_df('../input/sample_train.ftr')
    df['reference'] = df['reference'].astype('int')
    # label: the displayed impression equals the clicked reference
    df['target'] = (df['reference'] == df['impressions']).astype(int)
    df.drop(['reference', 'action_type'], axis=1, inplace=True)
    # keep only the last clickout step of each session
    df_session = df[['session_id', 'step']].drop_duplicates(subset='session_id', keep='last').reset_index(drop=True)
    df = df_session.merge(df, on=['session_id', 'step'], how='left').reset_index(drop=True)
    loader.save_df(df, '../input/tr.ftr')
def merge_fea(tr_list, te_list):
    tr = loader.merge_fea(tr_list, primary_keys=ID_NAMES)
    te = loader.merge_fea(te_list, primary_keys=ID_NAMES)
    print(tr.head())
    print(te.head())
    loader.save_df(tr, tr_out_path)
    loader.save_df(te, te_out_path)
def merge_fea(tr_list, te_list):
    tr = loader.merge_fea(tr_list, primary_keys=ID_NAMES)
    te = loader.merge_fea(te_list, primary_keys=ID_NAMES)
    tr['impressions'] = tr['impressions'].astype('int')
    te['impressions'] = te['impressions'].astype('int')
    print(tr.head())
    print(te.head())
    print(tr[ID_NAMES].head())
    loader.save_df(tr, tr_out_path)
    loader.save_df(te, te_out_path)
def filter_useless_data():
    tr = loader.load_df('../input/train.ftr')
    te = loader.load_df('../input/test.ftr')
    tr = remove_repeated_session_in_tr(tr, te)
    tr = remove_invalid_reference(tr)
    te = remove_invalid_reference(te)
    tr = remove_acts_after_last_clk(tr, is_te=False)
    te = remove_acts_after_last_clk(te, is_te=True)
    loader.save_df(tr, '../input/train.ftr')
    loader.save_df(te, '../input/test.ftr')
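# The `remove_*` helpers above live elsewhere in the repo. As one example, a minimal sketch
# of `remove_acts_after_last_clk`, assuming it truncates every session at its last clickout
# (in test, the clickout whose reference is null is the one to predict). The body below is
# an assumption, not the repo's actual implementation.
import pandas as pd

def remove_acts_after_last_clk_sketch(df, is_te=False):
    clk = df[df['action_type'] == 'clickout item']
    if is_te:
        # in the test log the clickout to predict has a missing reference
        clk = clk[pd.isnull(clk['reference'])]
    last_clk = clk.groupby('session_id', as_index=False)['step'].max() \
                  .rename(columns={'step': 'last_clk_step'})
    df = df.merge(last_clk, on='session_id', how='left')
    df = df[df['step'] <= df['last_clk_step']]
    return df.drop(columns=['last_clk_step']).reset_index(drop=True)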
def merge_val(file_path, sub_name, fold_num):
    file_list = os.listdir(file_path)
    paths = ['{}_cv_{}.csv'.format(sub_name, i) for i in range(1, fold_num + 1)]
    print(paths)
    dfs = []
    for path in paths:
        assert path in file_list, '{} not exist'.format(path)
        path = '{}/{}'.format(file_path, path)
        dfs.append(loader.load_df(path))
    df = pd.concat(dfs)
    print(df.head())
    print(df.describe())
    out_path = '{}/{}_cv.ftr'.format(file_path, sub_name)
    loader.save_df(df, out_path)
def sub_convert(df_path, pred_path, out_path1, out_path2):
    te_data = loader.load_df(df_path)
    df_pred = loader.load_df(pred_path)
    # order candidates by predicted score within each description
    sort_df_pred = df_pred.sort_values(['description_id', 'target'], ascending=False)
    df_pred = df_pred[['description_id']].drop_duplicates() \
        .merge(sort_df_pred, on=['description_id'], how='left')
    # keep the top-3 papers per description and join them into one comma-separated field
    df_pred['rank'] = df_pred.groupby('description_id').cumcount().values
    df_pred = df_pred[df_pred['rank'] < 3]
    df_pred = df_pred.groupby(['description_id'])['paper_id'] \
        .apply(lambda s: ','.join(s)).reset_index()
    df_pred = te_data[['description_id']].merge(df_pred, on=['description_id'], how='left')
    loader.save_df(df_pred, out_path1)
def output_fea(tr, te):
    # reorder features so the output column order stays consistent
    # ...
    # the feature file keeps only the primary keys & the features added in this run
    #primary_keys = ['session_id', 'impressions']
    #fea_cols = []
    #required_cols = primary_keys + fea_cols
    # write the features out
    #tr = tr[required_cols]
    #te = te[required_cols]
    print(tr.head())
    print(te.head())
    loader.save_df(tr, tr_fea_out_path)
    loader.save_df(te, te_fea_out_path)
def merge_sub(file_path, sub_name, fold_num):
    file_list = os.listdir(file_path)
    paths = ['{}_{}.csv'.format(sub_name, i) for i in range(1, fold_num + 1)]
    print(paths)
    df = pd.DataFrame()
    for i, path in enumerate(paths):
        assert path in file_list, '{} not exist'.format(path)
        path = '{}/{}'.format(file_path, path)
        if i == 0:
            df = loader.load_df(path)
        else:
            df[TARGET_NAME] += loader.load_df(path)[TARGET_NAME]
    # average the per-fold predictions
    df[TARGET_NAME] /= fold_num
    print(df.head())
    print(df.describe())
    out_path = '{}/{}.ftr'.format(file_path, sub_name)
    loader.save_df(df, out_path)
def merge_fea(tr_list, te_list):
    tr = loader.merge_fea_v2(tr_list, primary_keys=ID_NAMES)
    te = loader.merge_fea_v2(te_list, primary_keys=ID_NAMES)
    tr['impressions'] = tr['impressions'].astype('int')
    te['impressions'] = te['impressions'].astype('int')
    # attach the fold id (cv) to train rows; test rows all get fold 0
    tr_sample = loader.load_df('../feature/tr_s0_0.ftr')
    tr_sample = tr_sample[ID_NAMES + ['cv']]
    tr = tr.merge(tr_sample, on=ID_NAMES, how='left')
    te['cv'] = 0
    print(tr.head())
    print(te.head())
    print(tr[ID_NAMES].head())
    loader.save_df(tr, tr_out_path)
    loader.save_df(te, te_out_path)
def gen_fea(base_tr_path=None, base_te_path=None):
    tr = loader.load_df('../input/train.ftr')
    te = loader.load_df('../input/test.ftr')
    df_base = pd.concat([tr, te])
    df_feat = feat_extract(df_base)
    loader.save_df(df_feat, '../feature/df_feat.ftr')

    tr_sample = loader.load_df('../feature/tr_s0_0.ftr')
    te_sample = loader.load_df('../feature/te_s0_0.ftr')
    # session-level features are joined onto every candidate row of the sample
    tr = tr_sample[ID_NAMES].merge(df_feat, on='session_id', how='left')
    te = te_sample[ID_NAMES].merge(df_feat, on='session_id', how='left')
    print(tr.shape, te.shape)
    print(tr.head())
    print(te.head())
    print(tr.columns)
    output_fea(tr, te)
def gen_samples(paper, tr_desc_path, tr_recall_path, fea_out_path):
    tr_desc = loader.load_df(tr_desc_path)
    tr = loader.load_df(tr_recall_path)
    # tr = tr.head(1000)
    tr = tr.merge(paper, on=['paper_id'], how='left')
    tr = tr.merge(tr_desc[['description_id', 'quer_key', 'quer_all']],
                  on=['description_id'], how='left')
    print(tr.columns)
    print(tr.head())
    tr_feat = multi_process_feat(tr)
    loader.save_df(tr_feat, fea_out_path)
    tr = tr.merge(tr_feat, on=ID_NAMES, how='left')
    # drop raw text (object dtype) columns that are not key columns
    del_cols = [
        col for col in tr.columns
        if tr[col].dtype == 'O' and col not in ID_NAMES
    ]
    print('tr del cols', del_cols)
    return tr.drop(del_cols, axis=1)
def process(in_path, k):
    df = loader.load_df(in_path)
    df = topk_lines(df, k)
    df['sim_score'] = df['sim_score'].astype('float')
    df.rename(columns={'sim_score': 'corp_sim_score'}, inplace=True)
    return df


if __name__ == "__main__":
    ts = time.time()
    tr_path = '../../feat/tr_tfidf_30.ftr'
    te_path = '../../feat/te_tfidf_30.ftr'
    cv = loader.load_df('../../input/cv_ids_0109.csv')[['description_id', 'cv']]
    tr = process(tr_path, k=50)
    tr = tr.merge(cv, on=['description_id'], how='left')
    te = process(te_path, k=50)
    te['cv'] = 0
    loader.save_df(tr, '../../feat/tr_samples_30-50.ftr')
    loader.save_df(te, '../../feat/te_samples_30-50.ftr')
    print('all completed: {}, cost {}s'.format(datetime.now(),
                                               np.round(time.time() - ts, 2)))
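# `topk_lines` is defined elsewhere in the repo. A minimal sketch, assuming it keeps the
# first k recalled papers per description_id (the recall file is assumed to already be
# ordered by similarity within each query); the name `topk_lines_sketch` and the details
# below are assumptions, not the actual implementation.
def topk_lines_sketch(df, k, group_col='description_id'):
    df = df.copy()
    df['rank'] = df.groupby(group_col).cumcount()
    return df[df['rank'] < k].drop(columns=['rank']).reset_index(drop=True)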
def output_fea(tr, te):
    print(tr.head())
    print(te.head())
    loader.save_df(tr, tr_fea_out_path)
    loader.save_df(te, te_fea_out_path)
# add vec sim features
if __name__ == "__main__":
    ts = time.time()
    tqdm.pandas()
    print('start time: %s' % datetime.now())
    paper = loader.load_df('../../input/paper_input_final.ftr')
    paper['abst'] = paper['abst'].apply(lambda s: s.replace('no_content', ''))
    # build the paper corpus text; use str.replace so the ';' delimiters inside the
    # keyword string are turned into spaces (a plain Series.replace would only match
    # cells that are exactly ';')
    paper['corp'] = paper['abst'] + ' ' + paper['titl'] + ' ' + \
        paper['keywords'].fillna('').str.replace(';', ' ')

    tr_desc_path = '../../input/tr_input_final.ftr'
    te_desc_path = '../../input/te_input_final.ftr'
    tr_recall_path = '../../feat/tr_s0_30-50.ftr'
    te_recall_path = '../../feat/te_s0_30-50.ftr'

    tr = gen_samples(paper, tr_desc_path, tr_recall_path, tr_fea_out_path)
    print(tr.columns)
    print([col for col in tr.columns if tr[col].dtype == 'O'])
    loader.save_df(tr, tr_out_path)

    te = gen_samples(paper, te_desc_path, te_recall_path, te_fea_out_path)
    print(te.columns)
    loader.save_df(te, te_out_path)
    print('all completed: {}, cost {}s'.format(datetime.now(),
                                               np.round(time.time() - ts, 2)))
def gen_tr_click():
    df = loader.load_df('../input/sample_train.ftr')
    # the last reference of each session is the item that was actually clicked
    df = df[['session_id', 'reference']].drop_duplicates(subset='session_id', keep='last').reset_index(drop=True)
    print(df.shape)
    loader.save_df(df, '../input/tr_click.ftr')
def get_te_feat():
    df = loader.load_df('../input/sample_test.ftr')
    # test rows to predict are the clickouts whose reference is missing
    df = df[pd.isnull(df['reference'])].reset_index(drop=True)
    print(df.shape)
    df.drop(['reference', 'action_type'], axis=1, inplace=True)
    loader.save_df(df, '../input/te.ftr')
if __name__ == "__main__":
    print('start time: %s' % datetime.now())
    tr = loader.load_df('../../../input/train.csv')
    te = loader.load_df('../../../input/test.csv')
    item_meta = loader.load_df('../../../input/item_metadata.csv')
    loader.save_df(tr, '../input/train.ftr')
    loader.save_df(te, '../input/test.ftr')
    loader.save_df(item_meta, '../input/item_metadata.ftr')
    filter_useless_data()
    print('all completed: %s' % datetime.now())
    'prices_div_active_items-session_id_by_prices_median-v2',
    'price_rank',
    'act_pre1',
    'lastest_item-impr_rank',
    'impr_rank_sub_impressions_by_impr_rank_median',
    'price_div',
    'session_act_sum',
    'prices_div_active_items-session_id_by_prices_median',
    'impressions_by_hist_interaction item info_sum',
    'impressions_by_hist_interaction item image_sum',
    'impressions_by_hist_clickout item_sum',
    'session_id_by_prices_count',
    'impressions_target',
    'impressions_active_ratio',
    'price_div_impr_rank_1_price',
    'impressions_target_sub_session_median',
    'impressions_target_sub_session_max',
    'session_hist_clickout item',
    'device',
    'impr_rank_sub_session_id_by_impr_rank_median',
    'session_hist_interaction item image',
    'impr_rank_1_impressions_target',
    'impr_rank_sub_session_id_by_impr_rank_max',
    'impr_rank_sub_session_id_by_impr_rank_min',
    'current_filters'
]
tr = tr[cols]
te = te[cols]
# prefix every feature column (everything after the two key columns) with the model tag m2_
tr.columns = ['session_id', 'impressions'] + \
    ['m2_{}'.format(c) for c in tr.columns.tolist()[2:]]
te.columns = ['session_id', 'impressions'] + \
    ['m2_{}'.format(c) for c in te.columns.tolist()[2:]]
loader.save_df(tr, '../../../feat/m2_tr_top30_fea.ftr')
loader.save_df(te, '../../../feat/m2_te_top30_fea.ftr')
print('all completed: %s' % datetime.now())
tr = loader.load_df(input_root_path + 'tr_input_final.ftr')
tr = tr[~pd.isnull(tr['description_id'])]
# tr = tr.head(1000)
tr_desc, tr_desc_ids = [dictionary.doc2bow(line.split(' ')) for line in tr['quer_all'].tolist()], \
    tr['description_id'].tolist()
print('gen tf completed, cost {}s'.format(np.round(time.time() - ts, 2)))
tr_samples = gen_samples(tr, tr_desc, tr_desc_ids,
                         corpus_list, paper_ids_list, k=50)
tr_samples = tr.rename(columns={'paper_id': 'target_paper_id'}) \
    .merge(tr_samples, on='description_id', how='left')
# a recalled paper is a positive sample when it matches the ground-truth paper
tr_samples.loc[tr_samples['target_paper_id'] == tr_samples['paper_id'], 'target'] = 1
loader.save_df(tr_samples[out_cols], train_out_path)
print('recall succ {} from {}'.format(tr_samples['target'].sum(), tr.shape[0]))
print(tr.shape, tr_samples.shape)

if sys.argv[1] in ['te']:
    # for te ins
    te = loader.load_df(input_root_path + 'te_input_final.ftr')
    te = te[~pd.isnull(te['description_id'])]
    # te = te.head(1000)
    te_desc, te_desc_ids = [dictionary.doc2bow(line.split(' ')) for line in te['quer_all'].tolist()], \
        te['description_id'].tolist()
    print('gen tf completed, cost {}s'.format(np.round(time.time() - ts, 2)))