def gen_samples(paper, tr_desc_path, tr_recall_path):
    """Build samples from recall candidates: join paper metadata and query
    text, compute 'keyw' features, and return IDs + numeric features only.

    Args:
        paper: paper metadata frame keyed by 'paper_id'.
        tr_desc_path: path to the description frame (has quer_key/quer_all).
        tr_recall_path: path to the recall candidate frame.
    Returns:
        The recall frame with 'keyw' features merged in and all remaining
        object-dtype (text) columns dropped.
    """
    tr_desc = loader.load_df(tr_desc_path)
    tr = loader.load_df(tr_recall_path)
    # tr = tr.head(1000)
    print(tr.head())
    # Attach paper metadata and query text needed by the feature workers.
    tr_feat = tr[ID_NAMES].merge(paper, on=['paper_id'], how='left')
    tr_feat = tr_feat.merge(tr_desc[['description_id', 'quer_key', 'quer_all']],
                            on=['description_id'], how='left')
    tr_feat = multi_process_feat(tr_feat)
    # Keep only the 'keyw' feature columns (plus the ID columns).
    del_cols = [
        col for col in tr_feat.columns
        if 'keyw' not in col and col not in ID_NAMES
    ]
    tr_feat.drop(del_cols, axis=1, inplace=True)
    print(tr_feat.head())
    tr = tr.merge(tr_feat, on=ID_NAMES, how='left')
    print(tr.head())
    print(len(tr.columns), tr.columns.tolist())
    # Drop any remaining object-dtype (text) columns so output is numeric.
    del_cols = [
        col for col in tr.columns
        if tr[col].dtype == 'O' and col not in ID_NAMES
    ]
    print('tr del cols', del_cols)
    return tr.drop(del_cols, axis=1)
def feat_extract(df):
    """Per-impression 'active ratio': across sessions, the mean of a 0/1
    flag marking whether the item was interacted with before the final
    clickout.

    NOTE(review): assumes ID_NAMES == ['session_id', 'impressions'] — the
    'reference' column is renamed into the 'impressions' slot below; verify.
    Returns ID_NAMES + ['impressions_active_ratio'].
    """
    tr = loader.load_df('../input/tr.ftr')
    te = loader.load_df('../input/te.ftr')
    df_sample = pd.concat([tr, te])
    actions = ['interaction item image', 'interaction item info', \
        'interaction item deals', 'interaction item rating', \
        'search for item', 'clickout item']
    df = df[df.action_type.isin(actions)]
    df = df[~pd.isnull(df.reference)]
    df = df[['session_id', 'reference', 'step']]
    df.columns = ID_NAMES + ['step']
    df['impressions'] = df['impressions'].astype('int')
    # step_y becomes the session's sampled (final clickout) step.
    df = df.merge(df_sample[['session_id', 'step']].drop_duplicates(), \
        on='session_id', how='left')
    # Filter out the final-click sample: keep only earlier actions.
    df = df[df.step_x < df.step_y]
    df_feat = df_sample[ID_NAMES] \
        .merge(df[ID_NAMES + ['step_x']].drop_duplicates(subset=ID_NAMES), \
        on=ID_NAMES, how='left')
    # actived = 1 iff the (session, item) pair had any prior interaction
    # (step_x is NaN -> 0 otherwise min(step, 1) == 1 since steps >= 1).
    df_feat['actived'] = df_feat['step_x'].fillna(0)
    df_feat['actived'] = list(df_feat['actived'].apply(lambda x: min(x, 1)))
    print(df_feat['actived'].describe())
    # Mean activation rate per item across all sessions showing it.
    df_feat['impressions_active_ratio'] = \
        df_feat.groupby('impressions')['actived'].transform('mean')
    df_feat = df_feat[ID_NAMES + ['impressions_active_ratio']]
    print(df_feat.shape)
    print(df_feat.head())
    print(df_feat.columns.tolist())
    return df_feat
def feat_extract(df):
    """Attach precomputed item-to-item similarity between each impression
    and the session's most recently interacted item ('act_item_pre_1').

    Returns one row per (session_id, impressions) with similarity columns
    prefixed 'act_item_pre_1-'.
    """
    tr = loader.load_df('../input/tr.ftr')
    te = loader.load_df('../input/te.ftr')
    df_sample = pd.concat([tr, te])
    df_feat = df_sample[ID_NAMES].drop_duplicates()
    actions = ['interaction item image', 'interaction item info', \
        'interaction item deals', 'interaction item rating', \
        'search for item', 'clickout item']
    df = df[df.action_type.isin(actions)]
    df = df[~pd.isnull(df.reference)]
    df = df[['session_id', 'reference', 'step']]
    df.columns = ID_NAMES + ['step']
    df['impressions'] = df['impressions'].astype('int')
    # step_y becomes the session's sampled (final clickout) step.
    df = df.merge(df_sample[['session_id', 'step']].drop_duplicates(), \
        on='session_id', how='left')
    # Filter out the final-click sample: keep only earlier actions.
    df = df[df.step_x < df.step_y]
    # Last pre-click interaction per session -> 'act_item_pre_1' item id.
    df = df[ID_NAMES].drop_duplicates(subset=['session_id'], keep='last')
    df.columns = ['session_id', 'act_item_pre_1']
    df = df_feat.merge(df, on='session_id', how='left')
    df_sim = loader.load_df('../../../feat/m1_similarity_all.ftr')
    # Rename similarity columns to carry the 'act_item_pre_1-' prefix.
    df_sim.columns = ['impressions', 'act_item_pre_1'] + \
        ['act_item_pre_1-{}'.format(c) for c in df_sim.columns.tolist()[2:]]
    df_feat = df.merge(df_sim, \
        on=['impressions', 'act_item_pre_1'], how='left')
    print(df_feat.shape)
    print(df_feat.head())
    print(df_feat.columns.tolist())
    return df_feat
def feat_extract(df):
    """Histogram of pre-clickout action types per (session_id, impressions).

    Returns ID_NAMES plus one 'hist_<action_type>' column per action value
    (ratio=False — presumably raw counts; verify in custom_cate_encoding).
    """
    tr = loader.load_df('../input/tr.ftr')
    te = loader.load_df('../input/te.ftr')
    df_feat = pd.concat([tr, te])
    df_feat = df_feat[['session_id', 'impressions', 'step']].drop_duplicates()
    actions = ['interaction item image', 'interaction item info', \
        'interaction item deals', 'interaction item rating', \
        'search for item', 'clickout item']
    df = df[df.action_type.isin(actions)]
    df = df[~pd.isnull(df.reference)]
    df = df[['session_id', 'reference', 'step', 'action_type']]
    df.columns = ID_NAMES + ['step', 'action_type']
    df['impressions'] = df['impressions'].astype('int')
    # step_y becomes the session's sampled (final clickout) step.
    df = df.merge(df_feat[['session_id', 'step']].drop_duplicates(), \
        on='session_id', how='left')
    # Filter out the final-click sample: keep only earlier actions.
    df = df[df.step_x < df.step_y]
    df_feat = custom_cate_encoding.gen_hist_feat(df, \
        ID_NAMES, 'action_type', ratio=False)
    print(df_feat.head())
    print(df_feat.columns.tolist())
    # Prefix the generated value columns with 'hist_'.
    df_feat.columns = ID_NAMES + ['hist_{}' \
        .format(val) for val in df_feat.columns.tolist()[2:]]
    print(df_feat.shape)
    print(df_feat.head())
    print(df_feat.columns.tolist())
    return df_feat
def gen_fea(base_tr_path=None, base_te_path=None):
    """Extract features from the combined raw train/test log and join them
    onto the sample rows on (session_id, impressions)."""
    df_train = loader.load_df('../input/train.ftr')
    df_test = loader.load_df('../input/test.ftr')
    df_feat = feat_extract(pd.concat([df_train, df_test]))

    tr_sample = loader.load_df('../feature/tr_s0_0.ftr')
    te_sample = loader.load_df('../feature/te_s0_0.ftr')
    merge_keys = ['session_id', 'impressions']
    tr = tr_sample[ID_NAMES].merge(df_feat, on=merge_keys, how='left')
    te = te_sample[ID_NAMES].merge(df_feat, on=merge_keys, how='left')

    print(tr.shape, te.shape)
    print(tr.head())
    print(te.head())
    print(tr.columns)
    output_fea(tr, te)
def gen_fea(base_tr_path=None, base_te_path=None):
    """Assemble stacking features: one probability column per upstream
    model, concatenated positionally against the s0_37 sample rows."""
    tr_sample = loader.load_df('../../feat/tr_s0_37.ftr')
    te_sample = loader.load_df('../../feat/te_s0_37.ftr')
    # Upstream model prediction files, one prefix per model.
    prefixs = ['m1_cat_03', 'm1_infesent_simple', 'm1_nn_02', \
        'm2_ESIM_001', 'm2_ESIMplus_001', 'lgb_m3_37-0']
    tr_paths = ['{}_tr.ftr'.format(prefix) for prefix in prefixs]
    te_paths = ['final_{}_te.ftr'.format(prefix) for prefix in prefixs]
    tr_paths = ['../../stk_feat/{}'.format(p) for p in tr_paths]
    te_paths = ['../../stk_feat/{}'.format(p) for p in te_paths]
    trs, tes = [], []
    for i, prefix in enumerate(prefixs):
        tr, te = feat_extract(tr_paths[i], te_paths[i], prefix + '_prob')
        trs.append(tr)
        tes.append(te)
    # Positional concat: feat_extract returns Series aligned to the sample
    # row order (its merge resets the index), so axis=1 lines up by row.
    tr = pd.concat([tr_sample[ID_NAMES]] + trs, axis=1)
    te = pd.concat([te_sample[ID_NAMES]] + tes, axis=1)
    # Downcast floats to float32 to shrink the feature files.
    float_cols = [c for c in tr.columns if tr[c].dtype == 'float']
    tr[float_cols] = tr[float_cols].astype('float32')
    te[float_cols] = te[float_cols].astype('float32')
    print(tr.shape, te.shape)
    print(tr.head())
    print(te.head())
    print(tr.columns)
    output_fea(tr, te)
def feat_extract(tr_path, te_path, prefix):
    """Load one upstream model's train/test predictions and return them as
    two Series named `prefix`, aligned to the s0_37 sample row order.

    NOTE(review): 'label' is dropped only from the train frame; the test
    files are assumed to hold exactly ID_NAMES + one prediction column
    (otherwise the te.columns rename below would fail) — verify.
    """
    tr_sample = loader.load_df('../../feat/tr_s0_37.ftr')
    te_sample = loader.load_df('../../feat/te_s0_37.ftr')
    tr = loader.load_df(tr_path)
    te = loader.load_df(te_path)
    del_cols = ['label']
    del_cols = [col for col in tr.columns if col in del_cols]
    tr.drop(del_cols, axis=1, inplace=True)
    # Reorder rows to match the sample frames exactly.
    tr = tr_sample[ID_NAMES].merge(tr, on=ID_NAMES, how='left')
    te = te_sample[ID_NAMES].merge(te, on=ID_NAMES, how='left')
    # Rename the single remaining prediction column to `prefix`.
    tr.columns = ID_NAMES + [prefix]
    te.columns = ID_NAMES + [prefix]
    print(prefix)
    print(tr.shape, te.shape)
    print(tr.head())
    # Return just the prediction Series; the caller concats positionally.
    tr = tr[prefix]
    te = te[prefix]
    return tr, te
def gen_fea(base_tr_path=None, base_te_path=None):
    """Extract features from the combined raw log, cache them to disk, and
    join them onto the sample rows by ID_NAMES."""
    full_log = pd.concat([loader.load_df('../input/train.ftr'),
                          loader.load_df('../input/test.ftr')])
    df_feat = feat_extract(full_log)
    loader.save_df(df_feat, '../feature/df_feat.ftr')

    tr_sample = loader.load_df('../feature/tr_s0_0.ftr')
    te_sample = loader.load_df('../feature/te_s0_0.ftr')
    tr = tr_sample[ID_NAMES].merge(df_feat, on=ID_NAMES, how='left')
    te = te_sample[ID_NAMES].merge(df_feat, on=ID_NAMES, how='left')

    print(tr.shape, te.shape)
    print(tr.head())
    print(te.head())
    print(tr.columns)
    output_fea(tr, te)
def filter_acts_after_last_clk(df):
    """Keep only rows whose step is strictly before the session's sampled
    (final clickout) step."""
    sample_tr = loader.load_df('../feature/tr_s0_0.ftr')
    sample_te = loader.load_df('../feature/te_s0_0.ftr')
    last_steps = pd.concat([sample_tr, sample_te])
    last_steps = last_steps[['session_id', 'step']].drop_duplicates()
    # After the merge, step_x is the action's step, step_y the final one.
    df = df.merge(last_steps, on='session_id', how='left')
    print(df.head(10))
    return df[df.step_x < df.step_y]
def gen_fea(base_tr_path=None, base_te_path=None):
    """Build one feature set per action group ('normal_active' interactions
    vs 'clickout') and join all of them onto the sample rows."""
    tr = loader.load_df('../input/train.ftr')
    te = loader.load_df('../input/test.ftr')
    #tr = loader.load_df('../input/tr.ftr')
    #te = loader.load_df('../input/te.ftr')
    #tr = loader.load_df('../feature/tr_s0_9.ftr')
    #te = loader.load_df('../feature/te_s0_9.ftr')
    #tr = loader.load_df('../feature/tr_fea_s0_1.ftr')
    #te = loader.load_df('../feature/te_fea_s0_1.ftr')
    #tr = tr.head(1000)
    #te = te.head(1000)
    df_base = pd.concat([tr, te])
    tr_sample = loader.load_df('../feature/tr_s0_0.ftr')
    te_sample = loader.load_df('../feature/te_s0_0.ftr')
    # Two action groups: pre-click interactions vs the clickout itself.
    actions = [['interaction item image', 'interaction item info', \
        'interaction item deals', 'interaction item rating', \
        'search for item'], ['clickout item']]
    prefixs = ['normal_active', 'clickout']
    merge_keys = ['session_id', 'impressions']
    trs, tes = [], []
    for i, acts in enumerate(actions):
        df_feat = feat_extract(df_base, acts, prefixs[i])
        cur_tr = tr_sample[ID_NAMES].merge(df_feat, on=merge_keys, how='left')
        cur_te = te_sample[ID_NAMES].merge(df_feat, on=merge_keys, how='left')
        trs.append(cur_tr)
        tes.append(cur_te)
    tr, te = trs[0], tes[0]
    # Positional concat is safe: every frame shares the sample row order.
    for i in range(1, len(trs)):
        tr = pd.concat([tr, trs[i].drop(ID_NAMES, axis=1)], axis=1)
        te = pd.concat([te, tes[i].drop(ID_NAMES, axis=1)], axis=1)
    # Downcast floats to float32 to shrink the feature files.
    float_cols = [c for c in tr.columns if tr[c].dtype == 'float']
    tr[float_cols] = tr[float_cols].astype('float32')
    te[float_cols] = te[float_cols].astype('float32')
    print(tr.shape, te.shape)
    print(tr.head())
    print(te.head())
    print(tr.columns)
    output_fea(tr, te)
def filter_useless_data():
    """Clean the raw train/test logs in place on disk: drop duplicated
    sessions, invalid references, and actions after the last clickout."""
    train = loader.load_df('../input/train.ftr')
    test = loader.load_df('../input/test.ftr')

    train = remove_repeated_session_in_tr(train, test)
    train = remove_invalid_reference(train)
    test = remove_invalid_reference(test)
    train = remove_acts_after_last_clk(train, is_te=False)
    test = remove_acts_after_last_clk(test, is_te=True)

    # Overwrite the original files with the filtered versions.
    loader.save_df(train, '../input/train.ftr')
    loader.save_df(test, '../input/test.ftr')
def gen_fea():
    """Compute per-description features and join them onto the s0_32-50
    train/test samples via description_id."""
    tr = loader.load_df('../../feat/tr_s0_32-50.ftr')
    te = loader.load_df('../../feat/te_s0_32-50.ftr')

    feats_tr = feat_extract(tr[ID_NAMES])
    feats_te = feat_extract(te[ID_NAMES], is_te=True)
    tr = tr[ID_NAMES].merge(feats_tr, on=['description_id'], how='left')
    te = te[ID_NAMES].merge(feats_te, on=['description_id'], how='left')

    print(tr.shape, te.shape)
    print(tr.head())
    print(te.head())
    print(tr.columns)
    output_fea(tr, te)
def gen_sample(ori, des):
    """Explode clickout rows (one output row per impression/price pair)
    and save the result to `des`."""
    df = loader.load_df(ori)
    print(df.shape)
    clickouts = df[df.action_type == 'clickout item']
    print(clickouts.shape)
    exploded = explode(clickouts, ['impressions', 'prices'])
    print(exploded.shape)
    loader.save_df(exploded, des)
def gen_fea(base_tr_path=None, base_te_path=None):
    """Build item-level features (joined on 'impressions' only), then
    derive meta features that also need prices / impr_rank."""
    #tr = loader.load_df('../input/train.ftr')
    #te = loader.load_df('../input/test.ftr')
    tr = loader.load_df('../input/tr.ftr')
    te = loader.load_df('../input/te.ftr')
    #tr = loader.load_df('../feature/tr_s0_9.ftr')
    #te = loader.load_df('../feature/te_s0_9.ftr')
    #tr = loader.load_df('../feature/tr_fea_s0_1.ftr')
    #te = loader.load_df('../feature/te_fea_s0_1.ftr')
    #tr = tr.head(1000)
    #te = te.head(1000)
    df_base = pd.concat([tr, te])
    df_feat = feat_extract(df_base)
    tr_sample = loader.load_df('../feature/tr_s0_0.ftr')
    te_sample = loader.load_df('../feature/te_s0_0.ftr')
    #merge_keys = ['session_id', 'impressions']
    #merge_keys = ['session_id']
    merge_keys = ['impressions']
    # prices / impr_rank are carried only for add_meta_fea, dropped after.
    add_keys = ['prices', 'impr_rank']
    tr = tr_sample[ID_NAMES + add_keys] \
        .merge(df_feat, on=merge_keys, how='left')
    te = te_sample[ID_NAMES + add_keys] \
        .merge(df_feat, on=merge_keys, how='left')
    # add_meta_fea mutates the frames in place (no return value used) —
    # presumably adds derived columns; verify in its definition.
    add_meta_fea(tr)
    add_meta_fea(te)
    tr.drop(add_keys, axis=1, inplace=True)
    te.drop(add_keys, axis=1, inplace=True)
    # Downcast floats to float32 to shrink the feature files.
    float_cols = [c for c in tr.columns if tr[c].dtype == 'float']
    tr[float_cols] = tr[float_cols].astype('float32')
    te[float_cols] = te[float_cols].astype('float32')
    print(tr.shape, te.shape)
    print(tr.head())
    print(te.head())
    print(tr.columns)
    output_fea(tr, te)
def gen_tr_feat():
    """Label each sampled impression row (target = clicked or not), keep
    only every session's last step, and save as the training frame."""
    frame = loader.load_df('../input/sample_train.ftr')
    frame['reference'] = frame['reference'].astype('int')
    # target = 1 where the shown impression equals the clicked reference.
    frame['target'] = (frame['reference'] == frame['impressions']).astype(int)
    frame.drop(['reference', 'action_type'], axis=1, inplace=True)

    # Each session's final step is the sampled clickout.
    last_steps = frame[['session_id', 'step']] \
        .drop_duplicates(subset='session_id', keep='last') \
        .reset_index(drop=True)
    frame = last_steps.merge(frame, on=['session_id', 'step'],
                             how='left').reset_index(drop=True)
    loader.save_df(frame, '../input/tr.ftr')
def process(in_path, k):
    """Load a recall frame, keep the top-k rows per group, and rename the
    similarity score column.

    Args:
        in_path: path to the input frame (loader format).
        k: number of top rows to keep per group (passed to topk_lines).
    Returns:
        DataFrame with 'sim_score' cast to float and renamed to
        'corp_sim_score'.
    """
    # Fix: removed the unused local ID_NAMES, which shadowed the
    # module-level constant and was dead code.
    df = loader.load_df(in_path)
    df = topk_lines(df, k)
    df['sim_score'] = df['sim_score'].astype('float')
    df.rename(columns={'sim_score': 'corp_sim_score'}, inplace=True)
    return df
def sub_convert(df_path, pred_path, out_path1, out_path2):
    """Write the top-3 predicted paper_ids (comma-joined) per
    description_id to out_path1.

    out_path2 is currently unused; kept for call-site compatibility.
    """
    te_data = loader.load_df(df_path)
    df_pred = loader.load_df(pred_path)
    # Rank predictions by target score (descending) within description.
    ranked = df_pred.sort_values(['description_id', 'target'],
                                 ascending=False)
    df_pred = df_pred[['description_id']].drop_duplicates() \
        .merge(ranked, on=['description_id'], how='left')
    df_pred['rank'] = df_pred.groupby('description_id').cumcount().values
    top3 = df_pred[df_pred['rank'] < 3]
    top3 = top3.groupby(['description_id'])['paper_id'] \
        .apply(lambda s: ','.join(s)).reset_index()
    # Left-merge back so every description appears exactly once, in the
    # original test order.
    result = te_data[['description_id']].merge(top3, on=['description_id'],
                                               how='left')
    loader.save_df(result, out_path1)
def sub_convert(df_path, pred_path, out_path):
    """Build the submission CSV: for each (user, session, timestamp, step)
    group, list impressions ordered by predicted target score descending.

    Args:
        df_path: path to the sample frame with user/session/impression rows.
        pred_path: path to predictions, index-aligned with df_path rows.
        out_path: destination CSV path.
    """
    df_data = loader.load_df(df_path)
    df_pred = loader.load_df(pred_path)
    required_cols = [
        'user_id', 'session_id', 'timestamp', 'step', 'impressions'
    ]
    # Fix: .copy() — the original assigned into a column-slice of df_data,
    # which triggers pandas' chained-assignment warning and may silently
    # write to a view instead of a new frame.
    df_sub = df_data[required_cols].copy()
    # Index-aligned assignment: df_pred must share df_data's row index.
    df_sub['target'] = df_pred['target']
    df_sub = df_sub.sort_values(by=['session_id', 'target'], \
        ascending=False).reset_index(drop=True)
    df_sub['item_recommendations'] = df_sub['impressions'].astype(str)
    # groupby preserves the in-group (score-descending) row order.
    df_sub = df_sub.groupby(['user_id', 'session_id', 'timestamp', 'step']) \
        ['item_recommendations'].apply(lambda lst: ' '.join(lst)).reset_index()
    df_sub = df_sub.sort_values(by='session_id').reset_index(drop=True)
    df_sub.to_csv(out_path, float_format='%.4f', index=False)
def cv_convert(pred_path, out_path):
    """Dump CV predictions to CSV, ranked per session by target score."""
    preds = loader.load_df(pred_path)
    preds = preds.sort_values(['session_id', 'target'], ascending=False)
    # 0-based rank within each session, best score first.
    preds['rank'] = preds.groupby('session_id').cumcount()
    preds = preds.sort_values(by=['session_id', 'rank']) \
        .reset_index(drop=True)
    preds.to_csv(out_path, float_format='%.4f', index=False)
def gen_fea(base_tr_path=None, base_te_path=None):
    """Derive features from the item metadata and join them onto the
    sample rows on (session_id, impressions)."""
    meta = loader.load_df('../input/item_metadata.ftr')
    df_feat = feat_extract(meta)

    tr_sample = loader.load_df('../feature/tr_s0_0.ftr')
    te_sample = loader.load_df('../feature/te_s0_0.ftr')
    join_keys = ['session_id', 'impressions']
    tr = tr_sample[ID_NAMES].merge(df_feat, on=join_keys, how='left')
    te = te_sample[ID_NAMES].merge(df_feat, on=join_keys, how='left')

    # Downcast floats to float32 to shrink the saved feature files.
    float_cols = [c for c in tr.columns if tr[c].dtype == 'float']
    tr[float_cols] = tr[float_cols].astype('float32')
    te[float_cols] = te[float_cols].astype('float32')

    print(tr.shape, te.shape)
    print(tr.head())
    print(te.head())
    print(tr.columns)
    output_fea(tr, te)
def merge_sub(file_path, sub_name, fold_num):
    """Average per-fold prediction CSVs into '<sub_name>.ftr'."""
    existing = os.listdir(file_path)
    names = ['{}_{}.csv'.format(sub_name, i)
             for i in range(1, fold_num + 1)]
    print(names)
    df = pd.DataFrame()
    for i, name in enumerate(names):
        assert name in existing, '{} not exist'.format(name)
        full_path = '{}/{}'.format(file_path, name)
        if i == 0:
            # First fold seeds the frame; later folds accumulate target.
            df = loader.load_df(full_path)
        else:
            df[TARGET_NAME] += loader.load_df(full_path)[TARGET_NAME]
    df[TARGET_NAME] /= fold_num
    print(df.head())
    print(df.describe())
    out_path = '{}/{}.ftr'.format(file_path, sub_name)
    loader.save_df(df, out_path)
def gen_samples(paper, tr_desc_path, tr_recall_path):
    """Build samples: join paper metadata and query text onto the recall
    candidates, run feature extraction, and drop non-numeric columns.

    Fix: the debugging truncation `tr = tr.head(1000)` was left active,
    silently limiting the sample set to 1000 rows; the sibling
    gen_samples keeps the same line commented out. Disabled here.

    Args:
        paper: paper metadata frame keyed by 'paper_id'.
        tr_desc_path: path to the description frame (quer_key/quer_all).
        tr_recall_path: path to the recall candidate frame.
    Returns:
        The merged frame with object-dtype (text) columns dropped.
    """
    tr_desc = loader.load_df(tr_desc_path)
    tr = loader.load_df(tr_recall_path)
    # tr = tr.head(1000)  # debug-only truncation; keep disabled
    tr = tr.merge(paper, on=['paper_id'], how='left')
    tr = tr.merge(tr_desc[['description_id', 'quer_key', 'quer_all']],
                  on=['description_id'], how='left')
    print(tr.columns)
    print(tr.head())
    tr = multi_process_feat(tr)
    # Drop remaining object-dtype (text) columns so output is numeric.
    del_cols = [
        col for col in tr.columns
        if tr[col].dtype == 'O' and col not in ID_NAMES
    ]
    print('tr del cols', del_cols)
    return tr.drop(del_cols, axis=1)
def feat_extract(df):
    """Integer-encode the action_type of the step immediately before each
    session's sampled step as feature 'act_pre1'."""
    prev = pd.concat([loader.load_df('../input/tr.ftr'),
                      loader.load_df('../input/te.ftr')])
    prev = prev[['session_id', 'step']].drop_duplicates()
    # Look one step back from the sampled step.
    prev['step'] -= 1
    acts = df[['session_id', 'step', 'action_type', 'reference']] \
        .drop_duplicates(subset=['session_id', 'step'])
    prev = prev.merge(acts, on=['session_id', 'step'], how='left')
    print(prev.head())
    # factorize(sort=True) maps action strings to stable ints, NaN -> -1.
    prev['act_pre1'] = pd.factorize(prev['action_type'], sort=True)[0]
    prev = prev[['session_id', 'act_pre1']]
    print(prev.shape)
    print(prev.head())
    print(prev.columns.tolist())
    return prev
def feat_extract(df):
    """Session-level price statistics (max/min/median/std) over items the
    user interacted with before the final clickout ('v2' variant).

    Returns ['session_id'] + 'active_items-*-v2' columns.
    """
    tr = loader.load_df('../input/tr.ftr')
    te = loader.load_df('../input/te.ftr')
    df_sample = pd.concat([tr, te])
    actions = ['interaction item image', 'interaction item info', \
        'interaction item deals', 'interaction item rating', \
        'search for item', 'clickout item']
    df = df[df.action_type.isin(actions)]
    df = df[~pd.isnull(df.reference)]
    df = df[['session_id', 'reference', 'step']]
    df.columns = ID_NAMES + ['step']
    df['impressions'] = df['impressions'].astype('int')
    # step_y becomes the session's sampled (final clickout) step.
    df = df.merge(df_sample[['session_id', 'step']].drop_duplicates(), \
        on='session_id', how='left')
    # Filter out the final-click sample: keep only earlier actions.
    df = df[df.step_x < df.step_y]
    # The full (unfiltered) sample files carry each impression's price.
    tr = loader.load_df('../input/sample_train.ftr')
    te = loader.load_df('../input/sample_test.ftr')
    df_sample_all = pd.concat([tr, te])
    df_feat = df[ID_NAMES + ['step_x']].drop_duplicates(subset=ID_NAMES) \
        .merge(df_sample_all[ID_NAMES + ['prices']], on=ID_NAMES, how='left')
    print(df_feat.head())
    print('filter', df_feat.shape)
    # Keep only pairs that actually had a prior interaction.
    sub_df = df_feat[~pd.isnull(df_feat['step_x'])]
    print('filter', sub_df.shape)
    df_feat = df_sample[['session_id']].drop_duplicates()
    # Aggregate interacted-item prices per session.
    df_feat = cate_encoding.cate_num_stat(sub_df, df_feat, ['session_id'], \
        'prices', ['max', 'min', 'median', 'std'])
    df_feat.columns = ['session_id'] + \
        ['active_items-{}-v2'.format(c) for c in df_feat.columns.tolist()[1:]]
    print(df_feat.shape)
    print(df_feat.head())
    print(df_feat.columns.tolist())
    return df_feat
def gen_fea(base_tr_path=None, base_te_path=None):
    """Build session-level features from the raw logs, cache them, and
    attach them to each sample row via session_id."""
    full_log = pd.concat([loader.load_df('../input/train.ftr'),
                          loader.load_df('../input/test.ftr')])
    df_feat = feat_extract(full_log)
    loader.save_df(df_feat, '../feature/df_feat.ftr')

    tr_sample = loader.load_df('../feature/tr_s0_0.ftr')
    te_sample = loader.load_df('../feature/te_s0_0.ftr')
    tr = tr_sample[ID_NAMES].merge(df_feat, on='session_id', how='left')
    te = te_sample[ID_NAMES].merge(df_feat, on='session_id', how='left')

    print(tr.shape, te.shape)
    print(tr.head())
    print(te.head())
    print(tr.columns)
    output_fea(tr, te)
def feat_extract(df):
    """Impression-list rank of each session's most recently interacted
    item ('lastest_item-impr_rank').

    Returns ['session_id', 'lastest_item-impr_rank'].
    """
    tr = loader.load_df('../input/tr.ftr')
    te = loader.load_df('../input/te.ftr')
    df_sample = pd.concat([tr, te])
    # Position of each impression within its session's list (0-based).
    df_sample['impr_rank'] = df_sample.groupby(['session_id'
                                                ]).cumcount().values
    actions = ['interaction item image', 'interaction item info', \
        'interaction item deals', 'interaction item rating', \
        'search for item', 'clickout item']
    df = df[df.action_type.isin(actions)]
    df = df[~pd.isnull(df.reference)]
    df = df[['session_id', 'reference', 'step']]
    df.columns = ID_NAMES + ['step']
    df['impressions'] = df['impressions'].astype('int')
    # step_y becomes the session's sampled (final clickout) step.
    df = df.merge(df_sample[['session_id', 'step']].drop_duplicates(), \
        on='session_id', how='left')
    # Filter out the final-click sample: keep only earlier actions.
    df = df[df.step_x < df.step_y]
    # Keep only the latest pre-click action per session.
    df = df.drop_duplicates(subset=['session_id'], keep='last')
    df_feat = df_sample[ID_NAMES + ['impr_rank']] \
        .merge(df[ID_NAMES + ['step_x']] \
        .drop_duplicates(subset=ID_NAMES), on=ID_NAMES, how='left')
    print(df_feat.head())
    print('filter', df_feat.shape)
    # Non-null step_x marks the row of the last-interacted item.
    sub_df = df_feat[~pd.isnull(df_feat['step_x'])]
    sub_df = sub_df[['session_id', 'impr_rank']]
    sub_df.columns = ['session_id', 'lastest_item-impr_rank']
    print('filter', sub_df.shape)
    df_feat = df_sample[['session_id']].drop_duplicates()
    df_feat = df_feat.merge(sub_df, on='session_id', how='left')
    print(df_feat.shape)
    print(df_feat.head())
    print(df_feat.columns.tolist())
    return df_feat
def feat_extract(df):
    """Hotel-star features per (session, impression): raw star rating,
    gaps to the session's max/median star, and a star-per-price rank.

    `df` is the item metadata frame (item_id, properties).
    """
    tr = loader.load_df('../input/tr.ftr')
    te = loader.load_df('../input/te.ftr')
    df_sample = pd.concat([tr, te])
    # Pull the 'N Star' entry from the pipe-separated property list;
    # the 'Stars' exclusion skips multi-star range entries.
    df['star'] = df.properties.apply(lambda s : \
        ''.join([v for v in s.split('|') if 'Star' in v and 'Stars' not in v]))
    df['star'] = df['star'].apply(lambda s: s.split(' ')[0])
    # Items with no star property default to 0.
    df['star'].replace('', 0, inplace=True)
    print(df['star'].value_counts())
    df_star = df[['item_id', 'star']].drop_duplicates(subset=['item_id'])
    df_star.columns = ['impressions', 'star']
    df_star['star'] = df_star['star'].astype('int')
    df_feat = df_sample[ID_NAMES + ['prices']].drop_duplicates(subset=ID_NAMES) \
        .merge(df_star, on='impressions', how='left')
    # Session-level star aggregates for the relative features below.
    df_feat = cate_encoding.cate_num_stat(df_feat, df_feat, \
        ['session_id'], 'star', ['max', 'median', 'std'])
    df_feat['star_sub_session_max'] = \
        df_feat['star'] - df_feat['session_id_by_star_max']
    df_feat['star_sub_session_median'] = \
        df_feat['star'] - df_feat['session_id_by_star_median']
    # Stars per unit price (x100), then ranked within the session.
    df_feat['star_div_prices'] = df_feat['star'] * 100 / df_feat['prices']
    print(df_feat['star_div_prices'].describe())
    df_feat = cate_encoding.cate_num_rank(df_feat, \
        ['session_id'], 'star_div_prices', ascending=False)
    # prices was only needed to compute star_div_prices.
    del df_feat['prices']
    print('df_feat info')
    print(df_feat.shape)
    print(df_feat.head())
    print(df_feat.columns.tolist())
    return df_feat
def feat_extract(df):
    """Time gap between each session's sampled step and the event before
    it ('ts_sub_prev').

    NOTE(review): timestamp units are not visible here — presumably epoch
    seconds; verify upstream.
    """
    tr = loader.load_df('../input/tr.ftr')
    te = loader.load_df('../input/te.ftr')
    df_sample = pd.concat([tr, te])
    # Per-session lag of timestamp -> delta to the previous action.
    df['ts_prev'] = df.groupby('session_id')['timestamp'].shift(1)
    df['ts_sub_prev'] = df['timestamp'] - df['ts_prev']
    df = df[['session_id', 'step', 'ts_sub_prev']]
    df_feat = df_sample[['session_id', 'step']].drop_duplicates() \
        .merge(df, on=['session_id', 'step'], how='left')
    df_feat = df_feat[['session_id', 'ts_sub_prev']]
    df_feat = df_feat.drop_duplicates(subset=['session_id'])
    print(df_feat['ts_sub_prev'].describe())
    # Negative deltas can occur (out-of-order logs); currently kept as-is
    # (the clamp below was tried and deliberately disabled).
    print(df_feat[df_feat['ts_sub_prev'] < 0].shape)
    #df_feat['ts_sub_prev'] = df_feat['ts_sub_prev'].apply(lambda x : max(x, 0))
    #print (df_feat['ts_sub_prev'].describe())
    print(df_feat.shape)
    print(df_feat.head())
    print(df_feat.columns.tolist())
    return df_feat
def feat_extract(df):
    """For each (session, impression): step of its most recent prior
    interaction ('nearest_step') and the distance from that step to the
    final clickout step ('nearest_step_delta')."""
    tr = loader.load_df('../input/tr.ftr')
    te = loader.load_df('../input/te.ftr')
    df_feat = pd.concat([tr, te])
    df_feat = df_feat[['session_id', 'impressions', 'step']].drop_duplicates()
    actions = ['interaction item image', 'interaction item info', \
        'interaction item deals', 'interaction item rating', \
        'search for item', 'clickout item']
    df = df[df.action_type.isin(actions)]
    df = df[~pd.isnull(df.reference)]
    df = df[['session_id', 'reference', 'step']]
    df.columns = ID_NAMES + ['step']
    df['impressions'] = df['impressions'].astype('int')
    # step_y becomes the session's sampled (final clickout) step.
    df = df.merge(df_feat[['session_id', 'step']].drop_duplicates(), \
        on='session_id', how='left')
    # Filter out the final-click sample and keep each item's latest
    # remaining action.
    df = df[df.step_x < df.step_y]
    df.drop_duplicates(subset=ID_NAMES, keep='last', inplace=True)
    del df_feat['step']
    df_feat = df_feat.merge(df, on=ID_NAMES, how='left')
    print('filter', df_feat.shape)
    # step_x = latest interaction step; step_y = final clickout step.
    df_feat['nearest_step'] = df_feat['step_x']
    df_feat['nearest_step_delta'] = df_feat['step_y'] - df_feat['step_x']
    print(df_feat.head())
    print(df_feat[['nearest_step', 'nearest_step_delta']].describe())
    df_feat = df_feat[ID_NAMES + ['nearest_step', 'nearest_step_delta']]
    print(df_feat.shape)
    print(df_feat.head())
    print(df_feat.columns.tolist())
    return df_feat
def merge_val(file_path, sub_name, fold_num):
    """Stack per-fold CV prediction CSVs into one '<sub_name>_cv.ftr'."""
    existing = os.listdir(file_path)
    names = ['{}_cv_{}.csv'.format(sub_name, i)
             for i in range(1, fold_num + 1)]
    print(names)
    frames = []
    for name in names:
        assert name in existing, '{} not exist'.format(name)
        frames.append(loader.load_df('{}/{}'.format(file_path, name)))
    df = pd.concat(frames)
    print(df.head())
    print(df.describe())
    out_path = '{}/{}_cv.ftr'.format(file_path, sub_name)
    loader.save_df(df, out_path)