Exemplo n.º 1
0
def gen_samples(paper, tr_desc_path, tr_recall_path):
    """Build the training feature frame for recalled (description, paper) pairs.

    Args:
        paper: DataFrame of paper metadata keyed by 'paper_id'.
        tr_desc_path: path to the description frame (must contain
            'description_id', 'quer_key', 'quer_all').
        tr_recall_path: path to the recall candidate frame keyed by ID_NAMES.

    Returns:
        DataFrame: recall candidates joined with keyword features; all
        remaining object-dtype columns (except the ID columns) are dropped.
    """
    tr_desc = loader.load_df(tr_desc_path)
    tr = loader.load_df(tr_recall_path)
    #     tr = tr.head(1000)
    print(tr.head())

    # Attach paper metadata and the description's query text to each pair.
    tr_feat = tr[ID_NAMES].merge(paper, on=['paper_id'], how='left')
    tr_feat = tr_feat.merge(tr_desc[['description_id', 'quer_key',
                                     'quer_all']],
                            on=['description_id'],
                            how='left')

    tr_feat = multi_process_feat(tr_feat)
    # Keep only keyword-derived ('keyw') feature columns plus the ID columns.
    del_cols = [
        col for col in tr_feat.columns
        if 'keyw' not in col and col not in ID_NAMES
    ]
    tr_feat.drop(del_cols, axis=1, inplace=True)
    print(tr_feat.head())

    tr = tr.merge(tr_feat, on=ID_NAMES, how='left')
    print(tr.head())
    print(len(tr.columns), tr.columns.tolist())

    # Drop non-numeric (object dtype) columns so the output is model-ready.
    del_cols = [
        col for col in tr.columns
        if tr[col].dtype == 'O' and col not in ID_NAMES
    ]
    print('tr del cols', del_cols)
    return tr.drop(del_cols, axis=1)
Exemplo n.º 2
0
def feat_extract(df):
    """Per-impression activity ratio: fraction of samples where the item
    was interacted with before the session's final clickout.

    Args:
        df: raw event log with 'action_type', 'reference', 'session_id',
            'step' columns.

    Returns:
        DataFrame keyed by ID_NAMES with 'impressions_active_ratio'.
    """
    tr = loader.load_df('../input/tr.ftr')
    te = loader.load_df('../input/te.ftr')
    df_sample = pd.concat([tr, te])

    # Item-level events that carry an item id in 'reference'.
    actions = ['interaction item image', 'interaction item info', \
            'interaction item deals', 'interaction item rating', \
            'search for item', 'clickout item']
    df = df[df.action_type.isin(actions)]
    df = df[~pd.isnull(df.reference)]
    df = df[['session_id', 'reference', 'step']]
    # NOTE(review): assumes ID_NAMES == ['session_id', 'impressions'] — verify.
    df.columns = ID_NAMES + ['step']
    df['impressions'] = df['impressions'].astype('int')
    df = df.merge(df_sample[['session_id', 'step']].drop_duplicates(), \
            on='session_id', how='left')
    # Filter out the last-click sample (keep only actions before it).
    df = df[df.step_x < df.step_y]

    df_feat = df_sample[ID_NAMES] \
            .merge(df[ID_NAMES + ['step_x']].drop_duplicates(subset=ID_NAMES), \
            on=ID_NAMES, how='left')

    # 'actived' acts as a 0/1 flag: step_x is NaN (-> 0) when the item had
    # no prior interaction, else clipped to 1 (assumes steps >= 1 — confirm).
    df_feat['actived'] = df_feat['step_x'].fillna(0)
    df_feat['actived'] = list(df_feat['actived'].apply(lambda x: min(x, 1)))
    print(df_feat['actived'].describe())
    # Mean activity per impression item across all samples.
    df_feat['impressions_active_ratio'] = \
            df_feat.groupby('impressions')['actived'].transform('mean')
    df_feat = df_feat[ID_NAMES + ['impressions_active_ratio']]

    print(df_feat.shape)
    print(df_feat.head())
    print(df_feat.columns.tolist())

    return df_feat
Exemplo n.º 3
0
def feat_extract(df):
    """Similarity features between each impression and the session's most
    recently interacted item before the final clickout.

    Returns:
        DataFrame keyed by ID_NAMES plus 'act_item_pre_1-*' similarity
        columns loaded from m1_similarity_all.ftr.
    """
    tr = loader.load_df('../input/tr.ftr')
    te = loader.load_df('../input/te.ftr')
    df_sample = pd.concat([tr, te])
    df_feat = df_sample[ID_NAMES].drop_duplicates()

    # Item-level events that carry an item id in 'reference'.
    actions = ['interaction item image', 'interaction item info', \
            'interaction item deals', 'interaction item rating', \
            'search for item', 'clickout item']
    df = df[df.action_type.isin(actions)]
    df = df[~pd.isnull(df.reference)]
    df = df[['session_id', 'reference', 'step']]
    df.columns = ID_NAMES + ['step']
    df['impressions'] = df['impressions'].astype('int')
    df = df.merge(df_sample[['session_id', 'step']].drop_duplicates(), \
            on='session_id', how='left')
    # Filter out the last-click sample (keep only actions before it).
    df = df[df.step_x < df.step_y]
    # Most recently interacted item per session becomes 'act_item_pre_1'.
    df = df[ID_NAMES].drop_duplicates(subset=['session_id'], keep='last')
    df.columns = ['session_id', 'act_item_pre_1']
    df = df_feat.merge(df, on='session_id', how='left')

    # Precomputed pairwise item similarities; first two columns form the
    # item-id pair, the rest are similarity measures (renamed with prefix).
    df_sim = loader.load_df('../../../feat/m1_similarity_all.ftr')
    df_sim.columns = ['impressions', 'act_item_pre_1'] + \
            ['act_item_pre_1-{}'.format(c) for c in df_sim.columns.tolist()[2:]]
    df_feat = df.merge(df_sim, \
            on=['impressions', 'act_item_pre_1'], how='left')

    print(df_feat.shape)
    print(df_feat.head())
    print(df_feat.columns.tolist())

    return df_feat
Exemplo n.º 4
0
def feat_extract(df):
    """Count historical action types per (session, item) before the final
    clickout, via custom_cate_encoding.gen_hist_feat.

    Returns:
        DataFrame keyed by ID_NAMES with 'hist_<action_type>' count columns.
    """
    tr = loader.load_df('../input/tr.ftr')
    te = loader.load_df('../input/te.ftr')
    df_feat = pd.concat([tr, te])
    df_feat = df_feat[['session_id', 'impressions', 'step']].drop_duplicates()

    # Item-level events that carry an item id in 'reference'.
    actions = ['interaction item image', 'interaction item info', \
            'interaction item deals', 'interaction item rating', \
            'search for item', 'clickout item']
    df = df[df.action_type.isin(actions)]
    df = df[~pd.isnull(df.reference)]
    df = df[['session_id', 'reference', 'step', 'action_type']]
    df.columns = ID_NAMES + ['step', 'action_type']
    df['impressions'] = df['impressions'].astype('int')
    df = df.merge(df_feat[['session_id', 'step']].drop_duplicates(), \
            on='session_id', how='left')
    # Filter out the last-click sample (keep only actions before it).
    df = df[df.step_x < df.step_y]

    # Pivot 'action_type' into per-value historical count columns.
    df_feat = custom_cate_encoding.gen_hist_feat(df, \
            ID_NAMES, 'action_type', ratio=False)
    print(df_feat.head())
    print(df_feat.columns.tolist())
    df_feat.columns = ID_NAMES + ['hist_{}' \
            .format(val) for val in df_feat.columns.tolist()[2:]]

    print(df_feat.shape)
    print(df_feat.head())
    print(df_feat.columns.tolist())

    return df_feat
Exemplo n.º 5
0
def gen_fea(base_tr_path=None, base_te_path=None):
    """Extract features from the combined raw train/test logs, join them
    onto the sampled frames, and persist the result via output_fea."""
    df_base = pd.concat([loader.load_df('../input/train.ftr'),
                         loader.load_df('../input/test.ftr')])
    df_feat = feat_extract(df_base)

    tr_sample = loader.load_df('../feature/tr_s0_0.ftr')
    te_sample = loader.load_df('../feature/te_s0_0.ftr')

    # Join the extracted features back onto each sample frame.
    merge_keys = ['session_id', 'impressions']
    tr = tr_sample[ID_NAMES].merge(df_feat, on=merge_keys, how='left')
    te = te_sample[ID_NAMES].merge(df_feat, on=merge_keys, how='left')

    print(tr.shape, te.shape)
    print(tr.head())
    print(te.head())
    print(tr.columns)

    output_fea(tr, te)
def gen_fea(base_tr_path=None, base_te_path=None):
    """Assemble first-level model probabilities into a stacking feature set.

    Loads one (train, test) prediction pair per model prefix via
    feat_extract, concatenates the probability columns next to the sample
    ID columns, downcasts floats, and writes the result via output_fea.
    """

    tr_sample = loader.load_df('../../feat/tr_s0_37.ftr')
    te_sample = loader.load_df('../../feat/te_s0_37.ftr')

    # One first-level model per prefix; each contributes one column.
    prefixs = ['m1_cat_03', 'm1_infesent_simple', 'm1_nn_02', \
               'm2_ESIM_001', 'm2_ESIMplus_001', 'lgb_m3_37-0']

    tr_paths = ['{}_tr.ftr'.format(prefix) for prefix in prefixs]
    # NOTE(review): test files carry a 'final_' prefix unlike train —
    # confirm this matches the files the first-level models actually emit.
    te_paths = ['final_{}_te.ftr'.format(prefix) for prefix in prefixs]

    tr_paths = ['../../stk_feat/{}'.format(p) for p in tr_paths]
    te_paths = ['../../stk_feat/{}'.format(p) for p in te_paths]

    trs, tes = [], []
    for i, prefix in enumerate(prefixs):
        tr, te = feat_extract(tr_paths[i], te_paths[i], prefix + '_prob')
        trs.append(tr)
        tes.append(te)
    # Column-wise concat assumes feat_extract preserved sample row order.
    tr = pd.concat([tr_sample[ID_NAMES]] + trs, axis=1)
    te = pd.concat([te_sample[ID_NAMES]] + tes, axis=1)

    # Downcast float64 columns to float32 to save memory.
    float_cols = [c for c in tr.columns if tr[c].dtype == 'float']
    tr[float_cols] = tr[float_cols].astype('float32')
    te[float_cols] = te[float_cols].astype('float32')

    print(tr.shape, te.shape)
    print(tr.head())
    print(te.head())
    print(tr.columns)

    output_fea(tr, te)
def feat_extract(tr_path, te_path, prefix):
    """Load one stacking-feature pair, align it to the sample frames, and
    return the single renamed probability column for train and test."""
    tr_sample = loader.load_df('../../feat/tr_s0_37.ftr')
    te_sample = loader.load_df('../../feat/te_s0_37.ftr')

    tr = loader.load_df(tr_path)
    te = loader.load_df(te_path)

    # The train file may carry a 'label' column; strip it when present.
    if 'label' in tr.columns:
        tr.drop(['label'], axis=1, inplace=True)

    # Align each frame to the canonical sample row order via a left join.
    tr = tr_sample[ID_NAMES].merge(tr, on=ID_NAMES, how='left')
    te = te_sample[ID_NAMES].merge(te, on=ID_NAMES, how='left')

    # Each frame now holds exactly one feature column; name it by prefix.
    tr.columns = ID_NAMES + [prefix]
    te.columns = ID_NAMES + [prefix]

    print(prefix)
    print(tr.shape, te.shape)
    print(tr.head())

    return tr[prefix], te[prefix]
Exemplo n.º 8
0
def gen_fea(base_tr_path=None, base_te_path=None):
    """Extract features from the full train+test logs, cache them on disk,
    join onto the sample frames, and persist via output_fea."""
    df_base = pd.concat([loader.load_df('../input/train.ftr'),
                         loader.load_df('../input/test.ftr')])

    #df_base = filter_acts_after_last_clk(df_base)

    df_feat = feat_extract(df_base)
    # Cache the extracted features for reuse.
    loader.save_df(df_feat, '../feature/df_feat.ftr')

    tr_sample = loader.load_df('../feature/tr_s0_0.ftr')
    te_sample = loader.load_df('../feature/te_s0_0.ftr')

    tr = tr_sample[ID_NAMES].merge(df_feat, on=ID_NAMES, how='left')
    te = te_sample[ID_NAMES].merge(df_feat, on=ID_NAMES, how='left')

    print(tr.shape, te.shape)
    print(tr.head())
    print(te.head())
    print(tr.columns)

    output_fea(tr, te)
Exemplo n.º 9
0
def filter_acts_after_last_clk(df):
    """Keep only actions that happened before each session's sampled step.

    Joins each row with the session's sampled step (becomes 'step_y') and
    retains rows whose own step ('step_x') precedes it.
    """
    samples = pd.concat([loader.load_df('../feature/tr_s0_0.ftr'),
                         loader.load_df('../feature/te_s0_0.ftr')])
    last_steps = samples[['session_id', 'step']].drop_duplicates()

    df = df.merge(last_steps, on='session_id', how='left')
    print(df.head(10))
    return df[df.step_x < df.step_y]
Exemplo n.º 10
0
def gen_fea(base_tr_path=None, base_te_path=None):
    """Build per-(session, impression) features for two action groups —
    non-click interactions and clickouts — and join both feature sets
    side-by-side onto the sampled frames before writing via output_fea.
    """

    tr = loader.load_df('../input/train.ftr')
    te = loader.load_df('../input/test.ftr')

    #tr = loader.load_df('../input/tr.ftr')
    #te = loader.load_df('../input/te.ftr')

    #tr = loader.load_df('../feature/tr_s0_9.ftr')
    #te = loader.load_df('../feature/te_s0_9.ftr')

    #tr = loader.load_df('../feature/tr_fea_s0_1.ftr')
    #te = loader.load_df('../feature/te_fea_s0_1.ftr')

    #tr = tr.head(1000)
    #te = te.head(1000)

    df_base = pd.concat([tr, te])

    tr_sample = loader.load_df('../feature/tr_s0_0.ftr')
    te_sample = loader.load_df('../feature/te_s0_0.ftr')

    # Two action groups, each producing features under its own prefix.
    actions = [['interaction item image', 'interaction item info', \
            'interaction item deals', 'interaction item rating', \
            'search for item'], ['clickout item']]
    prefixs = ['normal_active', 'clickout']
    merge_keys = ['session_id', 'impressions']

    trs, tes = [], []
    for i, acts in enumerate(actions):
        df_feat = feat_extract(df_base, acts, prefixs[i])
        cur_tr = tr_sample[ID_NAMES].merge(df_feat, on=merge_keys, how='left')
        cur_te = te_sample[ID_NAMES].merge(df_feat, on=merge_keys, how='left')
        trs.append(cur_tr)
        tes.append(cur_te)

    # Column-wise concat; later frames drop their duplicate ID columns.
    tr, te = trs[0], tes[0]
    for i in range(1, len(trs)):
        tr = pd.concat([tr, trs[i].drop(ID_NAMES, axis=1)], axis=1)
        te = pd.concat([te, tes[i].drop(ID_NAMES, axis=1)], axis=1)

    # Downcast float64 columns to float32 to save memory.
    float_cols = [c for c in tr.columns if tr[c].dtype == 'float']
    tr[float_cols] = tr[float_cols].astype('float32')
    te[float_cols] = te[float_cols].astype('float32')

    print(tr.shape, te.shape)
    print(tr.head())
    print(te.head())
    print(tr.columns)

    output_fea(tr, te)
def filter_useless_data():
    """Clean the raw train/test logs in place on disk.

    Drops train sessions repeated in test, invalid references in both
    splits, and actions occurring after each session's last clickout, then
    overwrites the original files.
    """
    tr = loader.load_df('../input/train.ftr')
    te = loader.load_df('../input/test.ftr')

    # Sessions present in test must not also appear in train.
    tr = remove_repeated_session_in_tr(tr, te)

    tr, te = remove_invalid_reference(tr), remove_invalid_reference(te)

    tr = remove_acts_after_last_clk(tr, is_te=False)
    te = remove_acts_after_last_clk(te, is_te=True)

    loader.save_df(tr, '../input/train.ftr')
    loader.save_df(te, '../input/test.ftr')
def gen_fea():
    """Build description-level features for both splits, join them back
    onto the sample frames by description_id, and persist via output_fea."""
    base_tr = loader.load_df('../../feat/tr_s0_32-50.ftr')
    base_te = loader.load_df('../../feat/te_s0_32-50.ftr')

    # The test split takes a flag so the extractor can branch.
    tr_feat = feat_extract(base_tr[ID_NAMES])
    te_feat = feat_extract(base_te[ID_NAMES], is_te=True)

    tr = base_tr[ID_NAMES].merge(tr_feat, on=['description_id'], how='left')
    te = base_te[ID_NAMES].merge(te_feat, on=['description_id'], how='left')

    print(tr.shape, te.shape)
    print(tr.head())
    print(te.head())
    print(tr.columns)

    output_fea(tr, te)
Exemplo n.º 13
0
def gen_sample(ori, des):
    """Explode each clickout event into one row per (impression, price)
    candidate and save the result.

    Args:
        ori: path of the source event log.
        des: destination path for the exploded frame.
    """
    df = loader.load_df(ori)
    print(df.shape)
    # Only clickout rows carry the impression list to expand.
    df = df[df.action_type == 'clickout item']
    print(df.shape)
    exploded = explode(df, ['impressions', 'prices'])
    print(exploded.shape)
    loader.save_df(exploded, des)
Exemplo n.º 14
0
def gen_fea(base_tr_path=None, base_te_path=None):
    """Build item-level features, join them onto the samples by impression
    id, derive meta features that require 'prices'/'impr_rank', then drop
    those helper columns and write via output_fea.
    """

    #tr = loader.load_df('../input/train.ftr')
    #te = loader.load_df('../input/test.ftr')

    tr = loader.load_df('../input/tr.ftr')
    te = loader.load_df('../input/te.ftr')

    #tr = loader.load_df('../feature/tr_s0_9.ftr')
    #te = loader.load_df('../feature/te_s0_9.ftr')

    #tr = loader.load_df('../feature/tr_fea_s0_1.ftr')
    #te = loader.load_df('../feature/te_fea_s0_1.ftr')

    #tr = tr.head(1000)
    #te = te.head(1000)

    df_base = pd.concat([tr, te])
    df_feat = feat_extract(df_base)

    tr_sample = loader.load_df('../feature/tr_s0_0.ftr')
    te_sample = loader.load_df('../feature/te_s0_0.ftr')

    #merge_keys = ['session_id', 'impressions']
    #merge_keys = ['session_id']
    # Item-level features: join on impression id only.
    merge_keys = ['impressions']
    # Helper columns needed by add_meta_fea; dropped again after use.
    add_keys = ['prices', 'impr_rank']
    tr = tr_sample[ID_NAMES + add_keys] \
            .merge(df_feat, on=merge_keys, how='left')
    te = te_sample[ID_NAMES + add_keys] \
            .merge(df_feat, on=merge_keys, how='left')
    # add_meta_fea mutates the frames in place (no return value used).
    add_meta_fea(tr)
    add_meta_fea(te)
    tr.drop(add_keys, axis=1, inplace=True)
    te.drop(add_keys, axis=1, inplace=True)

    # Downcast float64 columns to float32 to save memory.
    float_cols = [c for c in tr.columns if tr[c].dtype == 'float']
    tr[float_cols] = tr[float_cols].astype('float32')
    te[float_cols] = te[float_cols].astype('float32')

    print(tr.shape, te.shape)
    print(tr.head())
    print(te.head())
    print(tr.columns)

    output_fea(tr, te)
Exemplo n.º 15
0
def gen_tr_feat():
    """Build the training frame: label each impression row with whether it
    was the clicked item, keep only each session's last step, save tr.ftr."""
    df = loader.load_df('../input/sample_train.ftr')

    # The clicked item id lives in 'reference'; mark matching impressions.
    df['reference'] = df['reference'].astype('int')
    df['target'] = (df['reference'] == df['impressions']).astype(int)
    df.drop(['reference', 'action_type'], axis=1, inplace=True)

    # Restrict to the final step of every session.
    last_steps = df[['session_id', 'step']] \
            .drop_duplicates(subset='session_id', keep='last') \
            .reset_index(drop=True)
    df = last_steps.merge(df, on=['session_id', 'step'], how='left') \
            .reset_index(drop=True)

    loader.save_df(df, '../input/tr.ftr')
Exemplo n.º 16
0
def process(in_path, k):
    """Load a recall frame, keep the top-k lines via topk_lines, and
    namespace the similarity score for downstream merging.

    Args:
        in_path: path of the recall frame to load.
        k: top-k cutoff passed through to topk_lines.

    Returns:
        DataFrame with 'sim_score' cast to float and renamed to
        'corp_sim_score'.
    """
    # Fix: the original re-declared a local ID_NAMES that shadowed the
    # module-level constant and was never used — removed as dead code.
    df = loader.load_df(in_path)
    df = topk_lines(df, k)
    # Ensure a numeric score, then tag the column with its recall source.
    df['sim_score'] = df['sim_score'].astype('float')
    df.rename(columns={'sim_score': 'corp_sim_score'}, inplace=True)
    return df
Exemplo n.º 17
0
def sub_convert(df_path, pred_path, out_path1, out_path2):
    """Convert ranked predictions into a submission of the top-3 paper ids
    per description.

    Args:
        df_path: path of the test frame providing description order.
        pred_path: path of the per-pair prediction frame with 'target'.
        out_path1: destination for the submission frame.
        out_path2: unused here — kept for call-site compatibility.
    """
    te_data = loader.load_df(df_path)
    df_pred = loader.load_df(pred_path)

    # Both keys sort descending; the merge below regroups rows by
    # description, leaving targets descending within each group.
    sort_df_pred = df_pred.sort_values(['description_id', 'target'],
                                       ascending=False)
    df_pred = df_pred[['description_id']].drop_duplicates() \
            .merge(sort_df_pred, on=['description_id'], how='left')
    # Keep the top-3 ranked papers per description.
    df_pred['rank'] = df_pred.groupby('description_id').cumcount().values
    df_pred = df_pred[df_pred['rank'] < 3]
    df_pred = df_pred.groupby(['description_id'])['paper_id'] \
            .apply(lambda s : ','.join((s))).reset_index()

    # Re-align to the original test description order (left join keeps all).
    df_pred = te_data[['description_id']].merge(df_pred,
                                                on=['description_id'],
                                                how='left')
    loader.save_df(df_pred, out_path1)
Exemplo n.º 18
0
def sub_convert(df_path, pred_path, out_path):
    """Build the recommendation submission CSV: per clickout, a
    space-separated impression list ordered by predicted score.

    Args:
        df_path: path of the sample frame holding the id columns.
        pred_path: path of the prediction frame with a 'target' column.
        out_path: destination CSV path.
    """
    df_data = loader.load_df(df_path)
    df_pred = loader.load_df(pred_path)

    required_cols = [
        'user_id', 'session_id', 'timestamp', 'step', 'impressions'
    ]
    df_sub = df_data[required_cols]
    # NOTE(review): assumes df_pred rows align positionally with df_data —
    # verify both frames share the same row order/index.
    df_sub['target'] = df_pred['target']
    # Descending (session_id, target) sort puts the highest-scored
    # impression first within each session.
    df_sub = df_sub.sort_values(by=['session_id', 'target'], \
            ascending=False).reset_index(drop=True)

    # Join impressions per clickout; groupby preserves within-group order,
    # so the list stays score-descending.
    df_sub['item_recommendations'] = df_sub['impressions'].astype(str)
    df_sub = df_sub.groupby(['user_id','session_id','timestamp','step']) \
                ['item_recommendations'].apply(lambda lst : ' '.join((lst))).reset_index()

    df_sub = df_sub.sort_values(by='session_id').reset_index(drop=True)
    df_sub.to_csv(out_path, float_format='%.4f', index=False)
Exemplo n.º 19
0
def cv_convert(pred_path, out_path):
    """Rank CV predictions within each session by descending score and
    write the ordered frame to CSV.

    Args:
        pred_path: path of the prediction frame with 'session_id'/'target'.
        out_path: destination CSV path.
    """
    preds = loader.load_df(pred_path)

    # Highest score first within each session, then number the rows.
    preds = preds.sort_values(['session_id', 'target'], ascending=False)
    preds['rank'] = preds.groupby('session_id').cumcount()

    preds = preds.sort_values(by=['session_id', 'rank'])
    preds = preds.reset_index(drop=True)
    preds.to_csv(out_path, float_format='%.4f', index=False)
Exemplo n.º 20
0
def gen_fea(base_tr_path=None, base_te_path=None):
    """Extract features from the item metadata table, join them onto the
    sampled train/test frames, downcast floats, and write via output_fea."""
    # Feature source here is the item metadata, not the event logs.
    df_base = loader.load_df('../input/item_metadata.ftr')
    df_feat = feat_extract(df_base)

    tr_sample = loader.load_df('../feature/tr_s0_0.ftr')
    te_sample = loader.load_df('../feature/te_s0_0.ftr')

    merge_keys = ['session_id', 'impressions']
    tr = tr_sample[ID_NAMES].merge(df_feat, on=merge_keys, how='left')
    te = te_sample[ID_NAMES].merge(df_feat, on=merge_keys, how='left')

    # Downcast float64 feature columns to float32 to save memory.
    float_cols = [c for c in tr.columns if tr[c].dtype == 'float']
    tr[float_cols] = tr[float_cols].astype('float32')
    te[float_cols] = te[float_cols].astype('float32')

    print(tr.shape, te.shape)
    print(tr.head())
    print(te.head())
    print(tr.columns)

    output_fea(tr, te)
Exemplo n.º 21
0
def merge_sub(file_path, sub_name, fold_num):
    """Average per-fold submission predictions into one frame and save it.

    Expects '<sub_name>_1.csv' .. '<sub_name>_<fold_num>.csv' inside
    file_path; writes '<sub_name>.ftr' with the mean TARGET_NAME column.
    """
    file_list = os.listdir(file_path)

    paths = ['{}_{}.csv'.format(sub_name, fold)
             for fold in range(1, fold_num + 1)]
    print(paths)

    df = pd.DataFrame()
    for idx, fname in enumerate(paths):
        assert fname in file_list, '{} not exist'.format(fname)
        full_path = '{}/{}'.format(file_path, fname)
        if idx == 0:
            # First fold seeds the frame; later folds accumulate targets.
            df = loader.load_df(full_path)
        else:
            df[TARGET_NAME] += loader.load_df(full_path)[TARGET_NAME]

    df[TARGET_NAME] /= fold_num
    print(df.head())
    print(df.describe())
    loader.save_df(df, '{}/{}.ftr'.format(file_path, sub_name))
Exemplo n.º 22
0
def gen_samples(paper, tr_desc_path, tr_recall_path):
    """Join recall candidates with paper metadata and description query
    text, compute features, and drop non-numeric columns.

    Returns:
        DataFrame: feature frame keyed by ID_NAMES.
    """
    tr_desc = loader.load_df(tr_desc_path)
    tr = loader.load_df(tr_recall_path)
    # NOTE(review): looks like a debugging sample cap left enabled —
    # confirm whether the full recall set should be processed here.
    tr = tr.head(1000)

    # Attach paper metadata and the description's query text to each pair.
    tr = tr.merge(paper, on=['paper_id'], how='left')
    tr = tr.merge(tr_desc[['description_id', 'quer_key', 'quer_all']],
                  on=['description_id'],
                  how='left')

    print(tr.columns)
    print(tr.head())

    tr = multi_process_feat(tr)
    # Drop non-numeric (object dtype) columns so the output is model-ready.
    del_cols = [
        col for col in tr.columns
        if tr[col].dtype == 'O' and col not in ID_NAMES
    ]
    print('tr del cols', del_cols)
    return tr.drop(del_cols, axis=1)
Exemplo n.º 23
0
def feat_extract(df):
    """Integer-encode the action type immediately preceding each session's
    sampled step ('act_pre1').

    Returns:
        DataFrame with ['session_id', 'act_pre1'] per sampled session step.
    """
    tr = loader.load_df('../input/tr.ftr')
    te = loader.load_df('../input/te.ftr')
    df_feat = pd.concat([tr, te])
    df_feat = df_feat[['session_id', 'step']].drop_duplicates()
    # Step of the action just before the sampled step.
    df_feat['step'] -= 1

    df = df[['session_id', 'step', 'action_type', 'reference']] \
            .drop_duplicates(subset=['session_id', 'step'])
    df_feat = df_feat.merge(df, on=['session_id', 'step'], how='left')
    print (df_feat.head())
    # Factorize the previous action type (missing values encode as -1).
    df_feat['act_pre1'] = pd.factorize(df_feat['action_type'], sort=True)[0]

    df_feat = df_feat[['session_id', 'act_pre1']]

    print (df_feat.shape)
    print (df_feat.head())
    print (df_feat.columns.tolist())

    return df_feat
Exemplo n.º 24
0
def feat_extract(df):
    """Per-session price statistics (max/min/median/std) over items the
    session interacted with before its final clickout.

    Returns:
        DataFrame with one row per session: 'session_id' plus
        'active_items-*-v2' statistic columns.
    """
    tr = loader.load_df('../input/tr.ftr')
    te = loader.load_df('../input/te.ftr')
    df_sample = pd.concat([tr, te])

    # Item-level events that carry an item id in 'reference'.
    actions = ['interaction item image', 'interaction item info', \
            'interaction item deals', 'interaction item rating', \
            'search for item', 'clickout item']
    df = df[df.action_type.isin(actions)]
    df = df[~pd.isnull(df.reference)]
    df = df[['session_id', 'reference', 'step']]
    df.columns = ID_NAMES + ['step']
    df['impressions'] = df['impressions'].astype('int')
    df = df.merge(df_sample[['session_id', 'step']].drop_duplicates(), \
            on='session_id', how='left')
    # Filter out the last-click sample (keep only actions before it).
    df = df[df.step_x < df.step_y]

    # The full sample set carries the per-impression prices.
    tr = loader.load_df('../input/sample_train.ftr')
    te = loader.load_df('../input/sample_test.ftr')
    df_sample_all = pd.concat([tr, te])

    df_feat = df[ID_NAMES + ['step_x']].drop_duplicates(subset=ID_NAMES) \
            .merge(df_sample_all[ID_NAMES + ['prices']], on=ID_NAMES, how='left')

    print(df_feat.head())
    print('filter', df_feat.shape)
    # Keep only items that actually had a prior interaction.
    sub_df = df_feat[~pd.isnull(df_feat['step_x'])]
    print('filter', sub_df.shape)

    # Aggregate interacted-item prices per session.
    df_feat = df_sample[['session_id']].drop_duplicates()
    df_feat = cate_encoding.cate_num_stat(sub_df, df_feat, ['session_id'], \
            'prices', ['max', 'min', 'median', 'std'])
    df_feat.columns = ['session_id'] + \
            ['active_items-{}-v2'.format(c) for c in df_feat.columns.tolist()[1:]]

    print(df_feat.shape)
    print(df_feat.head())
    print(df_feat.columns.tolist())

    return df_feat
Exemplo n.º 25
0
def gen_fea(base_tr_path=None, base_te_path=None):
    """Extract session-level features from the combined raw logs, cache
    them, join onto the sample frames by session, and persist."""
    df_base = pd.concat([loader.load_df('../input/train.ftr'),
                         loader.load_df('../input/test.ftr')])

    df_feat = feat_extract(df_base)
    # Cache the extracted features for reuse.
    loader.save_df(df_feat, '../feature/df_feat.ftr')

    tr_sample = loader.load_df('../feature/tr_s0_0.ftr')
    te_sample = loader.load_df('../feature/te_s0_0.ftr')

    # Session-level features: join on session_id only.
    tr = tr_sample[ID_NAMES].merge(df_feat, on='session_id', how='left')
    te = te_sample[ID_NAMES].merge(df_feat, on='session_id', how='left')

    print(tr.shape, te.shape)
    print(tr.head())
    print(te.head())
    print(tr.columns)

    output_fea(tr, te)
Exemplo n.º 26
0
def feat_extract(df):
    """Find, per session, the impression-list position of the most recently
    interacted item before the final clickout.

    Returns:
        DataFrame with one row per session:
        ['session_id', 'lastest_item-impr_rank'].
    """
    tr = loader.load_df('../input/tr.ftr')
    te = loader.load_df('../input/te.ftr')
    df_sample = pd.concat([tr, te])
    # Position of each impression within its session's list.
    df_sample['impr_rank'] = df_sample.groupby(['session_id'
                                                ]).cumcount().values

    # Item-level events that carry an item id in 'reference'.
    actions = ['interaction item image', 'interaction item info', \
            'interaction item deals', 'interaction item rating', \
            'search for item', 'clickout item']
    df = df[df.action_type.isin(actions)]
    df = df[~pd.isnull(df.reference)]
    df = df[['session_id', 'reference', 'step']]
    df.columns = ID_NAMES + ['step']
    df['impressions'] = df['impressions'].astype('int')
    df = df.merge(df_sample[['session_id', 'step']].drop_duplicates(), \
            on='session_id', how='left')
    # Filter out the last-click sample (keep only actions before it).
    df = df[df.step_x < df.step_y]
    # Keep each session's most recent interaction.
    df = df.drop_duplicates(subset=['session_id'], keep='last')

    df_feat = df_sample[ID_NAMES + ['impr_rank']] \
            .merge(df[ID_NAMES + ['step_x']] \
            .drop_duplicates(subset=ID_NAMES), on=ID_NAMES, how='left')

    print(df_feat.head())
    print('filter', df_feat.shape)
    # Rows whose item matches the last-interacted item keep their rank.
    sub_df = df_feat[~pd.isnull(df_feat['step_x'])]
    sub_df = sub_df[['session_id', 'impr_rank']]
    sub_df.columns = ['session_id', 'lastest_item-impr_rank']
    print('filter', sub_df.shape)

    df_feat = df_sample[['session_id']].drop_duplicates()
    df_feat = df_feat.merge(sub_df, on='session_id', how='left')

    print(df_feat.shape)
    print(df_feat.head())
    print(df_feat.columns.tolist())

    return df_feat
Exemplo n.º 27
0
def feat_extract(df):
    """Hotel-star features per impression: raw star rating, per-session
    star statistics, deltas to the session max/median, star-per-price
    ratio and its within-session rank.

    Args:
        df: item metadata with 'item_id' and pipe-separated 'properties'.
    """
    tr = loader.load_df('../input/tr.ftr')
    te = loader.load_df('../input/te.ftr')
    df_sample = pd.concat([tr, te])

    # Pull the 'N Star' tag from properties ('...Stars' variants excluded).
    df['star'] = df.properties.apply(lambda s : \
            ''.join([v for v in s.split('|') if 'Star' in v and 'Stars' not in v]))
    df['star'] = df['star'].apply(lambda s: s.split(' ')[0])
    # Items without a star tag default to 0.
    df['star'].replace('', 0, inplace=True)
    print(df['star'].value_counts())
    df_star = df[['item_id', 'star']].drop_duplicates(subset=['item_id'])
    df_star.columns = ['impressions', 'star']
    df_star['star'] = df_star['star'].astype('int')

    df_feat = df_sample[ID_NAMES + ['prices']].drop_duplicates(subset=ID_NAMES) \
            .merge(df_star, on='impressions', how='left')

    # Per-session star statistics.
    df_feat = cate_encoding.cate_num_stat(df_feat, df_feat, \
            ['session_id'], 'star', ['max', 'median', 'std'])

    # How this item's star compares within its session.
    df_feat['star_sub_session_max'] = \
            df_feat['star'] - df_feat['session_id_by_star_max']
    df_feat['star_sub_session_median'] = \
            df_feat['star'] - df_feat['session_id_by_star_median']

    # Stars per unit price (scaled by 100).
    df_feat['star_div_prices'] = df_feat['star'] * 100 / df_feat['prices']
    print(df_feat['star_div_prices'].describe())

    df_feat = cate_encoding.cate_num_rank(df_feat, \
            ['session_id'], 'star_div_prices', ascending=False)
    del df_feat['prices']

    print('df_feat info')
    print(df_feat.shape)
    print(df_feat.head())
    print(df_feat.columns.tolist())

    return df_feat
Exemplo n.º 28
0
def feat_extract(df):
    """Timestamp gap between each session's sampled step and the event just
    before it ('ts_sub_prev').

    Returns:
        DataFrame with one row per session: ['session_id', 'ts_sub_prev'].
    """
    tr = loader.load_df('../input/tr.ftr')
    te = loader.load_df('../input/te.ftr')
    df_sample = pd.concat([tr, te])

    # Timestamp delta to the previous event within the same session.
    df['ts_prev'] = df.groupby('session_id')['timestamp'].shift(1)
    df['ts_sub_prev'] = df['timestamp'] - df['ts_prev']
    df = df[['session_id', 'step', 'ts_sub_prev']]
    # Pick the delta at the sampled step of each session.
    df_feat = df_sample[['session_id', 'step']].drop_duplicates() \
            .merge(df, on=['session_id', 'step'], how='left')
    df_feat = df_feat[['session_id', 'ts_sub_prev']]
    df_feat = df_feat.drop_duplicates(subset=['session_id'])

    print(df_feat['ts_sub_prev'].describe())
    # Negative deltas exist in the data; inspected but deliberately left
    # untouched (clipping was tried and disabled below).
    print(df_feat[df_feat['ts_sub_prev'] < 0].shape)
    #df_feat['ts_sub_prev'] = df_feat['ts_sub_prev'].apply(lambda x : max(x, 0))
    #print (df_feat['ts_sub_prev'].describe())

    print(df_feat.shape)
    print(df_feat.head())
    print(df_feat.columns.tolist())

    return df_feat
Exemplo n.º 29
0
def feat_extract(df):
    """Per (session, impression): the step of the item's most recent prior
    interaction ('nearest_step') and its distance to the final clickout
    step ('nearest_step_delta').
    """
    tr = loader.load_df('../input/tr.ftr')
    te = loader.load_df('../input/te.ftr')
    df_feat = pd.concat([tr, te])
    df_feat = df_feat[['session_id', 'impressions', 'step']].drop_duplicates()

    # Item-level events that carry an item id in 'reference'.
    actions = ['interaction item image', 'interaction item info', \
            'interaction item deals', 'interaction item rating', \
            'search for item', 'clickout item']
    df = df[df.action_type.isin(actions)]
    df = df[~pd.isnull(df.reference)]
    df = df[['session_id', 'reference', 'step']]
    df.columns = ID_NAMES + ['step']
    df['impressions'] = df['impressions'].astype('int')
    df = df.merge(df_feat[['session_id', 'step']].drop_duplicates(), \
            on='session_id', how='left')
    # Filter out the last-click sample and keep each item's last action.
    df = df[df.step_x < df.step_y]
    df.drop_duplicates(subset=ID_NAMES, keep='last', inplace=True)

    del df_feat['step']
    df_feat = df_feat.merge(df, on=ID_NAMES, how='left')
    print('filter', df_feat.shape)

    # step_x: last interaction step; step_y: final clickout step.
    df_feat['nearest_step'] = df_feat['step_x']
    df_feat['nearest_step_delta'] = df_feat['step_y'] - df_feat['step_x']

    print(df_feat.head())
    print(df_feat[['nearest_step', 'nearest_step_delta']].describe())
    df_feat = df_feat[ID_NAMES + ['nearest_step', 'nearest_step_delta']]

    print(df_feat.shape)
    print(df_feat.head())
    print(df_feat.columns.tolist())

    return df_feat
Exemplo n.º 30
0
def merge_val(file_path, sub_name, fold_num):
    """Concatenate per-fold CV prediction files into one frame and save it.

    Expects '<sub_name>_cv_1.csv' .. '<sub_name>_cv_<fold_num>.csv' inside
    file_path; writes '<sub_name>_cv.ftr'.
    """
    file_list = os.listdir(file_path)

    paths = ['{}_cv_{}.csv'.format(sub_name, fold)
             for fold in range(1, fold_num + 1)]
    print(paths)

    frames = []
    for fname in paths:
        assert fname in file_list, '{} not exist'.format(fname)
        frames.append(loader.load_df('{}/{}'.format(file_path, fname)))

    df = pd.concat(frames)
    print(df.head())
    print(df.describe())
    loader.save_df(df, '{}/{}_cv.ftr'.format(file_path, sub_name))