Пример #1
0
                all_phase_click_no_qtime['user_id'])]
        print('load features, shape:{}'.format(feature_df.shape))
    else:
        feature_df = do_featuring(
            all_phase_click_no_qtime,
            sample_df,
            hot_df,
            conf.process_num,
            item_txt_embedding_dim,
            is_recall=False,
            feature_caching_path=conf.features_cache_path)

    assert sample_df.shape[0] == feature_df.shape[0]
    assert len(set(sample_df['user_id'])) == len(set(feature_df['user_id']))
    ''' 训练集/验证集划分 '''
    train_df, valid_df = train_test_split(feature_df)
    train_x = train_df[train_df.columns.difference(
        ['user_id', 'item_id', 'label'])].values
    train_y = train_df['label'].values

    valid_x = valid_df[valid_df.columns.difference(
        ['user_id', 'item_id', 'label'])].values
    valid_y = valid_df['label'].values
    ''' 模型训练 '''
    time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print('------------------------ 模型训练 start time:{}'.format(time_str))
    # submit = train_model_lgb(feature_all, recall_rate=hit_rate, hot_list=hot_list, valid=0.2, topk=50, num_boost_round=1, early_stopping_rounds=1)
    # submit = train_model_rf(train_test, recall_rate=1, hot_list=hot_list, valid=0.2, topk=50)
    model = rank_rf(train_x, train_y)
    # model = rank_xgb(train_x, train_y)
    print('train set: auc:{}'.format(
Пример #2
0
def _get_model(params):
    """Score one hyper-parameter setting of the XGBoost ranker.

    ``params`` is a mapping that carries the feature frame under
    ``"feature"``, the item-popularity frame under ``"hot_df"`` and,
    optionally, any of the hyper-parameters forwarded to ``rank.rank_xgb``.
    A hyper-parameter that is absent from ``params`` is forwarded as
    ``None``; ``max_depth`` and ``n_estimators`` are cast to ``int`` when
    present.

    The model is trained and evaluated ``conf.k`` times.  For every round
    the ``eval.my_eval`` scores of the model ranking and of the raw ``sim``
    ranking are accumulated, then averaged over the rounds.

    :param params: mapping described above.
    :return: the negated element 2 of (average model score - average
        ``sim`` score), i.e. what the log line calls "ndcg half gain".
        Presumably negated so that a minimiser maximises the gain -- confirm
        against the caller.
    """
    feature_df = params.get("feature")
    hot_df = params.get("hot_df")

    # Hyper-parameters forwarded verbatim / after an int cast.
    verbatim_keys = ('eta', 'min_child_weight', 'gamma', 'subsample',
                     'colsample_bytree', 'reg_lambda', 'scale_pos_weight',
                     'tree_method')
    integer_keys = ('max_depth', 'n_estimators')

    xgb_kwargs = {}
    for key in verbatim_keys:
        xgb_kwargs[key] = params[key] if key in params else None
    for key in integer_keys:
        xgb_kwargs[key] = int(params[key]) if key in params else None

    non_feature_columns = ['user_id', 'item_id', 'label']

    # NOTE(review): the train AUC is accumulated but never reported.
    train_auc = 0
    valid_auc = 0
    pre_score_arr = np.zeros(5)
    rank_score_arr = np.zeros(5)

    for _ in range(conf.k):
        # Train / validation split.
        # NOTE(review): the same seed is passed on every round; if the split
        # is deterministic in the seed, all rounds use the same split --
        # confirm that this is intended.
        train_df, valid_df = featuring.train_test_split(feature_df, seed=1)

        train_feature_columns = train_df.columns.difference(
            non_feature_columns)
        train_x = train_df[train_feature_columns].values
        train_y = train_df['label'].values

        valid_df = valid_df.sort_values('sim').reset_index(drop=True)
        valid_feature_columns = valid_df.columns.difference(
            non_feature_columns)
        valid_x = valid_df[valid_feature_columns].values
        valid_y = valid_df['label'].values

        # Model training.
        model = rank.rank_xgb(train_x, train_y, **xgb_kwargs)
        train_scores = model.predict_proba(train_x)[:, 1]
        train_auc += roc_auc_score(train_y, train_scores)

        # Model validation.
        pre_y = model.predict_proba(valid_x)[:, 1]
        valid_auc += roc_auc_score(valid_y, pre_y)

        positives = valid_df[valid_df['label'] == 1]
        answer = eval.make_answer(positives, hot_df, phase=1)

        # Baseline: rank by the raw ``sim`` column; candidate: rank by the
        # model's predicted probability.
        pre_score_arr += eval.my_eval(list(valid_df['sim']),
                                      valid_df,
                                      answer,
                                      print_mark=False)
        rank_score_arr += eval.my_eval(pre_y,
                                       valid_df,
                                       answer,
                                       print_mark=False)

    avg_valid_auc = valid_auc / conf.k
    avg_pre_ndcg = pre_score_arr / conf.k
    avg_rank_ndcg = rank_score_arr / conf.k
    diff = avg_rank_ndcg - avg_pre_ndcg
    print('avg valid auc:{}, ndcg full gain:{}, ndcg half gain:{}'.format(
        avg_valid_auc, diff[0], diff[2]))

    return -diff[2]
Пример #3
0
    total_feature_df = utils.get_features(total_feature_df, is_label=1, type=0)
    print('feature shape:{}, positive feature num:{}'.format(
        total_feature_df.shape,
        total_feature_df[total_feature_df['label'] == 1].shape[0]))
    # 这里的hot_df与训练集不是同步的,暂时凑合着用
    hot_df = all_phase_click_no_qtime.groupby(
        'item_id')['user_id'].count().reset_index()
    hot_df.columns = ['item_id', 'item_deg']
    hot_df = hot_df.sort_values('item_deg',
                                ascending=False).reset_index(drop=True)
    train_auc = valid_auc = 0
    pre_score_arr = np.zeros(5).reshape(-1, )
    rank_score_arr = np.zeros(5).reshape(-1, )
    for i in range(conf.k):
        ''' 训练集/验证集划分 '''
        train_df, valid_df = train_test_split(total_feature_df)

        qtime_user_df = all_phase_click[all_phase_click['train_or_test'] ==
                                        'predict']
        print('训练集命中{}个qtime的user.'.format(
            len(
                set(train_df[train_df['user_id'].isin(
                    qtime_user_df['user_id'])]['user_id']))))

        train_x = train_df[train_df.columns.difference(
            ['user_id', 'item_id', 'label', 'truth_item_id'])].values
        train_y = train_df['label'].values

        valid_df = valid_df.sort_values('sim').reset_index(drop=True)
        valid_x = valid_df[valid_df.columns.difference(
            ['user_id', 'item_id', 'label', 'truth_item_id'])].values
Пример #4
0
def _read_cached_split(name, dtype):
    """Load ``./cache/features_cache/<name>_<phase>.csv`` with ``dtype``."""
    return pd.read_csv(
        './cache/features_cache/{}_{}.csv'.format(name, phase), dtype=dtype)


def _attach_itemcf_score(data, sim_matrix):
    """Fill ``data['itemcf_score']`` in place from ``sim_matrix``.

    The score of a row is ``sim_matrix[user_id][item_id]``; it is NaN when
    the user is missing, the item is missing for that user, or the stored
    value is ``None``.
    """
    scores = []
    for user_id, item_id in zip(data['user_id'], data['item_id']):
        user_sims = sim_matrix.get(user_id)
        score = None if user_sims is None else user_sims.get(item_id)
        scores.append(np.nan if score is None else score)
    # An explicit float dtype keeps an empty frame from getting an object
    # column.
    data['itemcf_score'] = pd.Series(scores, index=data.index,
                                     dtype='float64')


def _cache_features(data, label, file_stem):
    """Checkpoint ``data`` to the ``part0`` cache when caching is enabled."""
    if not is_caching_features:
        return
    print('正在缓存{}'.format(label))
    data.to_csv(
        './cache/features_cache/part0_{}_phase_{}.csv'.format(
            file_stem, phase),
        index=False)


def do_featuring(click_df, item_info_df, user_info_df, user_item_dict,
                 train_user_recall_df, test_user_recall_df, sim_matrix,
                 hot_df):
    """Build the train / valid / recall sample sets and attach features.

    Steps: (1) obtain the four sample sets, either from the CSV cache or by
    negative sampling plus the positives in ``user_item_dict``; (2) add the
    itemCF score to train / valid; (3) add text / image similarity features
    via ``cal_txt_img_sim``; (4) add click-embedding similarity features via
    ``cal_click_sim``; (5) post-process with ``process_after_featuring``.

    The function also reads these module-level settings: ``phase``,
    ``is_data_set_cached``, ``is_caching_features``,
    ``is_open_train_recall``, ``process_num``, ``item_txt_embedding_dim``
    and ``item_img_embedding_dim``.

    :param click_df: click log; the columns ``user_id``, ``item_id`` and
        ``train_or_test`` are read here.  Its user set must equal the union
        of the users of the four sample sets.
    :param item_info_df: item feature table, forwarded to
        ``get_user_features`` and ``transfer_item_features_df2dict``.
    :param user_info_df: currently unused.
    :param user_item_dict: mapping ``user_id -> item_id`` of the positive
        sample of each user.
    :param train_user_recall_df: recall result for train users with the
        columns ``user_id``, ``item_id``, ``itemcf_score``; replaced by the
        cached file when ``is_data_set_cached`` is set.
    :param test_user_recall_df: recall result for test users, same columns;
        replaced by the cached file when ``is_data_set_cached`` is set.
    :param sim_matrix: nested mapping ``user_id -> item_id -> score``.
    :param hot_df: currently unused (only the disabled statistical features
        consumed it).
    :return: ``(train_data, valid_data, train_user_recall_df,
        test_user_recall_df)``; ``train_user_recall_df`` is ``None`` when
        ``is_open_train_recall`` is falsy.
    """
    # ---- Sample-set construction -------------------------------------
    # Negative sampling draws non-positive items from the click history.
    # TODO: sample from the official item table / from clicks + item table.
    # TODO: many open todos inside sampling_negtive_samples.
    # TODO: only "train" clicks are used for negative sampling; try train
    #       and test together later.
    if is_data_set_cached:
        print('reading train/valid ... set')
        # Builtin types instead of np.str / np.int / np.float: those aliases
        # were removed from NumPy.
        id_dtypes = {'user_id': str, 'item_id': str}
        train_data = _read_cached_split('train_data',
                                        dict(id_dtypes, label=int))
        valid_data = _read_cached_split('valid_data',
                                        dict(id_dtypes, label=int))
        train_user_recall_df = _read_cached_split(
            'train_user_recall',
            dict(id_dtypes, label=int, itemcf_score=float))
        test_user_recall_df = _read_cached_split(
            'test_user_recall', dict(id_dtypes, itemcf_score=float))
        print(train_data.shape)
        print(valid_data.shape)
        print(train_user_recall_df.shape)
        print(test_user_recall_df.shape)
    else:
        train_clicks = click_df[click_df['train_or_test'] == 'train'][[
            'user_id', 'item_id'
        ]]
        negtive_features = utils.sampling_negtive_samples(
            set(train_clicks['user_id']), train_clicks, sample_num=10)
        negtive_features['label'] = 0

        # Positive samples: one (user, item) pair per user_item_dict entry.
        positive_features = pd.DataFrame({
            'user_id': list(user_item_dict.keys()),
            'item_id': list(user_item_dict.values()),
            'label': 1,
        })

        # DataFrame.append no longer exists in current pandas; concat is the
        # equivalent.
        features = pd.concat([negtive_features,
                              positive_features]).reset_index(drop=True)

        # TODO: many open todos inside train_test_split.
        train_data, valid_data = train_test_split(features, 0.8)

        # Recall result of the train users, used to validate the model
        # trained on the set above.  Copy so that adding ``label`` does not
        # write into a slice of the caller's frame.
        train_user_recall_df = train_user_recall_df[[
            'user_id', 'item_id', 'itemcf_score'
        ]].copy()
        # A row is positive only when its item is the one user_item_dict
        # holds for *that* user.  Testing user and item membership
        # independently would also flag items that are positives of other
        # users.
        expected_item = train_user_recall_df['user_id'].map(user_item_dict)
        train_user_recall_df['label'] = (
            expected_item == train_user_recall_df['item_id']).astype(int)

        # Recall result of the test users, used for the submission.
        test_user_recall_df = test_user_recall_df[[
            'user_id', 'item_id', 'itemcf_score'
        ]]
        # Caching of the freshly split sets is currently disabled.

    print(np.sum(train_data['user_id'].isin(click_df['user_id'])), ',',
          np.sum(click_df['user_id'].isin(train_data['user_id'])))

    # ---- itemCF similarity -------------------------------------------
    time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print('itemCF相似度特征 start time:{}'.format(time_str))
    _attach_itemcf_score(train_data, sim_matrix)
    print(train_data)
    _attach_itemcf_score(valid_data, sim_matrix)
    # Negative recall scores are left as they are; clamping them to 0 was
    # tried and dropped (TODO: it may make the feature very sparse).

    # ---- Official features -------------------------------------------
    # 1. text similarity between user and item
    # 2. image similarity between user and item
    time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print('官方特征 start time:{}'.format(time_str))
    # click_df must contain exactly the users of the four sets.
    total_set_user = (set(train_data['user_id'])
                      | set(valid_data['user_id'])
                      | set(train_user_recall_df['user_id'])
                      | set(test_user_recall_df['user_id']))
    assert set(click_df['user_id']) == total_set_user
    user_features = get_user_features(click_df, item_info_df,
                                      item_txt_embedding_dim,
                                      item_img_embedding_dim)
    user_features_dict = transfer_user_features_df2dict(
        user_features, item_txt_embedding_dim)
    item_features_dict = transfer_item_features_df2dict(
        item_info_df, item_txt_embedding_dim)

    assert item_txt_embedding_dim == item_img_embedding_dim
    # Every set is cached as soon as it has been computed.
    train_data = cal_txt_img_sim(train_data, user_features_dict,
                                 item_features_dict, item_img_embedding_dim,
                                 process_num)
    _cache_features(train_data, 'train_data', 'train_features')
    print(train_data)

    valid_data = cal_txt_img_sim(valid_data, user_features_dict,
                                 item_features_dict, item_img_embedding_dim,
                                 process_num)
    _cache_features(valid_data, 'valid_data', 'valid_features')

    if is_open_train_recall:
        train_user_recall_df = cal_txt_img_sim(train_user_recall_df,
                                               user_features_dict,
                                               item_features_dict,
                                               item_img_embedding_dim,
                                               process_num)
        _cache_features(train_user_recall_df, 'train_user_recall_df',
                        'train_user_recall_features')

    test_user_recall_df = cal_txt_img_sim(test_user_recall_df,
                                          user_features_dict,
                                          item_features_dict,
                                          item_img_embedding_dim, process_num)
    _cache_features(test_user_recall_df, 'test_user_recall_df',
                    'test_user_recall_features')

    # ---- Click-sequence embedding features ---------------------------
    # 1. plain item sequence                   -- dropped
    # 2. item sequence with its user           -- dropped
    # 3. plain user sequence                   -- dropped
    # 4. user sequence with the shared item    -- dropped
    # 5. user/item similarity derived from 2
    # 6. user/item similarity derived from 4
    time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print('点击序embedding特征 start time:{}'.format(time_str))
    dict_embedding_all_ui_item, dict_embedding_all_ui_user = click_embedding(
        click_df, item_img_embedding_dim)

    # The checkpoints below overwrite the part0 files written above.
    train_data = cal_click_sim(train_data, dict_embedding_all_ui_item,
                               dict_embedding_all_ui_user, process_num)
    _cache_features(train_data, 'train_data', 'train_features')

    valid_data = cal_click_sim(valid_data, dict_embedding_all_ui_item,
                               dict_embedding_all_ui_user, process_num)
    _cache_features(valid_data, 'valid_data', 'valid_features')

    if is_open_train_recall:
        train_user_recall_df = cal_click_sim(train_user_recall_df,
                                             dict_embedding_all_ui_item,
                                             dict_embedding_all_ui_user,
                                             process_num)
        _cache_features(train_user_recall_df, 'train_user_recall_df',
                        'train_user_recall_features')

    test_user_recall_df = cal_click_sim(test_user_recall_df,
                                        dict_embedding_all_ui_item,
                                        dict_embedding_all_ui_user,
                                        process_num)
    _cache_features(test_user_recall_df, 'test_user_recall_df',
                    'test_user_recall_features')

    print(train_data.columns)
    print(train_data.iloc[:5, :])
    print(valid_data.iloc[:5, :])
    print(train_user_recall_df.iloc[:5, :])
    print(test_user_recall_df.iloc[:5, :])

    # ---- Disabled statistical features -------------------------------
    # Earlier revisions also computed, and cached to part2_* files:
    #   * the number of clicks per user (click depth),
    #   * the global popularity of the item (merge with hot_df),
    #   * mean / min / max popularity of the items a user clicked,
    #   * mean / min / max interval between a user's clicks, computed
    #     separately for train and test users,
    #   * the position of the sample item in the user's click sequence.
    # All of them are switched off.  The last one was disabled because it
    # leaks label information.

    # ---- Post-processing ---------------------------------------------
    if is_open_train_recall:
        (train_data, valid_data, train_user_recall_df,
         test_user_recall_df) = process_after_featuring(
             train_data, valid_data, train_user_recall_df,
             test_user_recall_df, is_open_train_recall)
    else:
        train_data, valid_data, test_user_recall_df = process_after_featuring(
            train_data, valid_data, None, test_user_recall_df,
            is_open_train_recall)
        train_user_recall_df = None

    print(train_data.iloc[:5, :])
    print(valid_data.iloc[:5, :])
    if is_open_train_recall:
        print(train_user_recall_df.iloc[:5, :])
    print(test_user_recall_df.iloc[:5, :])

    return train_data, valid_data, train_user_recall_df, test_user_recall_df