all_phase_click_no_qtime['user_id'])] print('load features, shape:{}'.format(feature_df.shape)) else: feature_df = do_featuring( all_phase_click_no_qtime, sample_df, hot_df, conf.process_num, item_txt_embedding_dim, is_recall=False, feature_caching_path=conf.features_cache_path) assert sample_df.shape[0] == feature_df.shape[0] assert len(set(sample_df['user_id'])) == len(set(feature_df['user_id'])) ''' 训练集/验证集划分 ''' train_df, valid_df = train_test_split(feature_df) train_x = train_df[train_df.columns.difference( ['user_id', 'item_id', 'label'])].values train_y = train_df['label'].values valid_x = valid_df[valid_df.columns.difference( ['user_id', 'item_id', 'label'])].values valid_y = valid_df['label'].values ''' 模型训练 ''' time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) print('------------------------ 模型训练 start time:{}'.format(time_str)) # submit = train_model_lgb(feature_all, recall_rate=hit_rate, hot_list=hot_list, valid=0.2, topk=50, num_boost_round=1, early_stopping_rounds=1) # submit = train_model_rf(train_test, recall_rate=1, hot_list=hot_list, valid=0.2, topk=50) model = rank_rf(train_x, train_y) # model = rank_xgb(train_x, train_y) print('train set: auc:{}'.format(
def _get_model(params):
    """Hyper-parameter search objective: train and score an XGBoost ranker.

    Runs ``conf.k`` train/valid rounds, fits ``rank.rank_xgb`` with the
    hyper-parameters found in *params*, and measures the average NDCG gain
    of the learned ranking over the raw recall-similarity ('sim') ordering.

    :param params: dict with keys ``'feature'`` (feature DataFrame) and
        ``'hot_df'`` (item popularity DataFrame), plus optional xgboost
        hyper-parameters (eta, max_depth, n_estimators, ...). Missing
        hyper-parameters are passed as None so rank_xgb applies defaults.
    :return: negative average "ndcg half gain" (the optimizer minimizes,
        so a larger ranking gain yields a smaller objective value).
    """
    feature_df = params.get("feature")
    hot_df = params.get("hot_df")

    # Optional hyper-parameters: dict.get collapses the original
    # None-then-overwrite boilerplate; absent keys stay None.
    eta = params.get('eta')
    min_child_weight = params.get('min_child_weight')
    max_depth = params.get('max_depth')
    if max_depth is not None:
        # Search libraries sample floats; xgboost requires an int here.
        max_depth = int(max_depth)
    gamma = params.get('gamma')
    subsample = params.get('subsample')
    colsample_bytree = params.get('colsample_bytree')
    reg_lambda = params.get('reg_lambda')
    scale_pos_weight = params.get('scale_pos_weight')
    tree_method = params.get('tree_method')
    n_estimators = params.get('n_estimators')
    if n_estimators is not None:
        n_estimators = int(n_estimators)

    train_auc = valid_auc = 0
    pre_score_arr = np.zeros(5)   # scores of the raw recall ordering
    rank_score_arr = np.zeros(5)  # scores of the model ranking
    for i in range(conf.k):
        # NOTE(review): seed is fixed at 1, so all conf.k iterations use
        # the identical split and averaging is a no-op — confirm whether
        # seed=i was intended.
        train_df, valid_df = featuring.train_test_split(feature_df, seed=1)
        train_x = train_df[train_df.columns.difference(
            ['user_id', 'item_id', 'label'])].values
        train_y = train_df['label'].values
        # Sort by recall similarity so the 'sim' column can be evaluated
        # directly as a ranking.
        valid_df = valid_df.sort_values('sim').reset_index(drop=True)
        valid_x = valid_df[valid_df.columns.difference(
            ['user_id', 'item_id', 'label'])].values
        valid_y = valid_df['label'].values

        # Train the ranker with the sampled hyper-parameters.
        model = rank.rank_xgb(train_x,
                              train_y,
                              eta=eta,
                              min_child_weight=min_child_weight,
                              max_depth=max_depth,
                              gamma=gamma,
                              subsample=subsample,
                              colsample_bytree=colsample_bytree,
                              reg_lambda=reg_lambda,
                              scale_pos_weight=scale_pos_weight,
                              tree_method=tree_method,
                              n_estimators=n_estimators)
        one_train_auc = roc_auc_score(train_y,
                                      model.predict_proba(train_x)[:, 1])
        train_auc += one_train_auc

        # Validation: AUC plus NDCG-style scores of both orderings.
        # NOTE(review): 'eval' here is a project module that shadows the
        # builtin eval within this file.
        pre_y = model.predict_proba(valid_x)[:, 1]
        one_valid_auc = roc_auc_score(valid_y, pre_y)
        valid_auc += one_valid_auc
        answer = eval.make_answer(valid_df[valid_df['label'] == 1],
                                  hot_df,
                                  phase=1)
        pre_score_arr += eval.my_eval(list(valid_df['sim']),
                                      valid_df,
                                      answer,
                                      print_mark=False)
        rank_score_arr += eval.my_eval(pre_y,
                                       valid_df,
                                       answer,
                                       print_mark=False)

    avg_valid_auc = valid_auc / conf.k
    avg_pre_ndcg = pre_score_arr / conf.k
    avg_rank_ndcg = rank_score_arr / conf.k
    # Positive diff means the learned ranking beats the raw recall order.
    diff = avg_rank_ndcg - avg_pre_ndcg
    print('avg valid auc:{}, ndcg full gain:{}, ndcg half gain:{}'.format(
        avg_valid_auc, diff[0], diff[2]))
    return -diff[2]
# Build the full feature table (is_label=1 keeps the label column; type=0
# selects train-style featuring — semantics of utils.get_features assumed,
# TODO confirm against its definition).
total_feature_df = utils.get_features(total_feature_df, is_label=1, type=0)
print('feature shape:{}, positive feature num:{}'.format(
    total_feature_df.shape,
    total_feature_df[total_feature_df['label'] == 1].shape[0]))
# The hot_df built here is not synchronized with the training set;
# makeshift for now (translated from the original Chinese note).
# Item popularity ("degree") = click count per item over the full log.
hot_df = all_phase_click_no_qtime.groupby(
    'item_id')['user_id'].count().reset_index()
hot_df.columns = ['item_id', 'item_deg']
hot_df = hot_df.sort_values('item_deg',
                            ascending=False).reset_index(drop=True)

# Accumulators over the conf.k evaluation rounds; the 5-slot arrays hold
# the score vectors returned by the evaluation helper.
train_auc = valid_auc = 0
pre_score_arr = np.zeros(5).reshape(-1, )
rank_score_arr = np.zeros(5).reshape(-1, )
for i in range(conf.k):
    ''' 训练集/验证集划分 '''
    train_df, valid_df = train_test_split(total_feature_df)
    # Users that require predictions (rows marked 'predict').
    qtime_user_df = all_phase_click[all_phase_click['train_or_test'] ==
                                    'predict']
    print('训练集命中{}个qtime的user.'.format(
        len(
            set(train_df[train_df['user_id'].isin(
                qtime_user_df['user_id'])]['user_id']))))
    # Feature matrix excludes identifier/label columns.
    train_x = train_df[train_df.columns.difference(
        ['user_id', 'item_id', 'label', 'truth_item_id'])].values
    train_y = train_df['label'].values
    # Sort validation rows by recall similarity so 'sim' can be scored
    # directly as a baseline ranking.
    valid_df = valid_df.sort_values('sim').reset_index(drop=True)
    valid_x = valid_df[valid_df.columns.difference(
        ['user_id', 'item_id', 'label', 'truth_item_id'])].values
def do_featuring(click_df, item_info_df, user_info_df, user_item_dict,
                 train_user_recall_df, test_user_recall_df, sim_matrix,
                 hot_df):
    """Build the ranking feature tables for the four data sets.

    Either loads a cached train/valid split or creates one by negative
    sampling from the click log, then attaches feature columns to each of
    train_data, valid_data, train_user_recall_df and test_user_recall_df:
    itemCF similarity, official txt/img embedding similarities and
    click-sequence embedding similarities.

    :param click_df: full click log with 'user_id', 'item_id',
        'train_or_test'; must cover exactly the users of all four sets.
    :param item_info_df: official item feature table.
    :param user_info_df: official user feature table (currently unused).
    :param user_item_dict: user_id -> ground-truth item_id; source of the
        positive samples and of the train-recall labels.
    :param train_user_recall_df: recall results for train users (used to
        validate the trained model); needs user_id/item_id/itemcf_score.
    :param test_user_recall_df: recall results for test users (submission).
    :param sim_matrix: nested dict user_id -> item_id -> itemCF score.
    :param hot_df: item popularity table (unused in the live code path).
    :return: (train_data, valid_data, train_user_recall_df,
        test_user_recall_df); train_user_recall_df is None when the
        module flag is_open_train_recall is falsy.

    Relies on module-level config and helpers defined elsewhere in this
    file: is_data_set_cached, is_caching_features, is_open_train_recall,
    phase, process_num, item_txt_embedding_dim, item_img_embedding_dim,
    utils, train_test_split, get_user_features,
    transfer_user_features_df2dict, transfer_item_features_df2dict,
    cal_txt_img_sim, click_embedding, cal_click_sim,
    process_after_featuring.
    """
    # ---- data set split: load cached split or sample a fresh one ----
    if is_data_set_cached:
        print('reading train/valid ... set')
        # np.str/np.int/np.float were deprecated in NumPy 1.20 and removed
        # in 1.24; they were plain aliases of the builtins used below, so
        # behavior is unchanged.
        train_data = pd.read_csv(
            './cache/features_cache/train_data_{}.csv'.format(phase),
            dtype={
                'user_id': str,
                'item_id': str,
                'label': int
            })
        valid_data = pd.read_csv(
            './cache/features_cache/valid_data_{}.csv'.format(phase),
            dtype={
                'user_id': str,
                'item_id': str,
                'label': int
            })
        train_user_recall_df = pd.read_csv(
            './cache/features_cache/train_user_recall_{}.csv'.format(phase),
            dtype={
                'user_id': str,
                'item_id': str,
                'label': int,
                'itemcf_score': float
            })
        test_user_recall_df = pd.read_csv(
            './cache/features_cache/test_user_recall_{}.csv'.format(phase),
            dtype={
                'user_id': str,
                'item_id': str,
                'itemcf_score': float
            })
        print(train_data.shape)
        print(valid_data.shape)
        print(train_user_recall_df.shape)
        print(test_user_recall_df.shape)
    else:
        # Negative sampling: draw non-positive items from the train-part
        # click history only (TODO in original: also sample from the item
        # table and from train+test together).
        train_test_df = click_df[click_df['train_or_test'] == 'train'][[
            'user_id', 'item_id'
        ]]
        user_set = set(train_test_df['user_id'])
        negtive_features = utils.sampling_negtive_samples(user_set,
                                                          train_test_df,
                                                          sample_num=10)
        negtive_features['label'] = 0

        # Positive samples: the ground-truth (user, item) pairs.
        positive_features = pd.DataFrame()
        positive_features['user_id'] = list(user_item_dict.keys())
        positive_features['item_id'] = list(user_item_dict.values())
        positive_features['label'] = 1

        # DataFrame.append was removed in pandas 2.0; concat + reset_index
        # is the documented equivalent.
        features = pd.concat([negtive_features,
                              positive_features]).reset_index(drop=True)

        train_data, valid_data = train_test_split(features, 0.8)

        # Train-user recall set: used to validate the trained model.
        train_user_recall_df = train_user_recall_df[[
            'user_id', 'item_id', 'itemcf_score'
        ]]
        train_user_recall_df['label'] = 0
        # NOTE(review): this marks a row positive when its user appears in
        # ANY ground-truth pair and its item appears in ANY ground-truth
        # pair — not necessarily the same pair. Kept as-is for behavior
        # parity; confirm whether a per-pair match was intended.
        train_user_recall_df.loc[
            train_user_recall_df['user_id'].isin(list(user_item_dict.keys()))
            & train_user_recall_df['item_id'].isin(
                list(user_item_dict.values())), 'label'] = 1

        # Test-user recall set: used to build the submission.
        test_user_recall_df = test_user_recall_df[[
            'user_id', 'item_id', 'itemcf_score'
        ]]

    # Sanity check: user overlap between the sampled sets and the click log.
    print(np.sum(train_data['user_id'].isin(click_df['user_id'])), ',',
          np.sum(click_df['user_id'].isin(train_data['user_id'])))

    # ---- feature 1: itemCF similarity score per (user, item) row ----
    time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print('itemCF相似度特征 start time:{}'.format(time_str))
    train_data['itemcf_score'] = np.nan
    train_data.loc[:, 'itemcf_score'] = train_data.apply(
        lambda x: sim_matrix[x['user_id']][x['item_id']]
        if sim_matrix.get(x['user_id']) is not None and sim_matrix.get(x[
            'user_id']).get(x['item_id']) is not None else np.nan,
        axis=1)
    print(train_data)
    valid_data['itemcf_score'] = np.nan
    valid_data.loc[:, 'itemcf_score'] = valid_data.apply(
        lambda x: sim_matrix[x['user_id']][x['item_id']]
        if sim_matrix.get(x['user_id']) is not None and sim_matrix.get(x[
            'user_id']).get(x['item_id']) is not None else np.nan,
        axis=1)

    # ---- feature 2: official user-item txt/img embedding similarity ----
    time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print('官方特征 start time:{}'.format(time_str))
    # click_df must cover exactly the union of users of the four sets.
    total_set_user = set(train_data['user_id']).union(
        set(valid_data['user_id'])).union(set(
            train_user_recall_df['user_id'])).union(
                set(test_user_recall_df['user_id']))
    assert (0 == len(set(click_df['user_id']).difference(total_set_user))
            and 0 == len(total_set_user.difference(set(click_df['user_id']))))

    user_features = get_user_features(click_df, item_info_df,
                                      item_txt_embedding_dim,
                                      item_img_embedding_dim)
    user_features_dict = transfer_user_features_df2dict(
        user_features, item_txt_embedding_dim)
    item_features_dict = transfer_item_features_df2dict(
        item_info_df, item_txt_embedding_dim)
    assert item_txt_embedding_dim == item_img_embedding_dim

    # Cache each data set as soon as its features are computed.
    train_data = cal_txt_img_sim(train_data, user_features_dict,
                                 item_features_dict, item_img_embedding_dim,
                                 process_num)
    if is_caching_features:
        print('正在缓存train_data')
        train_data.to_csv(
            './cache/features_cache/part0_train_features_phase_{}.csv'.format(
                phase),
            index=False)
    print(train_data)
    valid_data = cal_txt_img_sim(valid_data, user_features_dict,
                                 item_features_dict, item_img_embedding_dim,
                                 process_num)
    if is_caching_features:
        print('正在缓存valid_data')
        valid_data.to_csv(
            './cache/features_cache/part0_valid_features_phase_{}.csv'.format(
                phase),
            index=False)
    if is_open_train_recall:
        train_user_recall_df = cal_txt_img_sim(train_user_recall_df,
                                               user_features_dict,
                                               item_features_dict,
                                               item_img_embedding_dim,
                                               process_num)
        if is_caching_features:
            print('正在缓存train_user_recall_df')
            train_user_recall_df.to_csv(
                './cache/features_cache/part0_train_user_recall_features_phase_{}.csv'
                .format(phase),
                index=False)
    # The test recall set is always featurized (needed for submission);
    # original indentation was ambiguous — TODO confirm it was not meant to
    # sit inside the is_open_train_recall branch.
    test_user_recall_df = cal_txt_img_sim(test_user_recall_df,
                                          user_features_dict,
                                          item_features_dict,
                                          item_img_embedding_dim, process_num)
    if is_caching_features:
        print('正在缓存test_user_recall_df')
        test_user_recall_df.to_csv(
            './cache/features_cache/part0_test_user_recall_features_phase_{}.csv'
            .format(phase),
            index=False)

    # ---- feature 3: click-sequence embedding similarities ----
    # Only the user<->item similarity variants survived; the raw sequence
    # features were dropped by the original author.
    time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print('点击序embedding特征 start time:{}'.format(time_str))
    dict_embedding_all_ui_item, dict_embedding_all_ui_user = click_embedding(
        click_df, item_img_embedding_dim)

    train_data = cal_click_sim(train_data, dict_embedding_all_ui_item,
                               dict_embedding_all_ui_user, process_num)
    if is_caching_features:
        print('正在缓存train_data')
        train_data.to_csv(
            './cache/features_cache/part0_train_features_phase_{}.csv'.format(
                phase),
            index=False)
    valid_data = cal_click_sim(valid_data, dict_embedding_all_ui_item,
                               dict_embedding_all_ui_user, process_num)
    if is_caching_features:
        print('正在缓存valid_data')
        valid_data.to_csv(
            './cache/features_cache/part0_valid_features_phase_{}.csv'.format(
                phase),
            index=False)
    if is_open_train_recall:
        train_user_recall_df = cal_click_sim(train_user_recall_df,
                                             dict_embedding_all_ui_item,
                                             dict_embedding_all_ui_user,
                                             process_num)
        if is_caching_features:
            print('正在缓存train_user_recall_df')
            train_user_recall_df.to_csv(
                './cache/features_cache/part0_train_user_recall_features_phase_{}.csv'
                .format(phase),
                index=False)
    test_user_recall_df = cal_click_sim(test_user_recall_df,
                                        dict_embedding_all_ui_item,
                                        dict_embedding_all_ui_user,
                                        process_num)
    if is_caching_features:
        print('正在缓存test_user_recall_df')
        test_user_recall_df.to_csv(
            './cache/features_cache/part0_test_user_recall_features_phase_{}.csv'
            .format(phase),
            index=False)

    print(train_data.columns)
    print(train_data.iloc[:5, :])
    print(valid_data.iloc[:5, :])
    print(train_user_recall_df.iloc[:5, :])
    print(test_user_recall_df.iloc[:5, :])

    # A very large block of commented-out statistical features (click depth,
    # item-popularity stats, click-interval stats, user-item distance) was
    # removed here as dead code; the original author had already flagged it
    # with data-leak concerns. Recover it from VCS history if needed.

    # ---- post-processing and return ----
    if is_open_train_recall:
        train_data, valid_data, train_user_recall_df, test_user_recall_df = process_after_featuring(
            train_data, valid_data, train_user_recall_df, test_user_recall_df,
            is_open_train_recall)
    else:
        train_data, valid_data, test_user_recall_df = process_after_featuring(
            train_data, valid_data, None, test_user_recall_df,
            is_open_train_recall)
        train_user_recall_df = None

    print(train_data.iloc[:5, :])
    print(valid_data.iloc[:5, :])
    if is_open_train_recall:
        print(train_user_recall_df.iloc[:5, :])
    print(test_user_recall_df.iloc[:5, :])

    return train_data, valid_data, train_user_recall_df, test_user_recall_df