def gen_cate_property_cvr(test_day, data):
    """Build the conversion rate of every cate-property pair seen before `test_day`.

    Counts are Bayes-smoothed; the result is cached to disk and the cache is
    reused on later calls.

    Parameters
    ----------
    test_day : int
        Only rows with day < test_day are used. When test_day == 18 there is
        no earlier history, so an empty list is returned.
    data : pandas.DataFrame
        Must contain at least the 'day' and 'is_trade' columns.

    Returns
    -------
    Rows of [cate_prop, smoothed_cvr] (numpy array when history exists,
    otherwise an empty list).
    """
    cate_prop_dict_path = cache_pkl_path + 'cate_prop_cvr_day_{0}_dict.pkl'.format(
        test_day)
    if os.path.exists(cate_prop_dict_path):
        print('found ' + cate_prop_dict_path)
        return load_pickle(cate_prop_dict_path)
    cate_prop_cvr = []
    if test_day != 18:  # day 18 is the first day: no history to aggregate
        real_data = data[data['day'] < test_day]
        trade_data = real_data[real_data['is_trade'] == 1]
        all_cate_prop_cnt = gen_sorted_cate_property(real_data)
        trade_cate_prop_cnt = gen_sorted_cate_property(trade_data)
        # Bayesian smoothing of trade count C over impression count I.
        all_cate_df = pd.DataFrame(all_cate_prop_cnt, columns=['cate_prop', 'I'])
        trade_cate_df = pd.DataFrame(trade_cate_prop_cnt,
                                     columns=['cate_prop', 'C'])
        all_cate_df = all_cate_df.merge(trade_cate_df, on='cate_prop',
                                        how='outer')
        all_cate_df.fillna(0, inplace=True)
        hyper = BayesianSmoothing(1, 1)
        hyper.update(all_cate_df['I'].values, all_cate_df['C'].values, 100,
                     0.00001)
        alpha = hyper.alpha
        beta = hyper.beta
        all_cate_df['cate_prop_cvr_smooth'] = (all_cate_df['C'] + alpha) / (
            all_cate_df['I'] + alpha + beta)
        cate_prop_cvr = all_cate_df[['cate_prop', 'cate_prop_cvr_smooth']].values
    # BUG FIX: the cache file was checked above but never written, so the
    # expensive computation re-ran on every call. Persist the result here.
    dump_pickle(cate_prop_cvr, cate_prop_dict_path)
    return cate_prop_cvr
def gen_positionID_cvr_smooth(test_day):
    """Build and cache the Bayes-smoothed CVR per positionID.

    Uses only clicks strictly before `test_day` and writes
    positionID_cvr_smooth_day_<test_day>.pkl; does nothing if the file
    already exists.
    """
    feature_path = feature_data_path + 'positionID_cvr_smooth_day_' + str(test_day) + '.pkl'
    if os.path.exists(feature_path):
        print('found ' + feature_path)
    else:
        print('generating ' + feature_path)
        data = addTime(load_pickle(raw_data_path + 'train.pkl'))
        history = data[data.clickDay < test_day]
        # Impressions (group size) and conversions (label sum) per position.
        by_pos = history.groupby('positionID')['label']
        I = by_pos.size().reset_index()
        I.columns = ['positionID', 'I']
        C = by_pos.sum().reset_index()
        C.columns = ['positionID', 'C']
        positionID_cvr = pd.concat([I, C['C']], axis=1)
        # Fit a Beta(alpha, beta) prior on the (I, C) counts.
        smoother = BayesianSmoothing(1, 1)
        smoother.update(positionID_cvr['I'].values, positionID_cvr['C'].values,
                        10000, 0.00000001)
        alpha, beta = smoother.alpha, smoother.beta
        positionID_cvr['positionID_cvr_smooth'] = (
            (positionID_cvr['C'] + alpha) / (positionID_cvr['I'] + alpha + beta))
        dump_pickle(positionID_cvr[['positionID', 'positionID_cvr_smooth']],
                    feature_path)
def gen_shop_cvr_smooth(test_day, file_name='train'):
    '''Bayes-smoothed search-to-trade conversion rate per shop.

    Only records strictly before `test_day` are aggregated.

    Returns
    -------
    [shop_cvr, alpha, beta] where shop_cvr is a DataFrame with the
    'shop_cvr_smooth' column and alpha/beta are the fitted prior parameters.
    '''
    data = load_pickle(path=raw_data_path + file_name + '.pkl')
    history = data.loc[data.day < test_day, ['shop_id', 'is_trade']]
    # Clicks (group size) and trades (label sum) per shop.
    by_shop = history.groupby('shop_id')['is_trade']
    I = by_shop.size().reset_index()
    I.columns = ['shop_id', 'shop_I']
    C = by_shop.sum().reset_index()
    C.columns = ['shop_id', 'shop_C']
    shop_cvr = pd.concat([I, C['shop_C']], axis=1)
    # Fit a Beta(alpha, beta) prior on the click/trade counts.
    smoother = BayesianSmoothing(1, 1)
    smoother.update(shop_cvr['shop_I'].values, shop_cvr['shop_C'].values,
                    100, 0.00001)
    alpha, beta = smoother.alpha, smoother.beta
    shop_cvr['shop_cvr_smooth'] = (shop_cvr['shop_C'] + alpha) / (
        shop_cvr['shop_I'] + alpha + beta)
    return [shop_cvr, alpha, beta]
def gen_features_cross_smooth_ctr():
    '''Bayes-smoothed CTR for crossed feature pairs.

    For every (user feature, item feature) pair, and for each day in 19..25,
    aggregates all earlier days into total clicks I, total trades C and the
    smoothed CTR, then stores the stacked result as
    ['day', feature, feature2, I_alias, C_alias, CTR_alias] in
    <feature>_<feature2>_smooth_CTR.pkl. The fitted alpha/beta per day are
    saved to a companion '1<feature>_<feature2>.pkl' file.
    '''
    all_data = load_pickle(raw_data_path + 'all_data.pkl')
    for feature in tqdm([
            'user_gender_id', 'user_age_level', 'user_occupation_id',
            'user_star_level'
    ]):
        for feature2 in tqdm([
                'item_id', 'item_brand_id', 'shop_id', 'item_price_level',
                'hour'
        ]):
            feature_path = (feature_data_path + feature + '_' + feature2 +
                            '_smooth_CTR.pkl')  # output location
            if os.path.exists(feature_path):
                print('found ' + feature_path)
                continue
            print('generating ' + feature_path)
            alpha_beta_display = []  # fitted (alpha, beta) pairs, one per day
            I_alias = feature + '_' + feature2 + '_smooth_I'    # total clicks
            C_alias = feature + '_' + feature2 + '_smooth_C'    # total trades
            CTR_alias = feature + '_' + feature2 + '_smooth_CTR'
            daily_frames = []
            for day in range(19, 26):
                history_data = all_data[all_data['day'] < day]
                I = history_data.groupby([
                    feature, feature2
                ]).size().reset_index().rename(columns={0: I_alias})
                C = history_data[history_data['is_trade'] == 1].groupby([
                    feature, feature2
                ]).size().reset_index().rename(columns={0: C_alias})
                CTR = pd.merge(I, C, how='left', on=[feature, feature2])
                # Pairs never traded have NaN trade counts.
                CTR[C_alias] = CTR[C_alias].fillna(0)
                hyper = BayesianSmoothing(1, 1)
                hyper.update(CTR[I_alias].values, CTR[C_alias].values, 1000,
                             0.000001)
                alpha = hyper.alpha
                beta = hyper.beta
                alpha_beta_display.append(alpha)
                alpha_beta_display.append(beta)
                CTR[CTR_alias] = (CTR[C_alias] + alpha) / (CTR[I_alias] +
                                                           alpha + beta)
                CTR['day'] = day
                daily_frames.append(CTR)
            # FIX: DataFrame.append was removed in pandas 2.0 — use pd.concat.
            # The per-day dump/print of alpha_beta_display was redundant: it
            # rewrote the same file every iteration; one dump after the loop
            # leaves the identical final file.
            history_ctr = pd.concat(daily_frames)
            print(feature)
            print(alpha_beta_display)
            dump_pickle(
                alpha_beta_display,
                feature_data_path + '1' + feature + '_' + feature2 + '.pkl')
            print(
                '-----------------------------------------------------------------------'
            )
            dump_pickle(history_ctr[[
                'day', feature, feature2, I_alias, C_alias, CTR_alias
            ]], feature_path)
def gen_user_cvr_smooth(file_name='train'):
    '''Smoothed conversion rate over the previous 1/2/3/4 days per id column.

    For each id in ['user_id', 'item_id', 'shop_id', 'second_cate'] and each
    window length d in 1..4, every row of day D gets the Bayes-smoothed CVR
    of its id computed over days [D-d, D); rows with no history get -1.
    For a non-train file the history always comes from the train set and only
    the first day of `file_name`'s data is filled. One pickle per id column
    is written to feature_data_path.
    '''
    cols = ['user_id', 'item_id', 'shop_id', 'second_cate']
    for id_name in cols:
        id_I = id_name + '_I'
        id_C = id_name + '_C'
        data = load_pickle(path=raw_data_path + file_name + '.pkl')
        data = data[[id_name, 'day', 'hour', 'is_trade']]
        user_cvr_all = pd.DataFrame()
        cvr_days = [1, 2, 3, 4]
        # BUG FIX: list(set(...)) has no guaranteed order, but days[cvr_day:]
        # and days[0] below require ascending chronological order.
        days = sorted(set(data.day))
        if file_name == 'train':
            for cvr_day in cvr_days:
                col_cvr = id_name + '_' + str(cvr_day) + 'day_cvr'
                # Rows whose history window is incomplete keep -1.
                data[col_cvr] = -1
                for now_day in days[cvr_day:]:
                    # History window [now_day - cvr_day, now_day).
                    window = data.loc[(data.day < now_day) &
                                      (data.day >= now_day - cvr_day), :]
                    cvr_map = _smoothed_cvr_map(window, id_name, id_I, id_C,
                                                col_cvr)
                    keys_today = data.loc[data.day == now_day, id_name]
                    data.loc[data.day == now_day, col_cvr] = keys_today.apply(
                        lambda x: cvr_map[x] if x in cvr_map.index else -1)
                user_cvr_all = pd.concat([user_cvr_all, data[col_cvr]], axis=1)
            cvr_path = feature_data_path + file_name + id_name + 'cvr_day'
            dump_pickle(user_cvr_all, cvr_path)
        else:
            # Test data: history always comes from the train set.
            train_data = load_pickle(path=raw_data_path + 'train' + '.pkl')
            train_data = train_data[[id_name, 'day', 'hour', 'is_trade']]
            for cvr_day in cvr_days:
                col_cvr = id_name + '_' + str(cvr_day) + 'day_cvr'
                data[col_cvr] = -1
                now_day = days[0]
                window = train_data.loc[(train_data.day < now_day) &
                                        (train_data.day >= now_day - cvr_day), :]
                cvr_map = _smoothed_cvr_map(window, id_name, id_I, id_C,
                                            col_cvr)
                keys_today = data.loc[data.day == now_day, id_name]
                data.loc[data.day == now_day, col_cvr] = keys_today.apply(
                    lambda x: cvr_map[x] if x in cvr_map.index else -1)
                user_cvr_all = pd.concat([user_cvr_all, data[col_cvr]], axis=1)
            cvr_path = feature_data_path + file_name + id_name + 'cvr_day'
            dump_pickle(user_cvr_all, cvr_path)


def _smoothed_cvr_map(window, id_name, id_I, id_C, col_cvr):
    '''Group `window` by `id_name` and return a Series id -> smoothed CVR.'''
    I = window.groupby(id_name)['is_trade'].size().reset_index()
    I.columns = [id_name, id_I]
    C = window.groupby(id_name)['is_trade'].sum().reset_index()
    C.columns = [id_name, id_C]
    cvr = pd.concat([I, C[id_C]], axis=1)
    # Bayesian smoothing; `iterations` and `eta` are module-level settings.
    hyper = BayesianSmoothing(1, 1)
    hyper.update(cvr[id_I].values, cvr[id_C].values, iterations, eta)
    cvr[col_cvr] = (cvr[id_C] + hyper.alpha) / (cvr[id_I] + hyper.alpha +
                                                hyper.beta)
    return cvr[[id_name, col_cvr]].set_index(id_name)[col_cvr]
def gen_cvr_smooth(file_name='train'):
    # Historical Bayes-smoothed CVR per id column.
    # Train: for every day, each row gets the smoothed CVR of its id computed
    # from all strictly earlier days (-1 when the id is unseen or on the first
    # day). Test: the whole train set is the history for every row.
    # Output: one column '<col>_cvr_smooth' per id, pickled to
    # 'train_cvr_smooth' / 'test_cvr_smooth'.
    data = load_pickle(path=raw_data_path + file_name + '.pkl')
    cols = ['user_id', 'item_id', 'item_brand_id', 'second_cate', 'shop_id']
    if file_name == 'train':
        # For each feature column.
        feat_all = None
        for col in cols:
            # For each day.
            col_feat_all = None
            for day in (data.day).unique():
                # Keep only data from before this day.
                col_cvr_dict = dict()  # NOTE(review): leftover, unused
                col_I = col + '_I'
                col_C = col + '_C'
                col_cvr_smooth = col + '_cvr_smooth'
                # print(day)
                # Empty fallback so the apply below yields -1 on the first day.
                col_cvr_series = pd.Series()
                if day != data.day.min():
                    filter_data = data.loc[data.day < day, [col, 'is_trade']]
                    # Compute conversion rate: impressions I and trades C.
                    I = filter_data.groupby(col)['is_trade'].size().reset_index()
                    I.columns = [col, col_I]
                    C = filter_data.groupby(col)['is_trade'].sum().reset_index()
                    C.columns = [col, col_C]
                    col_cvr = pd.concat([I, C[col_C]], axis=1)
                    # Bayesian smoothing.
                    hyper = BayesianSmoothing(1, 1)
                    hyper.update(col_cvr[col_I].values, col_cvr[col_C].values,
                                 100, 0.00001)
                    alpha = hyper.alpha
                    beta = hyper.beta
                    col_cvr[col_cvr_smooth] = (col_cvr[col_C] + alpha) / (
                        col_cvr[col_I] + alpha + beta)
                    # col_cvr_dict = dict(col_cvr[[col, col_cvr_smooth]].values)
                    col_cvr_series = col_cvr[[col, col_cvr_smooth]].set_index(col)[col_cvr_smooth]
                # Attach yesterday's CVR to today's rows; unseen ids get -1.
                col_feat = data.loc[data.day == day, ['instance_id', col]]
                # col_feat[col_cvr_smooth] = col_feat.apply(lambda x: col_cvr_dict[x[col]] if x[col] in col_cvr_dict.keys() else -1, axis=1)
                col_feat[col_cvr_smooth] = col_feat.apply(
                    lambda x: col_cvr_series[x[col]]
                    if x[col] in col_cvr_series.index else -1, axis=1)
                col_feat_all = pd.concat([col_feat_all, col_feat], axis=0)
            # Collect this column; pd.concat aligns rows by index.
            feat_all = pd.concat([feat_all, col_feat_all[col_cvr_smooth]], axis=1)
        cvr_path = feature_data_path + 'train_cvr_smooth'
        dump_pickle(feat_all, cvr_path)
    else:
        # Test: history is the entire train set.
        train_data = load_pickle(path=raw_data_path + 'train' + '.pkl')
        # For each feature column.
        feat_all = None
        for col in cols:
            # Keep only data from before this day (here: all of train).
            col_cvr_dict = dict()  # NOTE(review): leftover, unused
            col_I = col + '_I'
            col_C = col + '_C'
            col_cvr_smooth = col + '_cvr_smooth'
            filter_data = train_data.loc[:, [col, 'is_trade']]
            # Compute conversion rate.
            I = filter_data.groupby(col)['is_trade'].size().reset_index()
            I.columns = [col, col_I]
            C = filter_data.groupby(col)['is_trade'].sum().reset_index()
            C.columns = [col, col_C]
            col_cvr = pd.concat([I, C[col_C]], axis=1)
            # Bayesian smoothing.
            hyper = BayesianSmoothing(1, 1)
            hyper.update(col_cvr[col_I].values, col_cvr[col_C].values, 100,
                         0.00001)
            alpha = hyper.alpha
            beta = hyper.beta
            col_cvr[col_cvr_smooth] = (col_cvr[col_C] + alpha) / (
                col_cvr[col_I] + alpha + beta)
            # col_cvr_dict = dict(col_cvr[[col, col_cvr_smooth]].values)
            col_cvr_series = col_cvr[[col, col_cvr_smooth]].set_index(col)[col_cvr_smooth]
            # Attach train CVR to the test rows; unseen ids get -1.
            col_feat = data.loc[:, ['instance_id', col]]
            # col_feat[col_cvr_smooth] = col_feat.apply(lambda x: col_cvr_dict[x[col]] if x[col] in col_cvr_dict.keys() else -1, axis=1)
            col_feat[col_cvr_smooth] = col_feat.apply(
                lambda x: col_cvr_series[x[col]]
                if x[col] in col_cvr_series.index else -1, axis=1)
            feat_all = pd.concat([feat_all, col_feat[col_cvr_smooth]], axis=1)
        # Persist.
        cvr_path = feature_data_path + 'test_cvr_smooth'
        dump_pickle(feat_all, cvr_path)
def gen_item_cvr_smooth(test_day, file_name='train'):
    '''Smoothed per-item conversion rate by hour-of-day bucket.

    For each hour bucket (0-7, 8-11, 12-13, 14-17, 18-19, 20-23) every row in
    that bucket gets the Bayes-smoothed CVR of its item_id computed over the
    same bucket (from the train set when file_name is not 'train'); unseen
    items get -1. Result is pickled to '<file_name>item_idcvr_hour'.

    NOTE(review): `test_day` is never used; it is kept only for signature
    compatibility with callers. The original docstring mentioned "the previous
    3 hours", which does not match what the code computes — confirm intent.
    '''
    item_cvr_all = pd.DataFrame()
    id_name = 'item_id'
    id_I = id_name + '_I'
    id_C = id_name + '_C'
    data = load_pickle(path=raw_data_path + file_name + '.pkl')
    data = data[[id_name, 'day', 'hour', 'is_trade']]
    # (Removed a dead 'day_hour' column computation: it was never read and
    # never written to the output.)
    hours = [[0, 7], [8, 11], [12, 13], [14, 17], [18, 19], [20, 23]]
    col_cvr = id_name + '_hour_cvr'
    data[col_cvr] = -1
    if file_name == 'train':
        stats_source = data
    else:
        # Test data: compute the per-bucket statistics from the train set.
        stats_source = load_pickle(path=raw_data_path + 'train' + '.pkl')
        stats_source = stats_source[[id_name, 'day', 'hour', 'is_trade']]
    for lo, hi in hours:
        bucket = stats_source.loc[(stats_source.hour <= hi) &
                                  (stats_source.hour >= lo), :]
        item_cvr = _item_bucket_cvr(bucket, id_name, id_I, id_C, col_cvr)
        # Fill this bucket's rows of `data` from the smoothed map.
        mask = (data.hour <= hi) & (data.hour >= lo)
        keys = data.loc[mask, id_name]
        data.loc[mask, col_cvr] = keys.apply(
            lambda x: item_cvr[x] if x in item_cvr.index else -1)
        # Preserved quirk of the original: the full (progressively filled)
        # column is appended once per bucket, yielding six columns.
        item_cvr_all = pd.concat([item_cvr_all, data[col_cvr]], axis=1)
    cvr_path = feature_data_path + file_name + id_name + 'cvr_hour'
    dump_pickle(item_cvr_all, cvr_path)


def _item_bucket_cvr(bucket, id_name, id_I, id_C, col_cvr):
    '''Group `bucket` by item id and return a Series id -> smoothed CVR.'''
    I = bucket.groupby(id_name)['is_trade'].size().reset_index()
    I.columns = [id_name, id_I]
    C = bucket.groupby(id_name)['is_trade'].sum().reset_index()
    C.columns = [id_name, id_C]
    cvr = pd.concat([I, C[id_C]], axis=1)
    # Bayesian smoothing; `iterations` and `eta` are module-level settings.
    hyper = BayesianSmoothing(1, 1)
    hyper.update(cvr[id_I].values, cvr[id_C].values, iterations, eta)
    cvr[col_cvr] = (cvr[id_C] + hyper.alpha) / (cvr[id_I] + hyper.alpha +
                                                hyper.beta)
    return cvr[[id_name, col_cvr]].set_index(id_name)[col_cvr]