Example #1
def gen_cate_property_cvr(test_day, data):
    """
    生成test_day之前全部cate-property对的转化率
    """
    cate_prop_dict_path = cache_pkl_path + 'cate_prop_cvr_day_{0}_dict.pkl'.format(
        test_day)
    if os.path.exists(cate_prop_dict_path):
        print('found ' + cate_prop_dict_path)
        return load_pickle(cate_prop_dict_path)
    cate_prop_cvr = []
    if test_day != 18:  # day 18 is assumed to be the earliest day, with no history before it
        real_data = data
        real_data = real_data[real_data['day'] < test_day]
        trade_data = real_data[real_data['is_trade'] == 1]
        all_cate_prop_cnt = gen_sorted_cate_property(real_data)
        trade_cate_prop_cnt = gen_sorted_cate_property(trade_data)
        cate_prop_cvr = trade_cate_prop_cnt
        # Bayesian smoothing of the raw counts
        all_cate_df = pd.DataFrame(all_cate_prop_cnt,
                                   columns=['cate_prop', 'I'])
        trade_cate_df = pd.DataFrame(trade_cate_prop_cnt,
                                     columns=['cate_prop', 'C'])
        all_cate_df = all_cate_df.merge(trade_cate_df,
                                        on='cate_prop',
                                        how='outer')
        all_cate_df.fillna(0, inplace=True)

        hyper = BayesianSmoothing(1, 1)
        hyper.update(all_cate_df['I'].values, all_cate_df['C'].values, 100,
                     0.00001)
        alpha = hyper.alpha
        beta = hyper.beta
        all_cate_df['cate_prop_cvr_smooth'] = (all_cate_df['C'] + alpha) / (
            all_cate_df['I'] + alpha + beta)

        cate_prop_cvr = all_cate_df[['cate_prop',
                                     'cate_prop_cvr_smooth']].values


        # # Unsmoothed alternative:
        # all_cate_prop_cnt = dict(all_cate_prop_cnt)
        # for i, cate_prop in enumerate(cate_prop_cvr):
        #     cate_prop_cvr[i] = [cate_prop[0], 1.0 * cate_prop[1] / (all_cate_prop_cnt[cate_prop[0]] + 1)]
    dump_pickle(cate_prop_cvr, cate_prop_dict_path)  # cache the result so the existence check above can hit next time
    return cate_prop_cvr
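Every snippet on this page calls a BayesianSmoothing helper whose definition is not shown. Below is a minimal sketch, assuming the common fixed-point (digamma) formulation for fitting a Beta(alpha, beta) prior to per-key impression/conversion counts; the original project's class may differ in details, but the call signature update(I, C, iterations, epsilon) matches the usage above.

from scipy.special import digamma

class BayesianSmoothing(object):
    """Fit a Beta(alpha, beta) prior to per-key (impressions, conversions) counts."""
    def __init__(self, alpha, beta):
        self.alpha = alpha
        self.beta = beta

    def update(self, imps, clks, iter_num, epsilon):
        # Fixed-point iteration on the Beta-Binomial likelihood until convergence.
        for _ in range(iter_num):
            new_alpha, new_beta = self._fixed_point(imps, clks, self.alpha, self.beta)
            converged = (abs(new_alpha - self.alpha) < epsilon and
                         abs(new_beta - self.beta) < epsilon)
            self.alpha, self.beta = new_alpha, new_beta
            if converged:
                break

    def _fixed_point(self, imps, clks, alpha, beta):
        numerator_a = (digamma(clks + alpha) - digamma(alpha)).sum()
        numerator_b = (digamma(imps - clks + beta) - digamma(beta)).sum()
        denominator = (digamma(imps + alpha + beta) - digamma(alpha + beta)).sum()
        return alpha * numerator_a / denominator, beta * numerator_b / denominator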
Example #2
def gen_positionID_cvr_smooth(test_day):
    feature_path = feature_data_path + 'positionID_cvr_smooth_day_'+str(test_day)+'.pkl'
    if os.path.exists(feature_path):
        print('found '+feature_path)
    else:
        print('generating '+feature_path)
        data = load_pickle(raw_data_path+'train.pkl')
        data = addTime(data)
        positionID_cvr = data[data.clickDay < test_day]
        I = positionID_cvr.groupby('positionID')['label'].size().reset_index()
        I.columns = ['positionID', 'I']
        C = positionID_cvr.groupby('positionID')['label'].sum().reset_index()
        C.columns = ['positionID', 'C']
        positionID_cvr = pd.concat([I, C['C']], axis=1)
        hyper = BayesianSmoothing(1, 1)
        hyper.update(positionID_cvr['I'].values, positionID_cvr['C'].values, 10000, 0.00000001)
        alpha = hyper.alpha
        beta = hyper.beta
        positionID_cvr['positionID_cvr_smooth'] = (positionID_cvr['C'] + alpha) / (positionID_cvr['I'] + alpha + beta)
        dump_pickle(positionID_cvr[['positionID', 'positionID_cvr_smooth']],feature_path)
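For intuition, the smoothed value (C + alpha) / (I + alpha + beta) is the posterior-mean conversion rate under a Beta(alpha, beta) prior. A toy check with invented numbers (alpha and beta here are hypothetical fitted values, not outputs of the code above):

alpha, beta = 3.0, 97.0  # hypothetical prior; prior mean = alpha / (alpha + beta) = 0.03

# A positionID with 2 conversions in 4 impressions: the raw CVR of 0.50 is pure noise.
print((2 + alpha) / (4 + alpha + beta))        # ~0.048, shrunk hard toward the prior

# A positionID with 500 conversions in 10000 impressions: the data dominates.
print((500 + alpha) / (10000 + alpha + beta))  # ~0.050, close to the raw rate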
Example #3
def gen_shop_cvr_smooth(test_day, file_name='train'):
    '''
    Get each shop's smoothed search conversion rate on data before this day.
    '''
    data = load_pickle(path=raw_data_path + file_name + '.pkl')
    shop_cvr = data.loc[data.day<test_day,['shop_id', 'is_trade']]
    
    # click count and purchase count for each shop
    I = shop_cvr.groupby('shop_id')['is_trade'].size().reset_index()
    I.columns = ['shop_id', 'shop_I']
    C = shop_cvr.groupby('shop_id')['is_trade'].sum().reset_index()
    C.columns = ['shop_id', 'shop_C']
    shop_cvr = pd.concat([I, C['shop_C']], axis=1)
    # Bayesian smoothing
    hyper = BayesianSmoothing(1, 1)
    hyper.update(shop_cvr['shop_I'].values, shop_cvr['shop_C'].values, 100, 0.00001)
    alpha = hyper.alpha
    beta = hyper.beta
    shop_cvr['shop_cvr_smooth'] = (shop_cvr['shop_C'] + alpha) / (shop_cvr['shop_I'] + alpha + beta)
    
    return [shop_cvr, alpha, beta]
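Because gen_shop_cvr_smooth also returns alpha and beta, shops with no history before test_day can fall back to the prior mean alpha / (alpha + beta). A hedged usage sketch; test_df and its columns are assumptions, not part of the original code:

shop_cvr, alpha, beta = gen_shop_cvr_smooth(test_day=24)
cvr_map = shop_cvr.set_index('shop_id')['shop_cvr_smooth']

# Map the historical smoothed CVR onto test rows; unseen shops get the prior mean.
test_df['shop_cvr_smooth'] = test_df['shop_id'].map(cvr_map).fillna(alpha / (alpha + beta))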
Example #4
def gen_features_cross_smooth_ctr():
    '''
    Bayesian-smoothed version.
    For each day, using all earlier days, compute the total click count (I),
    total purchase count (C) and smoothed CTR for every (feature, feature2)
    pair drawn from the user-side and item-side lists below.
    Stored as ['day', feature, feature2, I_alias, C_alias, CTR_alias] in
    <feature>_<feature2>_smooth_CTR.pkl.
    '''
    all_data = load_pickle(raw_data_path + 'all_data.pkl')

    for feature in tqdm([
            'user_gender_id', 'user_age_level', 'user_occupation_id',
            'user_star_level'
    ]):

        for feature2 in tqdm([
                'item_id', 'item_brand_id', 'shop_id', 'item_price_level',
                'hour'
        ]):

            feature_path = feature_data_path + feature + '_' + feature2 + '_smooth_CTR.pkl'  # output path
            if os.path.exists(feature_path):
                print('found ' + feature_path)
            else:

                alpha_beta_display = []

                print('generating ' + feature_path)
                I_alias = feature + '_' + feature2 + '_smooth_I'  # total clicks
                C_alias = feature + '_' + feature2 + '_smooth_C'  # purchases
                CTR_alias = feature + '_' + feature2 + '_smooth_CTR'
                history_ctr = pd.DataFrame()
                for day in range(19, 26):

                    history_data = all_data[all_data['day'] < day]
                    I = history_data.groupby([
                        feature, feature2
                    ]).size().reset_index().rename(columns={0: I_alias})
                    C = history_data[history_data['is_trade'] == 1].groupby([
                        feature, feature2
                    ]).size().reset_index().rename(columns={0: C_alias})
                    CTR = pd.merge(I, C, how='left', on=[feature, feature2])
                    CTR[C_alias] = CTR[C_alias].fillna(0)

                    hyper = BayesianSmoothing(1, 1)
                    hyper.update(CTR[I_alias].values, CTR[C_alias].values,
                                 1000, 0.000001)
                    alpha = hyper.alpha
                    beta = hyper.beta

                    alpha_beta_display.append(alpha)
                    alpha_beta_display.append(beta)

                    print(feature)
                    print(alpha_beta_display)
                    # checkpoint the fitted priors after each day
                    dump_pickle(alpha_beta_display, feature_data_path + '1' +
                                feature + '_' + feature2 + '.pkl')

                    CTR[CTR_alias] = (CTR[C_alias] + alpha) / (CTR[I_alias] +
                                                               alpha + beta)
                    CTR['day'] = day
                    # DataFrame.append was removed in pandas 2.0; concat is the supported form
                    history_ctr = pd.concat([history_ctr, CTR], ignore_index=True)

                print('-' * 71)
                print(feature)
                print(alpha_beta_display)
                dump_pickle(alpha_beta_display, feature_data_path + '1' +
                            feature + '_' + feature2 + '.pkl')  # save the fitted priors
                print('-' * 71)
                dump_pickle(history_ctr[[
                    'day', feature, feature2, I_alias, C_alias, CTR_alias
                ]], feature_path)  # save the per-day smoothed CTR table
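The pickle written above keys the smoothed CTR by ['day', feature, feature2], so joining it back onto the raw data is a single left merge per pair. A short sketch, assuming the same load_pickle helper and an all_data frame that already carries the day column:

feature, feature2 = 'user_gender_id', 'item_id'
ctr = load_pickle(feature_data_path + feature + '_' + feature2 + '_smooth_CTR.pkl')

# Each row picks up the CTR computed from days strictly before its own day.
all_data = all_data.merge(ctr, how='left', on=['day', feature, feature2])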
Example #5
def gen_user_cvr_smooth(file_name='train'):
    '''
    For each id column (user_id, item_id, shop_id, second_cate), compute the
    smoothed conversion rate over the previous 1, 2, 3 and 4 days.
    '''
    cols = ['user_id', 'item_id', 'shop_id', 'second_cate']
    for col in cols:
        id_name = col
        id_I = col+'_I'
        id_C = col+'_C'
        
        data = load_pickle(path=raw_data_path + file_name + '.pkl')
        data = data[[id_name,'day','hour','is_trade']]
        
        user_cvr_all = pd.DataFrame()
        cvr_days = [1, 2, 3, 4]
        days = sorted(data.day.unique())  # set() has no guaranteed order; the indexing below needs sorted days
        if file_name == 'train':
            
            for cvr_day in cvr_days:
                col_cvr = id_name + '_' + str(cvr_day) + 'day_cvr'
                # CVR over the previous 1/2/3/4 days; rows without enough history stay -1
                data[col_cvr] = -1
                for now_day in days[cvr_day:]:
                    # rows with day in [now_day - cvr_day, now_day)
                    now_data = data.loc[(data.day < now_day) & (data.day >= (now_day - cvr_day)), :]
                    I = now_data.groupby(id_name)['is_trade'].size().reset_index()
                    I.columns = [id_name, id_I]
                    C = now_data.groupby(id_name)['is_trade'].sum().reset_index()
                    C.columns = [id_name, id_C]
                    user_cvr = pd.concat([I, C[id_C]], axis=1)
                    
                    # Bayesian smoothing; iterations and eta are assumed module-level constants
                    hyper = BayesianSmoothing(1, 1)
                    hyper.update(user_cvr[id_I].values, user_cvr[id_C].values, iterations, eta)
                    alpha = hyper.alpha
                    beta = hyper.beta
                    user_cvr[col_cvr] = (user_cvr[id_C] + alpha) / (user_cvr[id_I] + alpha + beta)
                    
                    user_cvr = user_cvr[[id_name, col_cvr]].set_index(id_name)[col_cvr]
                    
                    # attach the pre-today CVR to today's rows
                    filter_day = data.loc[data.day == now_day, id_name]
                    data.loc[data.day == now_day, col_cvr] = filter_day.apply(lambda x: user_cvr[x] if x in user_cvr.index else -1)
                # append this window's CVR column to the aggregate
                user_cvr_all = pd.concat([user_cvr_all, data[col_cvr]], axis=1)
                cvr_path = feature_data_path + file_name + id_name + 'cvr_day'
                dump_pickle(user_cvr_all, cvr_path)
        else:
            train_data = load_pickle(path=raw_data_path + 'train' + '.pkl')
            train_data = train_data[[id_name,'day','hour','is_trade']]
            for cvr_day in cvr_days:
                col_cvr = id_name + '_' + str(cvr_day) + 'day_cvr'
                # CVR over the previous 1/2/3/4 days; rows without enough history stay -1
                data[col_cvr] = -1
                now_day = days[0]

                # training rows with day in [now_day - cvr_day, now_day)
                now_data = train_data.loc[(train_data.day < now_day) & (train_data.day >= (now_day - cvr_day)), :]
                I = now_data.groupby(id_name)['is_trade'].size().reset_index()
                I.columns = [id_name, id_I]
                C = now_data.groupby(id_name)['is_trade'].sum().reset_index()
                C.columns = [id_name, id_C]
                user_cvr = pd.concat([I, C[id_C]], axis=1)
                
                # Bayesian smoothing
                hyper = BayesianSmoothing(1, 1)
                hyper.update(user_cvr[id_I].values, user_cvr[id_C].values, iterations, eta)
                alpha = hyper.alpha
                beta = hyper.beta
                user_cvr[col_cvr] = (user_cvr[id_C] + alpha) / (user_cvr[id_I] + alpha + beta)
                
                user_cvr = user_cvr[[id_name, col_cvr]].set_index(id_name)[col_cvr]
                filter_day = data.loc[data.day == now_day, id_name]
                # attach the pre-today CVR to today's rows
                data.loc[data.day == now_day, col_cvr] = filter_day.apply(lambda x: user_cvr[x] if x in user_cvr.index else -1)
                # append this window's CVR column to the aggregate
                user_cvr_all = pd.concat([user_cvr_all, data[col_cvr]], axis=1)
                cvr_path = feature_data_path + file_name + id_name + 'cvr_day'
                dump_pickle(user_cvr_all, cvr_path)
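The row-wise apply with an explicit index-membership test is the slow path here; Series.map performs the same keyed lookup vectorized and yields NaN for missing keys. A drop-in sketch for the assignment step above, reusing the function's own names:

# Equivalent to the apply(...) lookup, but vectorized:
mask = data.day == now_day
data.loc[mask, col_cvr] = data.loc[mask, id_name].map(user_cvr).fillna(-1)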
Example #6
def gen_cvr_smooth(file_name='train'):
    
    data = load_pickle(path=raw_data_path + file_name + '.pkl')
    cols = ['user_id', 'item_id', 'item_brand_id', 'second_cate', 'shop_id']
    
    if file_name == 'train':
        # for each feature column
        feat_all = None
        for col in cols:
            col_feat_all = None
            # for each day
            for day in data.day.unique():
                col_I = col + '_I'
                col_C = col + '_C'
                col_cvr_smooth = col + '_cvr_smooth'
                col_cvr_series = pd.Series(dtype=float)
                if day != data.day.min():
                    # rows strictly before this day
                    filter_data = data.loc[data.day < day, [col, 'is_trade']]
                    # click and purchase counts per value of col
                    I = filter_data.groupby(col)['is_trade'].size().reset_index()
                    I.columns = [col, col_I]
                    C = filter_data.groupby(col)['is_trade'].sum().reset_index()
                    C.columns = [col, col_C]
                    col_cvr = pd.concat([I, C[col_C]], axis=1)
                    # Bayesian smoothing
                    hyper = BayesianSmoothing(1, 1)
                    hyper.update(col_cvr[col_I].values, col_cvr[col_C].values, 100, 0.00001)
                    alpha = hyper.alpha
                    beta = hyper.beta
                    col_cvr[col_cvr_smooth] = (col_cvr[col_C] + alpha) / (col_cvr[col_I] + alpha + beta)
                    col_cvr_series = col_cvr[[col, col_cvr_smooth]].set_index(col)[col_cvr_smooth]

                # attach the pre-today CVR to today's rows
                col_feat = data.loc[data.day == day, ['instance_id', col]]
                col_feat[col_cvr_smooth] = col_feat.apply(lambda x: col_cvr_series[x[col]] if x[col] in col_cvr_series.index else -1, axis=1)
                col_feat_all = pd.concat([col_feat_all, col_feat], axis=0)
            # save
            feat_all = pd.concat([feat_all, col_feat_all[col_cvr_smooth]], axis=1)
            cvr_path = feature_data_path + 'train_cvr_smooth'
            dump_pickle(feat_all, cvr_path)
    else:
        train_data = load_pickle(path=raw_data_path + 'train' + '.pkl')
        # for each feature column
        feat_all = None
        for col in cols:
            col_I = col + '_I'
            col_C = col + '_C'
            col_cvr_smooth = col + '_cvr_smooth'

            # use the whole training set as history
            filter_data = train_data.loc[:, [col, 'is_trade']]
            # click and purchase counts per value of col
            I = filter_data.groupby(col)['is_trade'].size().reset_index()
            I.columns = [col, col_I]
            C = filter_data.groupby(col)['is_trade'].sum().reset_index()
            C.columns = [col, col_C]
            col_cvr = pd.concat([I, C[col_C]], axis=1)
            # Bayesian smoothing
            hyper = BayesianSmoothing(1, 1)
            hyper.update(col_cvr[col_I].values, col_cvr[col_C].values, 100, 0.00001)
            alpha = hyper.alpha
            beta = hyper.beta
            col_cvr[col_cvr_smooth] = (col_cvr[col_C] + alpha) / (col_cvr[col_I] + alpha + beta)
            col_cvr_series = col_cvr[[col, col_cvr_smooth]].set_index(col)[col_cvr_smooth]

            # attach the historical CVR to the test rows
            col_feat = data.loc[:, ['instance_id', col]]
            col_feat[col_cvr_smooth] = col_feat.apply(lambda x: col_cvr_series[x[col]] if x[col] in col_cvr_series.index else -1, axis=1)
            feat_all = pd.concat([feat_all, col_feat[col_cvr_smooth]], axis=1)
        # save
        cvr_path = feature_data_path + 'test_cvr_smooth'
        dump_pickle(feat_all, cvr_path)
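Each column of the saved frame keeps the original row index of data, so the features can line up with the raw frame via an index join. A usage sketch under that assumption:

train = load_pickle(raw_data_path + 'train.pkl')
cvr_feats = load_pickle(feature_data_path + 'train_cvr_smooth')

# Columns in cvr_feats retain the original row index, so join aligns them correctly.
train = train.join(cvr_feats)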
Example #7
def gen_item_cvr_smooth(test_day, file_name='train'):
    '''
    Get each item's conversion rate over the previous 3 hours, and, on historical
    data, its conversion rate within the hour buckets 0-7, 8-11, 12-13, 14-17,
    18-19 and 20-23.
    '''
    item_cvr_all = pd.DataFrame()
    id_name = 'item_id'
    id_I = id_name+'_I'
    id_C = id_name+'_C'
    
    data = load_pickle(path=raw_data_path + file_name + '.pkl')
    data = data[[id_name,'day','hour','is_trade']]
    data['day_hour'] = (data['day']-data.day.min()) * 24 + data['hour']
    
    hours = [[0,7],[8,11],[12,13],[14,17],[18,19],[20,23]]
    col_cvr = id_name + '_hour_cvr'
    data[col_cvr] = -1
    if file_name == 'train':
        for hour in hours:
            now_data = data.loc[(data.hour<=hour[1])&(data.hour>=(hour[0])), :]
            I = now_data.groupby(id_name)['is_trade'].size().reset_index()
            I.columns = [id_name, id_I]
            C = now_data.groupby(id_name)['is_trade'].sum().reset_index()
            C.columns = [id_name, id_C]
            item_cvr = pd.concat([I, C[id_C]], axis=1)
            # Bayesian smoothing
            hyper = BayesianSmoothing(1, 1)
            hyper.update(item_cvr[id_I].values, item_cvr[id_C].values, iterations, eta)
            alpha = hyper.alpha
            beta = hyper.beta
            
            item_cvr[col_cvr] = (item_cvr[id_C] + alpha) / (item_cvr[id_I] + alpha + beta)
            item_cvr = item_cvr[[id_name, col_cvr]].set_index(id_name)[col_cvr]
            filter_day = data.loc[(data.hour<=hour[1])&(data.hour>=(hour[0])), id_name]
            data.loc[(data.hour<=hour[1])&(data.hour>=(hour[0])),col_cvr] = filter_day.apply(lambda x: item_cvr[x] if x in item_cvr.index else -1)
        item_cvr_all = pd.concat([item_cvr_all, data[col_cvr]],axis=1)
        cvr_path = feature_data_path + file_name+ id_name + 'cvr_hour'
        dump_pickle(item_cvr_all, cvr_path)
    else:
        train_data = load_pickle(path=raw_data_path + 'train' + '.pkl')
        train_data = train_data[[id_name,'day','hour','is_trade']]
        for hour in hours:
            now_data = train_data.loc[(train_data.hour<=hour[1])&(train_data.hour>=(hour[0])), :]
            I = now_data.groupby(id_name)['is_trade'].size().reset_index()
            I.columns = [id_name, id_I]
            C = now_data.groupby(id_name)['is_trade'].sum().reset_index()
            C.columns = [id_name, id_C]
            item_cvr = pd.concat([I, C[id_C]], axis=1)
            # Bayesian smoothing
            hyper = BayesianSmoothing(1, 1)
            hyper.update(item_cvr[id_I].values, item_cvr[id_C].values, iterations, eta)
            alpha = hyper.alpha
            beta = hyper.beta
            
            item_cvr[col_cvr] = (item_cvr[id_C] + alpha) / (item_cvr[id_I] + alpha + beta)
            item_cvr = item_cvr[[id_name, col_cvr]].set_index(id_name)[col_cvr]
            filter_day = data.loc[(data.hour<=hour[1])&(data.hour>=(hour[0])), id_name]
            data.loc[(data.hour<=hour[1])&(data.hour>=(hour[0])),col_cvr] = filter_day.apply(lambda x: item_cvr[x] if x in item_cvr.index else -1)
        item_cvr_all = pd.concat([item_cvr_all, data[col_cvr]],axis=1)
        cvr_path = feature_data_path + file_name+ id_name + 'cvr_hour'
        dump_pickle(item_cvr_all, cvr_path)
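The six hour windows could also be expressed once with pd.cut instead of six boolean masks per branch; a purely illustrative sketch of the equivalent bucketing:

import pandas as pd

bins = [-1, 7, 11, 13, 17, 19, 23]  # right-closed edges: (-1,7], (7,11], ...
labels = ['0_7', '8_11', '12_13', '14_17', '18_19', '20_23']
data['hour_bucket'] = pd.cut(data['hour'], bins=bins, labels=labels)
# A single groupby([id_name, 'hour_bucket']) then replaces the loop over `hours`.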