Example no. 1
def gen_tricks(start_day, end_day):
    '''
    Generate the trick, first_diff, last_diff and install2click features, joined on global_index.
    :param start_day:
    :param end_day:
    :return:
    '''
    train_data = load_pickle(raw_data_path + 'train.pkl')
    test_data = load_pickle(raw_data_path + 'test.pkl')
    actions = load_pickle(raw_data_path + 'user_app_actions.pkl')
    data = train_data.append(test_data)
    del train_data, test_data
    data = addTime(data)
    data = addAd(data)

    for day in tqdm(np.arange(start_day, end_day + 1)):
        feature_path = feature_data_path + 'tricks_day_' + str(day) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            df = data.loc[data.clickDay == day]
            df = add_trick(df)
            df = add_diff(df)
            df = add_install2click(df, day, actions)
            dump_pickle(
                df[[
                    'global_index', 'trick', 'first_diff', 'last_diff',
                    'install2click'
                ]], feature_path)
def gen_user_hour_click_count(update=False):
    '''
    Generate per-day, per-hour click counts over all the data.
    Join keys ['ID_name', 'clickDay', 'clickHour']
    :param update:
    :return:
    '''
    train = load_pickle(raw_data_path + 'train.pkl')
    test = load_pickle(raw_data_path + 'test.pkl')
    data = train.append(test)
    data = addTime(data)
    data = addAd(data)
    data = addPosition(data)
    data = addAppCategories(data)

    ads_feature = [
        'advertiserID', 'camgaignID', 'adID', 'creativeID', 'appID',
        'appCategory'
    ]
    context_feature = ['positionID', 'sitesetID']
    state_feature = ads_feature + context_feature

    for feature in tqdm(state_feature):
        feature_path = feature_data_path + 'user_' + feature + '_click_hour.pkl'
        if os.path.exists(feature_path):
            print('Found ' + feature_path)
        else:
            print('Generating ' + feature_path)
            user_feature_click_day = data.groupby(
                ['userID', 'clickDay', 'clickHour',
                 feature]).size().reset_index().rename(
                     columns={0: 'user_' + feature + '_click_hour'})
            dump_pickle(user_feature_click_day, feature_path)
Example no. 3
def gen_positionID_cvr_smooth(test_day):
    """
    * 新学trick:对positionID的在个别时间窗口内的ctr进行贝叶斯平滑处理
    """

    feature_path = feature_data_path + 'positionID_cvr_smooth_day_' + str(
        test_day) + '.pkl'
    if os.path.exists(feature_path):
        print('found ' + feature_path)
    else:
        print('generating ' + feature_path)
        data = load_pickle(raw_data_path + 'train.pkl')
        data = addTime(data)
        positionID_cvr = data[data.clickDay < test_day]
        I = positionID_cvr.groupby('positionID')['label'].size().reset_index()
        # size() behaves like count(): it tallies the rows, i.e. the impressions
        I.columns = ['positionID', 'I']
        C = positionID_cvr.groupby('positionID')['label'].sum().reset_index()
        # a click carries label 1, so sum() is equivalent to the total click count
        C.columns = ['positionID', 'C']
        positionID_cvr = pd.concat([I, C['C']], axis=1)
        hyper = BayesianSmoothing(1, 1)
        hyper.update(positionID_cvr['I'].values, positionID_cvr['C'].values,
                     10000, 0.00000001)  # impressions, clicks, and the fitting parameters
        alpha = hyper.alpha
        beta = hyper.beta
        # apply Bayesian smoothing using the fitted alpha and beta
        positionID_cvr['positionID_cvr_smooth'] = (
            positionID_cvr['C'] + alpha) / (positionID_cvr['I'] + alpha + beta)
        dump_pickle(positionID_cvr[['positionID', 'positionID_cvr_smooth']],
                    feature_path)
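The BayesianSmoothing class is not defined anywhere in these examples. Below is a minimal sketch with the same constructor and update(imps, clks, iter_num, epsilon) call used above, assuming the usual fixed-point iteration for a Beta-Binomial model; the internals are an assumption, not the original implementation.

import numpy as np
from scipy.special import digamma

class BayesianSmoothing:
    # Sketch: fit a Beta(alpha, beta) prior over the per-position CVR.
    def __init__(self, alpha, beta):
        self.alpha = alpha
        self.beta = beta

    def update(self, imps, clks, iter_num, epsilon):
        imps = np.asarray(imps, dtype=np.float64)  # impressions (I)
        clks = np.asarray(clks, dtype=np.float64)  # clicks/conversions (C)
        for _ in range(iter_num):
            # fixed-point update for the Beta-Binomial likelihood
            num_a = np.sum(digamma(clks + self.alpha) - digamma(self.alpha))
            num_b = np.sum(digamma(imps - clks + self.beta) - digamma(self.beta))
            den = np.sum(digamma(imps + self.alpha + self.beta)
                         - digamma(self.alpha + self.beta))
            new_alpha = self.alpha * num_a / den
            new_beta = self.beta * num_b / den
            converged = (abs(new_alpha - self.alpha) < epsilon
                         and abs(new_beta - self.beta) < epsilon)
            self.alpha, self.beta = new_alpha, new_beta
            if converged:
                break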
Example no. 4
def gen_ID_global_sum_count(
        last_day=27,
        stats_features=['positionID', 'creativeID', 'appID', 'adID',
                        'userID']):
    train = load_pickle(raw_data_path + 'train.pkl')
    test = load_pickle(raw_data_path + 'test.pkl')
    data = train.append(test)
    data = addTime(data)
    data = data[data.clickDay <= last_day]
    del train, test
    gc.collect()
    data = addAd(data)
    data = addPosition(data)
    data = addAppCategories(data)

    for feature in tqdm(stats_features):
        feature_path = feature_data_path + 'global_count_' + feature + '_lastday' + str(
            last_day) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
            continue
        print('generating ' + feature_path)
        """
        数据来自:'train.pkl和test.pkl,表示是关于展示和点击的数据
        聚合的算子的size(),表示统计的是展示量
        """
        feature_count_sum = pd.DataFrame(
            data.groupby(feature).size()).reset_index().rename(
                columns={0: feature + '_sum_count'})
        dump_pickle(feature_count_sum, feature_path)
def gen_user_start_installed_cateA():
    '''
    Count how many apps of each top-level category the user initially had installed.
    Join key ['userID']
    :return:
    '''
    user_install = load_pickle(raw_data_path + 'user_installedapps.pkl')
    app_cate = pd.read_csv(raw_data_path + 'app_categories.csv')
    app_cate['cate_a'] = app_cate.appCategory.apply(lambda x: x // 100
                                                    if x > 100 else x)
    user_install = user_install.merge(app_cate, on='appID', how='left')
    for cate_a in tqdm(app_cate.cate_a.unique()):
        feature_path = feature_data_path + 'user_start_installed_cate_' + str(
            cate_a) + '.pkl'
        if os.path.exists(feature_path):
            print('Found ' + feature_path)
        else:
            print('Generating ' + feature_path)
            user_install_cate = user_install[user_install.cate_a == cate_a][[
                'userID', 'cate_a'
            ]]
            user_install_cate.rename(
                columns={'cate_a': 'user_start_install_cate_' + str(cate_a)},
                inplace=True)
            user_install_cate = user_install_cate.groupby(
                'userID', as_index=False).sum()
            dump_pickle(user_install_cate, feature_path)
Example no. 7
def gen_user_group_install():
    user_install = load_pickle(raw_data_path + 'user_installedapps.pkl')
    user_info = load_pickle(raw_data_path + 'user.pkl')
    # bucket age; fold education level 7 into 6
    user_info['age_cut_small'] = pd.cut(user_info['age'],
                                        bins=[-1, 0, 18, 25, 35, 45, 55, np.inf],
                                        labels=False)
    user_info['education_new'] = user_info['education']
    user_info.loc[user_info.education_new == 7, 'education_new'] = 6
    # number every (age, gender, education) combination as a user_group
    user_info_comb = user_info[['age_cut_small', 'gender',
                                'education_new']].drop_duplicates()
    user_info_comb['user_group'] = np.arange(0, user_info_comb.shape[0])
    user_info = pd.merge(user_info, user_info_comb, 'left',
                         ['age_cut_small', 'gender', 'education_new'])
    user_install = pd.merge(user_install,
                            user_info[['userID', 'user_group', 'age_cut_small',
                                       'gender', 'education_new']],
                            'left', 'userID')

    def update_dict(row, dic):
        dic[row['appID']] += 1

    user_group_install = None
    for i, u_g in tqdm(enumerate(user_install.user_group.unique())):
        sub_install = user_install[user_install.user_group == u_g]
        # one row per group: app install counts plus the group's demographics
        install_dict = dict((k, 0) for k in user_install.appID.unique())
        install_dict['user_group'] = u_g
        install_dict['age_cut_small'] = sub_install['age_cut_small'].iloc[0]
        install_dict['gender'] = sub_install['gender'].iloc[0]
        install_dict['education_new'] = sub_install['education_new'].iloc[0]
        sub_install.apply(update_dict, args=(install_dict,), axis=1)
        if user_group_install is None:
            user_group_install = pd.DataFrame(install_dict, index=[i])
        else:
            user_group_install = pd.concat(
                [user_group_install, pd.DataFrame(install_dict, index=[i])])
    dump_pickle(user_group_install,
                feature_data_path + 'user_group_install.pkl')
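The apply/update_dict loop above fills the user_group x appID count matrix one row at a time. A sketch of an equivalent, usually much faster formulation with pd.crosstab follows; gen_user_group_install_fast is a hypothetical name, and it assumes user_install already carries the merged user_group and demographic columns as above.

import pandas as pd

def gen_user_group_install_fast(user_install):
    # one row per user_group, one column per appID, cells = install counts
    counts = pd.crosstab(user_install['user_group'],
                         user_install['appID']).reset_index()
    # re-attach the per-group demographics (constant within a group)
    demo = user_install.drop_duplicates('user_group')[
        ['user_group', 'age_cut_small', 'gender', 'education_new']]
    return counts.merge(demo, on='user_group', how='left')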
Example no. 8
def gen_CountVector_appCategory_user_action_hour():
    '''
    Join key ['appCategory']
    :return:
    '''
    """
    **新学的trick:处理时间戳的高效方式
    - 保证时间戳的位数为 10 
    - day级别的快速运算 :data['day'] = data['clickTime'] // 1000000
    - hour级别的快速运算:data['hour'] = data['clickTime'] % 1000000 // 10000
    
    **新学的trick:时间级的feature构造
    - 使用pd.get_dummies()对时间特征进行one-hot
    - 加入目标feature,进行groupby的sum()操作
    - pickle操作
    """

    feature_path = raw_data_path + 'CountVector_appCategory_actionHour.pkl'
    if os.path.exists(feature_path):
        print('Found ' + feature_path)
    else:
        print('Generating ' + feature_path)
        user_action = pd.read_csv(raw_data_path + 'user_app_actions.csv')
        app_cate = pd.read_csv(raw_data_path + 'app_categories.csv')
        user_action = pd.merge(user_action, app_cate, 'left', 'appID')
        user_action['installHour'] = user_action['installTime'] % 1000000 // 10000
        user_action = pd.get_dummies(user_action[['appCategory', 'installHour']], columns=['installHour'])
        user_action = user_action.groupby('appCategory', as_index=False).sum()
        dump_pickle(user_action, feature_path)
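A quick worked instance of the digit arithmetic above, assuming the DDHHMMSS-style layout that the divisors imply (the timestamp value is hypothetical):

t = 17093025                 # day 17, 09:30:25 (hypothetical)
day = t // 1000000           # -> 17
hour = t % 1000000 // 10000  # -> 9
assert (day, hour) == (17, 9)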
Example no. 9
def gen_user_hist_install_cateA():
    '''
    Record, as of the day before clickDay, the user's total installed apps per top-level category, computed from the actions table.
    Join keys ['userID', 'clickDay']
    :return:
    '''
    feature_path = feature_data_path + 'user_hist_install_cateA.pkl'
    if os.path.exists(feature_path):
        print('Found ' + feature_path)
    else:
        print('Generating ' + feature_path)
        user_action = pd.read_csv(raw_data_path + 'user_app_actions.csv')
        app_cate = pd.read_csv(raw_data_path + 'app_categories.csv')
        app_cate['cate_a'] = app_cate.appCategory.apply(lambda x:x//100 if x>100 else x)
        user_action = user_action.merge(app_cate[['appID', 'cate_a']], on='appID', how='left')
        user_action['installDay'] = user_action['installTime']//1000
        user_action = pd.get_dummies(user_action[['userID', 'cate_a', 'installDay']], prefix='user_hist_install_cateA', columns=['cate_a'])
        stats_columns = ['user_hist_install_cateA_' + str(i) for i in range(0,6)]
        user_hist_install_cateA = None
        for clickday in tqdm(range(17, 32)):
            last_day_acc_install = user_action[user_action.installDay < clickday][['userID'] + stats_columns]
            last_day_acc_install = last_day_acc_install.groupby('userID', as_index=False).sum()
            last_day_acc_install['clickDay'] = clickday
            if user_hist_install_cateA is None:
                user_hist_install_cateA = last_day_acc_install
            else:
                user_hist_install_cateA = pd.concat([user_hist_install_cateA, last_day_acc_install], axis=0)
        dump_pickle(user_hist_install_cateA, feature_path)
def gen_CountVector_ID_user_clicks(ID_name,
                                   last_day=27,
                                   ID_describe_feature_names=[
                                       'age_cut', 'gender', 'education',
                                       'marriageStatus', 'haveBaby'
                                   ],
                                   drop_na=False):
    '''
    Generate count-vector descriptions of ID_name, computed from the train and test tables.
    Join key [ID_name]
    :param ID_name:
    :param last_day:
    :param ID_describe_feature_names:
    :param drop_na:
    :return:
    '''
    train = load_pickle(raw_data_path + 'train.pkl')
    test = load_pickle(raw_data_path + 'test.pkl')
    data = train.append(test)
    data = addTime(data)
    data = data[data.clickDay <= last_day]
    data = addAd(data)
    data = addPosition(data)
    data = addAppCategories(data)
    data = data[['userID', ID_name]]
    user_info = pd.read_csv(raw_data_path + 'user.csv')

    user_info['age_cut'] = pd.cut(user_info['age'],
                                  bins=[-1, 0, 18, 25, 35, 45, 55, np.inf],
                                  labels=False)
    user_info.loc[user_info.education == 7, 'education'] = 6

    user_info['hometown_province'] = user_info['hometown'].apply(
        lambda x: x // 100)
    user_info['residence_province'] = user_info['residence'].apply(
        lambda x: x // 100)

    for feature in tqdm(ID_describe_feature_names):
        feature_path = feature_data_path + 'CountVector_' + ID_name + '_user_clicks_' + feature + '_lastday' + str(
            last_day) + '.pkl'
        if drop_na:
            feature_path += '.no_na'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
            continue
        print('generating ' + feature_path)
        prefix_name = ID_name + '_user_clicks_' + feature
        sub_user_info = pd.get_dummies(user_info[['userID', feature]],
                                       columns=[feature],
                                       prefix=prefix_name)
        if drop_na:
            sub_user_info.drop([prefix_name + '_0'], axis=1, inplace=True)
        data = pd.merge(data, sub_user_info, 'left', 'userID')
        dummy_features = sub_user_info.columns.tolist()
        dummy_features.remove('userID')
        ID_describe_feature = data[[ID_name] + dummy_features].groupby(
            [ID_name], as_index=False).sum()
        data.drop(dummy_features, axis=1, inplace=True)
        dump_pickle(ID_describe_feature, feature_path)
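The get_dummies-then-groupby-sum pattern above is equivalent to a grouped size() followed by unstack(). A minimal standalone sketch (count_vector and df are hypothetical names):

import pandas as pd

def count_vector(df, id_name, feature):
    # rows = id_name, columns = the values of `feature`, cells = click counts
    return (df.groupby([id_name, feature]).size()
              .unstack(fill_value=0)
              .add_prefix(id_name + '_user_clicks_' + feature + '_')
              .reset_index())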
def csv_pkl(csv_name_without_suffix, protocol=None):
    pkl_path = raw_data_path + csv_name_without_suffix + '.pkl'
    if not os.path.exists(pkl_path):
        print('generating ' + pkl_path)
        data = pd.read_csv(raw_data_path + csv_name_without_suffix + '.csv')
        dump_pickle(data, pkl_path, protocol=protocol)
    else:
        print('found ' + pkl_path)
def generate_stats_feature():
    '''
    Concatenate train and test, then add statistical features of the users' click data.
    :return:
    '''
    feature_path = feature_data_path + 'UserClickStats.pkl'
    if os.path.exists(feature_path):
        print('Found', feature_path)
    else:
        train = load_pickle(raw_data_path + 'train.pkl')
        test = load_pickle(raw_data_path + 'test.pkl')
        data = train.append(test)
        del train, test
        gc.collect()
        data = addTime(data)
        data = addAd(data)
        data = addPosition(data)
        data = addAppCategories(data)
        data = add_user_day_click(data)
        data = add_user_day_click_count(
            data, feature_list=['camgaignID', 'adID', 'appID', 'sitesetID'])
        # data = add_user_day_hour_count(data)
        # train_origin_features = train.columns.values.tolist()
        # test_origin_features = test.columns.values.tolist()

        feature_names = [
            'user_adID_click_day_mean',  # some of the statistics are not included here
            'user_adID_click_day_min',
            'user_camgaignID_click_day_min',
            'user_appID_click_day_mean',
            'user_appID_click_day_max',
            'user_appID_click_day_min',
            'user_sitesetID_click_day_mean',
            'user_sitesetID_click_day_max',
            'user_sitesetID_click_day_min',
            'user_click_day_mean',
            'user_click_day_max',
            'user_click_day_min'
        ]

        print('Generating', feature_path)
        columns_day = [
            'user_adID_click_day', 'user_camgaignID_click_day',
            'user_appID_click_day', 'user_sitesetID_click_day',
            'user_click_day'
        ]
        columns_hour = [
            'user_adID_click_hour', 'user_camgaignID_click_hour',
            'user_appID_click_hour', 'user_sitesetID_click_hour'
        ]
        sub_feature = ['userID', 'clickTime']
        # data = pd.concat([train[sub_feature+columns_day+columns_hour],test[sub_feature+columns_day+columns_hour]])
        for col in tqdm(columns_day):
            data = gen_click_stats(data, col)
        # for col in tqdm(columns_day):
        #     data = add
        data = data[feature_names + ['userID']].drop_duplicates(['userID'])
        dump_pickle(data, feature_path)
def gen_global_index():
    train = pd.read_csv(raw_data_path + 'train.csv')
    test = pd.read_csv(raw_data_path + 'test.csv')
    all_data = train.append(test)
    all_data['global_index'] = np.arange(0, all_data.shape[0])
    train = all_data.iloc[0:train.shape[0], :]
    test = all_data.iloc[train.shape[0]:, :]
    dump_pickle(train, raw_data_path + 'train.pkl')
    dump_pickle(test, raw_data_path + 'test.pkl')
def gen_CountVector_appID_user_installed(appID_describe_feature_names=[
    'age_cut', 'gender', 'education', 'marriageStatus', 'haveBaby',
    'hometown_province', 'residence_province'
]):
    '''
    Generate count-vector descriptions of appID, computed from the install table.
    :param appID_describe_feature_names:
    :return:
    '''
    user_install = load_pickle(raw_data_path + 'user_installedapps.pkl')
    user_info = pd.read_csv(raw_data_path + 'user.csv')
    user_info['age_cut'] = pd.cut(user_info['age'],
                                  bins=[-1, 0, 18, 25, 35, 45, 55, 65, np.inf],
                                  labels=False)
    user_info['hometown_province'] = user_info['hometown'].apply(
        lambda x: x // 100)
    user_info['residence_province'] = user_info['residence'].apply(
        lambda x: x // 100)

    for feature in tqdm(appID_describe_feature_names):
        feature_path = feature_data_path + 'CountVector_appID_user_installed_' + feature + '.pkl'
        if os.path.exists(feature_path):
            print('Found ' + feature_path)
        else:
            print('Generating ' + feature_path)
            sub_user_info = pd.get_dummies(user_info[['userID', feature]],
                                           columns=[feature],
                                           prefix='appID_installed_' +
                                           feature)  # one-hot encoding
            user_install = pd.merge(user_install,
                                    sub_user_info,
                                    on='userID',
                                    how='left')
            dummy_features = sub_user_info.columns.tolist()
            dummy_features.remove('userID')
            app_describe_feature = None
            for dummy_feature in tqdm(dummy_features):
                app_feature_installed = user_install[[
                    'appID', dummy_feature
                ]].groupby('appID', as_index=False).sum()
                if app_describe_feature is None:
                    app_describe_feature = app_feature_installed
                else:
                    app_describe_feature = pd.concat([
                        app_describe_feature,
                        app_feature_installed[[dummy_feature]]
                    ],
                                                     axis=1)
                user_install.drop(dummy_feature, inplace=True, axis=1)
            dump_pickle(app_describe_feature, feature_path)
def addAd(data):
    '''
    Join the raw ad features.
    :param data:
    :return:
    '''
    feature_path = raw_data_path + 'ad.pkl'
    ad_feature = ['adID', 'camgaignID', 'creativeID', 'advertiserID', 'appID', 'appPlatform']  # all columns of ad.csv
    if os.path.exists(feature_path):
        ad = load_pickle(feature_path)
    else:
        ad = pd.read_csv(raw_data_path + 'ad.csv')
        dump_pickle(ad, feature_path)
    return pd.merge(data, ad[ad_feature], on='creativeID', how='left')
def addPosition(data):
    '''
    Join the raw position features.
    :param data:
    :return:
    '''
    feature_path = raw_data_path + 'position.pkl'
    position_feature = ['positionID', 'sitesetID', 'positionType']
    if os.path.exists(feature_path):
        position = load_pickle(feature_path)
    else:
        position = pd.read_csv(raw_data_path + 'position.csv')
        dump_pickle(position, feature_path)

    return pd.merge(data, position[position_feature], on='positionID', how='left')
Example no. 18
def gen_app_start_installed():
    '''
    Record the number of installs recorded for each appID before the first day.
    Join key ['appID']
    :return:
    '''
    feature_path = feature_data_path + 'app_start_installed.pkl'
    if os.path.exists(feature_path):
        print('Found ' + feature_path)
    else:
        print('Generating ' + feature_path)
        user_install = load_pickle(raw_data_path + 'user_installedapps.pkl')
        app_start_sum = user_install.groupby('appID').size().reset_index().rename(columns={0:'app_start_install_num'})
        del user_install
        gc.collect()
        dump_pickle(app_start_sum, feature_path)
def gen_user_day_click():
    feature_path = feature_data_path + 'user_day_clicks.pkl'
    if os.path.exists(feature_path):
        print('Found ' + feature_path)
    else:
        print('Generating ' + feature_path)
        train = load_pickle(raw_data_path + 'train.pkl')
        test = load_pickle(raw_data_path + 'test.pkl')
        all_data = train.append(test)
        all_data = addTime(all_data)  # add the time-derived columns
        """
        .size(): like count(), it tallies the rows in each group
        .reset_index().rename(columns={0:'user_click_day'}): merely renames the aggregated column to 'user_click_day'
        """
        user_click_day = pd.DataFrame(
            all_data.groupby([
                'clickDay', 'userID'
            ]).size()).reset_index().rename(columns={0: 'user_click_day'})

        # pickling here plays the same caching role as persisting does in Spark
        dump_pickle(user_click_day, feature_path)
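As a side note, the DataFrame-wrap plus rename above can be written in one step with reset_index(name=...); a small equivalent sketch:

def user_day_click_counts(all_data):
    # same statistic: number of clicks per (clickDay, userID)
    return (all_data.groupby(['clickDay', 'userID'])
                    .size()
                    .reset_index(name='user_click_day'))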
Example no. 21
def gen_hist_cvr_smooth(start_day, end_day, key, alpha=0.25):
    """
    * 新学trick:对传入的时间窗口的每一天的ctr做Laplace Smoothing
    """
    train_data = load_pickle(raw_data_path + 'train.pkl')
    test_data = load_pickle(raw_data_path + 'test.pkl')
    data = train_data.append(test_data)
    del train_data, test_data
    gc.collect()
    data = addTime(data)
    data = addAd(data)
    data = addPosition(data)
    ID_hist_cvr = None
    for day in tqdm(np.arange(start_day, end_day + 1)):
        feature_path = feature_data_path + key + '_histcvr_smooth_day_' + str(
            day) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            dfCvr = data[data.clickDay < day]
            dfCvr = pd.get_dummies(dfCvr, columns=['label'], prefix='label')
            dfCvr = dfCvr.groupby([key], as_index=False).sum()
            """
            - 这里做的是Laplace Smoothing
            - 参见https://blog.csdn.net/bbbeoy/article/details/71249316
            """
            dfCvr[key + '_cvr'] = (dfCvr['label_1'] + alpha) / (
                dfCvr['label_1'] + dfCvr['label_0'] + alpha * 2)
            sub_data = pd.merge(data.loc[data.clickDay == day,
                                         ['clickDay', key]],
                                dfCvr[[key, key + '_cvr']],
                                'left',
                                on=[
                                    key,
                                ])
            sub_data.drop_duplicates(['clickDay', key], inplace=True)
            sub_data.sort_values(['clickDay', key], inplace=True)
            dump_pickle(sub_data[['clickDay', key, key + '_cvr']],
                        feature_path)
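A worked instance of the smoothed rate above, with hypothetical counts: for alpha = 0.25, 3 conversions out of 40 clicks give (3 + 0.25) / (40 + 0.5) ≈ 0.0802, nudging the raw rate 3 / 40 = 0.075 toward the prior mean alpha / (2 * alpha) = 0.5.

alpha = 0.25
label_1, label_0 = 3, 37  # hypothetical conversion / non-conversion counts
cvr = (label_1 + alpha) / (label_1 + label_0 + alpha * 2)
print(round(cvr, 4))      # 0.0802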

def gen_demo_result():
    test = pd.read_csv(raw_data_path + 'test.csv')
    test = test[['instanceID', 'label']]
    test.rename(columns={'label': 'prob'}, inplace=True)
    if not os.path.exists(result_path):
        os.mkdir(result_path)
    test.to_csv(result_path + 'demo_result.csv', index=False)


if __name__ == '__main__':
    gen_global_index()
    train = load_pickle(raw_data_path + 'train.pkl')
    train = train[train.clickTime >= 17000000]  # drop the day-16 data
    dump_pickle(train, raw_data_path + 'train.pkl')

    csv_pkl('ad')
    csv_pkl('position')
    csv_pkl('app_categories')
    csv_pkl('test')
    csv_pkl('user_app_actions')
    csv_pkl('user')
    csv_pkl('user_installedapps', protocol=4)

    gen_demo_result()

    if not os.path.exists(feature_data_path):
        os.mkdir(feature_data_path)
    if not os.path.exists(cache_pkl_path):
        os.mkdir(cache_pkl_path)