import gc
import os

import numpy as np
import pandas as pd
from tqdm import tqdm

# load_pickle, dump_pickle, addTime, addAd, addPosition, addAppCategories,
# addUserInfo, add_trick, add_diff, add_install2click, BayesianSmoothing,
# raw_data_path and feature_data_path are project-local helpers/constants
# assumed to be defined elsewhere in this repository.


def gen_ID_global_sum_count(
        last_day=27,
        stats_features=['positionID', 'creativeID', 'appID', 'adID',
                        'userID']):
    train = load_pickle(raw_data_path + 'train.pkl')
    test = load_pickle(raw_data_path + 'test.pkl')
    data = pd.concat([train, test])  # DataFrame.append was removed in pandas 2.0
    data = addTime(data)
    data = data[data.clickDay <= last_day]
    del train, test
    gc.collect()
    data = addAd(data)
    data = addPosition(data)
    data = addAppCategories(data)

    for feature in tqdm(stats_features):
        feature_path = feature_data_path + 'global_count_' + feature + '_lastday' + str(
            last_day) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
            continue  # skip features that are already cached on disk
        print('generating ' + feature_path)
        feature_count_sum = data.groupby(feature).size().reset_index(
            name=feature + '_sum_count')
        dump_pickle(feature_count_sum, feature_path)
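
# The dumped frame has exactly two columns, [feature, feature + '_sum_count'],
# so joining it back onto any click-level frame is a single left merge. A
# minimal sketch; merge_global_sum_count is our name, not part of the
# original pipeline:
def merge_global_sum_count(df, feature, last_day=27):
    counts = load_pickle(feature_data_path + 'global_count_' + feature +
                         '_lastday' + str(last_day) + '.pkl')
    return pd.merge(df, counts, how='left', on=feature)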
def gen_tricks(start_day, end_day):
    """
    生成trick,first_diff,last_diff,install2click,根据gloabl_index拼接
    """
    train_data = load_pickle(raw_data_path + 'train.pkl')
    test_data = load_pickle(raw_data_path + 'test.pkl')
    actions = load_pickle(raw_data_path + 'user_app_actions.pkl')
    data = pd.concat([train_data, test_data])
    del train_data, test_data
    data = addTime(data)
    data = addAd(data)

    for day in tqdm(np.arange(start_day, end_day + 1)):
        feature_path = feature_data_path + 'tricks_day_' + str(day) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            df = data.loc[data.clickDay == day]
            df = add_trick(df)
            df = add_diff(df)
            df = add_install2click(df, day, actions)
            dump_pickle(
                df[[
                    'global_index', 'trick', 'first_diff', 'last_diff',
                    'install2click'
                ]], feature_path)
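
# Reading the per-day trick dumps back is a concat over the day range
# followed by a left merge on global_index, as the docstring above says.
# A minimal sketch; merge_tricks is our name, not part of the original code:
def merge_tricks(df, start_day, end_day):
    tricks = pd.concat([
        load_pickle(feature_data_path + 'tricks_day_' + str(day) + '.pkl')
        for day in np.arange(start_day, end_day + 1)
    ])
    return pd.merge(df, tricks, how='left', on='global_index')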
def load_data(start_day=23, end_day=26, load_test=False):
    """
    读取基本表拼接后的数据
    test表load_test = True
    """
    if load_test == True:
        trans_train_path = feature_data_path + 'trans_test_' + str(
            start_day) + '_' + str(end_day) + '.pkl'
        raw_train_path = raw_data_path + 'test.pkl'
    else:
        trans_train_path = feature_data_path + 'trans_train_' + str(
            start_day) + '_' + str(end_day) + '.pkl'
        raw_train_path = raw_data_path + 'train.pkl'

    if os.path.exists(trans_train_path):
        print('found ' + trans_train_path)
        train = load_pickle(trans_train_path)
    else:
        print('generating ' + trans_train_path)
        train = load_pickle(raw_train_path)

        train = addTime(train)
        train = train[(train.clickDay >= start_day)
                      & (train.clickDay <= end_day)]
        train = addAd(train)
        train = addPosition(train)
        train = addAppCategories(train)
        train = addUserInfo(train)

        dump_pickle(train, trans_train_path)
    return train
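
# Typical usage, mirroring the defaults above (a sketch, not from the source):
#
#     train = load_data(start_day=23, end_day=26, load_test=False)
#     test = load_data(start_day=23, end_day=26, load_test=True)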
def gen_hist_cvr_smooth(start_day, end_day, key, alpha=0.25):
    train_data = load_pickle(raw_data_path + 'train.pkl')
    test_data = load_pickle(raw_data_path + 'test.pkl')
    data = pd.concat([train_data, test_data])
    del train_data, test_data
    gc.collect()
    data = addTime(data)
    data = addAd(data)
    data = addPosition(data)
    for day in tqdm(np.arange(start_day, end_day + 1)):
        feature_path = (feature_data_path + key + '_histcvr_smooth_day_' +
                        str(day) + '.pkl')
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            # Use only clicks strictly before `day` so the target day's
            # labels never leak into its own feature.
            dfCvr = data[data.clickDay < day]
            dfCvr = pd.get_dummies(dfCvr, columns=['label'], prefix='label')
            dfCvr = dfCvr.groupby([key], as_index=False).sum()
            dfCvr[key + '_cvr'] = (dfCvr['label_1'] + alpha) / (
                dfCvr['label_0'] + dfCvr['label_1'] + alpha * 2)
            sub_data = pd.merge(
                data.loc[data.clickDay == day, ['clickDay', key]],
                dfCvr[[key, key + '_cvr']], how='left', on=[key])
            sub_data.drop_duplicates(['clickDay', key], inplace=True)
            sub_data.sort_values(['clickDay', key], inplace=True)
            dump_pickle(sub_data[['clickDay', key, key + '_cvr']],
                        feature_path)
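
# The smoothing above is plain additive (Laplace-style) smoothing with a
# pseudo-count of alpha on each class, pulling sparse IDs toward a 0.5 prior:
#
#     cvr = (conversions + alpha) / (clicks + 2 * alpha)
#
# e.g. with alpha = 0.25, an ID seen once with no conversion gets
# (0 + 0.25) / (1 + 0.5) ~= 0.167 instead of a hard 0.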
def gen_positionID_cvr_smooth(test_day):
    feature_path = (feature_data_path + 'positionID_cvr_smooth_day_' +
                    str(test_day) + '.pkl')
    if os.path.exists(feature_path):
        print('found ' + feature_path)
    else:
        print('generating ' + feature_path)
        data = load_pickle(raw_data_path + 'train.pkl')
        data = addTime(data)
        positionID_cvr = data[data.clickDay < test_day]
        # I = clicks per positionID, C = conversions per positionID
        # (label is binary, so sum() counts the conversions).
        I = positionID_cvr.groupby('positionID')['label'].size().reset_index()
        I.columns = ['positionID', 'I']
        C = positionID_cvr.groupby('positionID')['label'].sum().reset_index()
        C.columns = ['positionID', 'C']
        positionID_cvr = pd.concat([I, C['C']], axis=1)
        hyper = BayesianSmoothing(1, 1)
        hyper.update(positionID_cvr['I'].values, positionID_cvr['C'].values,
                     10000, 0.00000001)
        alpha, beta = hyper.alpha, hyper.beta
        # Posterior-mean CVR under the fitted Beta(alpha, beta) prior.
        positionID_cvr['positionID_cvr_smooth'] = (positionID_cvr['C'] + alpha) / (
            positionID_cvr['I'] + alpha + beta)
        dump_pickle(positionID_cvr[['positionID', 'positionID_cvr_smooth']],
                    feature_path)
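
# BayesianSmoothing is a project-local class that the source does not show;
# it is typically a fixed-point / EM fit of a Beta(alpha, beta) prior over
# per-position CVRs, and the line above then takes the posterior mean
# (C + alpha) / (I + alpha + beta). A rough method-of-moments stand-in,
# under that assumption (fit_beta_by_moments is our name):
def fit_beta_by_moments(I, C):
    ctr = C / np.maximum(I, 1)            # per-position empirical CVR
    mean, var = ctr.mean(), ctr.var()     # moments of the CVR distribution
    common = mean * (1 - mean) / var - 1  # alpha + beta (assumes var > 0)
    return mean * common, (1 - mean) * common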
def generate_click_trick():
    feature_path = feature_data_path + 'global_tricks.pkl'
    if os.path.exists(feature_path):
        print('found ' + feature_path)
    else:
        train = pd.read_pickle(raw_data_path + 'train.pkl')
        test = pd.read_pickle(raw_data_path + 'test.pkl')
        df = pd.concat([train, test])
        df = df[[
            'global_index',
            'creativeID',
            'userID',
            'label',
            'clickTime',
        ]]
        del train, test
        df = addTime(df)
        gc.collect()
        # Total number of clicks by each user on each creative.
        uct_cnt = df.groupby(['userID', 'creativeID']).size().reset_index()
        uct_cnt.rename(columns={0: 'global_uct_cnt'}, inplace=True)
        df = pd.merge(df, uct_cnt, how='left', on=['userID', 'creativeID'])

        # Flag each user's first click (.copy() avoids mutating a view).
        df_1 = df.sort_values(by=['userID', 'clickTime'], ascending=True)
        first = df_1.drop_duplicates('userID').copy()
        first['global_first'] = 1
        first = first[['userID', 'clickTime', 'global_first']]
        df = pd.merge(df, first, how='left', on=['userID', 'clickTime'])

        # Flag each user's last click.
        df_2 = df.sort_values(by=['userID', 'clickTime'], ascending=False)
        last = df_2.drop_duplicates('userID').copy()
        last['global_last'] = 1
        last = last[['userID', 'clickTime', 'global_last']]
        df = pd.merge(df, last, how='left', on=['userID', 'clickTime'])
        # Keep global_index so the tricks can be joined back onto the
        # main table (the original dump omitted it).
        pd.to_pickle(
            df[[
                'global_index',
                'clickDay',
                'global_uct_cnt',
                'global_first',
                'global_last',
            ]], feature_path)
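
# Column semantics, as computed above: global_uct_cnt is the user's total
# number of clicks on that creative over the full log; global_first and
# global_last are 1 on the rows matching a user's first/last clickTime and
# NaN elsewhere. With global_index kept in the dump, merging back is:
#
#     tricks = pd.read_pickle(feature_data_path + 'global_tricks.pkl')
#     data = pd.merge(data, tricks, how='left', on='global_index')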
def gen_CountVector_ID_user_clicks(ID_name,
                                   last_day=27,
                                   ID_describe_feature_names=[
                                       'age_cut', 'gender', 'education',
                                       'marriageStatus', 'haveBaby',
                                   ],
                                   drop_na=False):
    """
    Generate count vectors describing the users who clicked each ID_name,
    computed from the train and test tables; further post-processing can be
    applied downstream. Join key: [ID_name].
    """
    train = load_pickle(raw_data_path + 'train.pkl')
    test = load_pickle(raw_data_path + 'test.pkl')
    data = pd.concat([train, test])
    data = addTime(data)
    data = data[data.clickDay <= last_day]
    data = addAd(data)
    data = addPosition(data)
    data = addAppCategories(data)
    data = data[['userID', ID_name]]
    user_info = pd.read_csv(raw_data_path + 'user.csv')

    # Bucket age into bands; 0 (unknown) gets its own bucket.
    user_info['age_cut'] = pd.cut(user_info['age'],
                                  bins=[-1, 0, 18, 25, 35, 45, 55, np.inf],
                                  labels=False)
    # Merge the rare education level 7 into level 6.
    user_info.loc[user_info.education == 7, 'education'] = 6

    user_info['hometown_province'] = user_info['hometown'].apply(lambda x: x // 100)
    user_info['residence_province'] = user_info['residence'].apply(lambda x: x // 100)

    for feature in tqdm(ID_describe_feature_names):
        feature_path = (feature_data_path + 'CountVector_' + ID_name +
                        '_user_clicks_' + feature + '_lastday' +
                        str(last_day) + '.pkl')
        if drop_na:
            feature_path += '.no_na'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
            continue
        print('generating ' + feature_path)
        prefix_name = ID_name + '_user_clicks_' + feature
        sub_user_info = pd.get_dummies(user_info[['userID', feature]],
                                       columns=[feature], prefix=prefix_name)
        if drop_na:
            # Level 0 encodes "unknown"; drop its dummy column when requested.
            sub_user_info.drop([prefix_name + '_0'], axis=1, inplace=True)
        data = pd.merge(data, sub_user_info, how='left', on='userID')
        dummy_features = sub_user_info.columns.tolist()
        dummy_features.remove('userID')
        # Sum the per-user dummies over all clicks on each ID_name value.
        ID_describe_feature = data[[ID_name] + dummy_features].groupby(
            [ID_name], as_index=False).sum()
        data.drop(dummy_features, axis=1, inplace=True)
        dump_pickle(ID_describe_feature, feature_path)
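
# Usage sketch: build the demographic click-count vectors for positionID and
# join one back on its single key column (the file name follows the pattern
# built above):
#
#     gen_CountVector_ID_user_clicks('positionID', last_day=27)
#     vec = load_pickle(feature_data_path +
#                       'CountVector_positionID_user_clicks_gender_lastday27.pkl')
#     data = pd.merge(data, vec, how='left', on='positionID')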