def gen_ID_global_sum_count(
        last_day=27,
        stats_features=('positionID', 'creativeID', 'appID', 'adID', 'userID')):
    """Generate and cache a global click-count feature for each ID column.

    For every feature in ``stats_features``, counts the number of click rows
    per feature value over train+test up to ``last_day`` and dumps one pickle
    per feature with the count column named ``<feature>_sum_count``.

    Args:
        last_day: only clicks with ``clickDay <= last_day`` are counted.
        stats_features: ID columns to aggregate over (tuple default avoids
            the mutable-default-argument pitfall).
    """
    train = load_pickle(raw_data_path + 'train.pkl')
    test = load_pickle(raw_data_path + 'test.pkl')
    data = train.append(test)
    data = addTime(data)
    data = data[data.clickDay <= last_day]
    del train, test
    gc.collect()
    data = addAd(data)
    data = addPosition(data)
    data = addAppCategories(data)
    for feature in tqdm(stats_features):
        feature_path = (feature_data_path + 'global_count_' + feature +
                        '_lastday' + str(last_day) + '.pkl')
        if os.path.exists(feature_path):
            # BUG FIX: the original had the `continue` commented out, so it
            # printed 'found' but regenerated and re-dumped the cached file
            # anyway. Skip like every sibling generator in this file does.
            print('found ' + feature_path)
            continue
        print('generating ' + feature_path)
        feature_count_sum = pd.DataFrame(
            data.groupby(feature).size()).reset_index().rename(
                columns={0: feature + '_sum_count'})
        dump_pickle(feature_count_sum, feature_path)
def gen_tricks(start_day, end_day):
    """Generate trick features per click day: trick, first_diff, last_diff,
    and install2click. Each day's result is dumped keyed by global_index so
    it can be joined back later.

    Args:
        start_day: first clickDay to generate (inclusive).
        end_day: last clickDay to generate (inclusive).
    """
    clicks = load_pickle(raw_data_path + 'train.pkl').append(
        load_pickle(raw_data_path + 'test.pkl'))
    actions = load_pickle(raw_data_path + 'user_app_actions.pkl')
    clicks = addTime(clicks)
    clicks = addAd(clicks)
    for day in tqdm(np.arange(start_day, end_day + 1)):
        feature_path = feature_data_path + 'tricks_day_' + str(day) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
            continue
        print('generating ' + feature_path)
        day_clicks = clicks.loc[clicks.clickDay == day]
        day_clicks = add_trick(day_clicks)
        day_clicks = add_diff(day_clicks)
        day_clicks = add_install2click(day_clicks, day, actions)
        keep_cols = ['global_index', 'trick', 'first_diff',
                     'last_diff', 'install2click']
        dump_pickle(day_clicks[keep_cols], feature_path)
def load_data(start_day=23, end_day=26, load_test=False):
    """Load the base click table joined with ad/position/app-category/user info.

    The joined frame is cached under ``feature_data_path``; subsequent calls
    with the same day range reuse the cache.

    Args:
        start_day: first clickDay kept (inclusive).
        end_day: last clickDay kept (inclusive).
        load_test: when True, build/load the test-side cache from test.pkl;
            otherwise the train-side cache from train.pkl.

    Returns:
        DataFrame restricted to [start_day, end_day] with all joins applied.
    """
    day_tag = str(start_day) + '_' + str(end_day)
    if load_test:
        trans_train_path = feature_data_path + 'trans_test_' + day_tag + '.pkl'
        raw_train_path = raw_data_path + 'test.pkl'
    else:
        trans_train_path = feature_data_path + 'trans_train_' + day_tag + '.pkl'
        raw_train_path = raw_data_path + 'train.pkl'
    if os.path.exists(trans_train_path):
        print('found ' + trans_train_path)
        # BUG FIX: was `pickle.load(open(path, 'rb'))`, which leaks the file
        # handle; use the project's load_pickle helper like the rest of the file.
        train = load_pickle(trans_train_path)
    else:
        print('generating ' + trans_train_path)
        train = load_pickle(raw_train_path)
        train = addTime(train)
        train = train[(train.clickDay >= start_day)
                      & (train.clickDay <= end_day)]
        train = addAd(train)
        train = addPosition(train)
        train = addAppCategories(train)
        train = addUserInfo(train)
        dump_pickle(train, trans_train_path)
    return train
def gen_hist_cvr_smooth(start_day, end_day, key, alpha=0.25):
    """Generate per-day historical CVR of ``key`` with additive smoothing.

    For each day in [start_day, end_day], computes the conversion rate of
    every ``key`` value over all clicks strictly before that day, smoothed as
    (positives + alpha) / (total + 2 * alpha), and caches one pickle per day.

    Args:
        start_day: first clickDay to generate (inclusive).
        end_day: last clickDay to generate (inclusive).
        key: column name to aggregate the CVR over.
        alpha: additive (Laplace-style) smoothing strength.
    """
    train_data = load_pickle(raw_data_path + 'train.pkl')
    # FIX: was misnamed `test_date`; also removed the unused `ID_hist_cvr`
    # local and the commented-out dead code from the original.
    test_data = load_pickle(raw_data_path + 'test.pkl')
    data = train_data.append(test_data)
    del train_data, test_data
    gc.collect()
    data = addTime(data)
    data = addAd(data)
    data = addPosition(data)
    for day in tqdm(np.arange(start_day, end_day + 1)):
        feature_path = (feature_data_path + key + '_histcvr_smooth_day_' +
                        str(day) + '.pkl')
        if os.path.exists(feature_path):
            print('found ' + feature_path)
            continue
        print('generating ' + feature_path)
        # History = all clicks strictly before the target day (no leakage).
        dfCvr = data[data.clickDay < day]
        dfCvr = pd.get_dummies(dfCvr, columns=['label'], prefix='label')
        dfCvr = dfCvr.groupby([key], as_index=False).sum()
        dfCvr[key + '_cvr'] = ((dfCvr['label_1'] + alpha) /
                               (dfCvr['label_0'] + dfCvr['label_1'] + alpha * 2))
        sub_data = pd.merge(data.loc[data.clickDay == day, ['clickDay', key]],
                            dfCvr[[key, key + '_cvr']], 'left', on=[key])
        sub_data.drop_duplicates(['clickDay', key], inplace=True)
        sub_data.sort_values(['clickDay', key], inplace=True)
        dump_pickle(sub_data[['clickDay', key, key + '_cvr']], feature_path)
def gen_positionID_cvr_smooth(test_day):
    """Generate a Bayesian-smoothed CVR per positionID.

    Uses all clicks strictly before ``test_day`` to fit smoothing priors via
    BayesianSmoothing, then dumps one pickle with columns
    [positionID, positionID_cvr_smooth].

    Args:
        test_day: day being predicted; only earlier clicks are used.
    """
    feature_path = (feature_data_path + 'positionID_cvr_smooth_day_' +
                    str(test_day) + '.pkl')
    if os.path.exists(feature_path):
        print('found ' + feature_path)
        return
    print('generating ' + feature_path)
    clicks = addTime(load_pickle(raw_data_path + 'train.pkl'))
    history = clicks[clicks.clickDay < test_day]
    # I = impressions (group size), C = conversions (sum of 0/1 labels).
    stats = history.groupby('positionID')['label'].agg(
        ['size', 'sum']).reset_index()
    stats.columns = ['positionID', 'I', 'C']
    hyper = BayesianSmoothing(1, 1)
    hyper.update(stats['I'].values, stats['C'].values, 10000, 0.00000001)
    stats['positionID_cvr_smooth'] = ((stats['C'] + hyper.alpha) /
                                      (stats['I'] + hyper.alpha + hyper.beta))
    dump_pickle(stats[['positionID', 'positionID_cvr_smooth']], feature_path)
def generate_click_trick():
    """Build global click-trick features and cache them to global_tricks.pkl.

    Features: per-(userID, creativeID) click count, plus flags marking each
    user's first and last click time across train+test.
    """
    feature_path = feature_data_path + 'global_tricks.pkl'
    if os.path.exists(feature_path):
        print('found ' + feature_path)
        return
    train = pd.read_pickle(raw_data_path + 'train.pkl')
    test = pd.read_pickle(raw_data_path + 'test.pkl')
    clicks = train.append(test)
    clicks = clicks[['global_index', 'creativeID', 'userID',
                     'label', 'clickTime']]
    del train, test
    clicks = addTime(clicks)
    gc.collect()
    # Total clicks per (user, creative) pair over the whole dataset.
    pair_counts = clicks.groupby(['userID', 'creativeID']).size().reset_index()
    pair_counts.rename(columns={0: 'global_uct_cnt'}, inplace=True)
    clicks = pd.merge(clicks, pair_counts, how='left',
                      on=['userID', 'creativeID'])
    # Flag each user's earliest click (merge back on user + click time).
    ascending = clicks.sort_values(by=['userID', 'clickTime'], ascending=True)
    first = ascending.drop_duplicates('userID')
    first['global_first'] = 1
    first = first[['userID', 'clickTime', 'global_first']]
    clicks = pd.merge(clicks, first, how='left', on=['userID', 'clickTime'])
    # Flag each user's latest click the same way.
    descending = clicks.sort_values(by=['userID', 'clickTime'], ascending=False)
    last = descending.drop_duplicates('userID')
    last['global_last'] = 1
    last = last[['userID', 'clickTime', 'global_last']]
    clicks = pd.merge(clicks, last, how='left', on=['userID', 'clickTime'])
    pd.to_pickle(
        clicks[['clickDay', 'global_uct_cnt', 'global_first', 'global_last']],
        feature_path)
def gen_CountVector_ID_user_clicks(
        ID_name,
        last_day=27,
        ID_describe_feature_names=('age_cut', 'gender', 'education',
                                   'marriageStatus', 'haveBaby'),
        drop_na=False):
    """Generate count vectors describing the users who clicked each ID_name.

    For every user-profile feature, one-hot encodes it, joins the dummies
    onto the click log, and sums them per ``ID_name`` value; each result is
    cached to its own pickle. Join key of the dumped frame: [ID_name].

    Args:
        ID_name: ID column whose click audience is being described.
        last_day: only clicks with ``clickDay <= last_day`` are used.
        ID_describe_feature_names: user-profile columns to vectorize.
            FIX: default is now a tuple instead of a list, avoiding the
            mutable-default-argument pitfall (interface-compatible).
        drop_na: when True, drop the '<prefix>_0' (missing-value) dummy
            column and tag the cache filename with '.no_na'.
    """
    train = load_pickle(raw_data_path + 'train.pkl')
    test = load_pickle(raw_data_path + 'test.pkl')
    data = train.append(test)
    data = addTime(data)
    data = data[data.clickDay <= last_day]
    data = addAd(data)
    data = addPosition(data)
    data = addAppCategories(data)
    data = data[['userID', ID_name]]
    user_info = pd.read_csv(raw_data_path + 'user.csv')
    # Bucket ages into bands; the (-1, 0] bin captures age == 0 (unknown).
    user_info['age_cut'] = pd.cut(user_info['age'],
                                  bins=[-1, 0, 18, 25, 35, 45, 55, np.inf],
                                  labels=False)
    # Fold education level 7 into level 6.
    user_info.loc[user_info.education == 7, 'education'] = 6
    user_info['hometown_province'] = user_info['hometown'].apply(
        lambda x: x // 100)
    user_info['residence_province'] = user_info['residence'].apply(
        lambda x: x // 100)
    for feature in tqdm(ID_describe_feature_names):
        feature_path = (feature_data_path + 'CountVector_' + ID_name +
                        '_user_clicks_' + feature + '_lastday' +
                        str(last_day) + '.pkl')
        if drop_na:
            feature_path += '.no_na'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
            continue
        print('generating ' + feature_path)
        prefix_name = ID_name + '_user_clicks_' + feature
        sub_user_info = pd.get_dummies(user_info[['userID', feature]],
                                       columns=[feature], prefix=prefix_name)
        if drop_na:
            sub_user_info.drop([prefix_name + '_0'], axis=1, inplace=True)
        data = pd.merge(data, sub_user_info, 'left', 'userID')
        dummy_features = sub_user_info.columns.tolist()
        dummy_features.remove('userID')
        ID_describe_feature = data[[ID_name] + dummy_features].groupby(
            [ID_name], as_index=False).sum()
        # Drop the dummies so the next iteration starts from the base frame.
        data.drop(dummy_features, axis=1, inplace=True)
        dump_pickle(ID_describe_feature, feature_path)