def gen_tricks(start_day, end_day):
    '''
    Generate the trick, first_diff, last_diff and install2click features,
    one pickle per day; downstream code joins them back via global_index.

    :param start_day: first clickDay (inclusive) to generate features for
    :param end_day: last clickDay (inclusive)
    :return: None -- one pickle per day is written to feature_data_path
    '''
    train_data = load_pickle(raw_data_path + 'train.pkl')
    test_data = load_pickle(raw_data_path + 'test.pkl')
    actions = load_pickle(raw_data_path + 'user_app_actions.pkl')
    data = train_data.append(test_data)
    del train_data, test_data  # free memory before the per-day loop
    data = addTime(data)
    data = addAd(data)
    for day in tqdm(np.arange(start_day, end_day + 1)):
        feature_path = feature_data_path + 'tricks_day_' + str(day) + '.pkl'
        if os.path.exists(feature_path):
            # Cached feature file: skip regeneration.
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            df = data.loc[data.clickDay == day]
            # Each helper appends its feature column(s) to the day's frame.
            df = add_trick(df)
            df = add_diff(df)
            df = add_install2click(df, day, actions)
            dump_pickle(
                df[[
                    'global_index', 'trick', 'first_diff', 'last_diff',
                    'install2click'
                ]], feature_path)
def gen_user_hour_click_count(update=False):
    '''
    Count clicks per (userID, clickDay, clickHour, feature) over all data
    (train + test), one cached pickle per ID feature.
    Join keys: ['ID_name', 'clickDay', 'clickHour']

    :param update: unused flag, kept for interface compatibility
    :return: None -- one pickle per feature is written to feature_data_path
    '''
    train = load_pickle(raw_data_path + 'train.pkl')
    test = load_pickle(raw_data_path + 'test.pkl')
    # Enrich the concatenated log with time, ad, position and category columns.
    data = train.append(test)
    data = addTime(data)
    data = addAd(data)
    data = addPosition(data)
    data = addAppCategories(data)
    ad_cols = [
        'advertiserID', 'camgaignID', 'adID', 'creativeID', 'appID',
        'appCategory'
    ]
    context_cols = ['positionID', 'sitesetID']
    for col in tqdm(ad_cols + context_cols):
        feature_path = feature_data_path + 'user_' + col + '_click_hour.pkl'
        if os.path.exists(feature_path):
            print('Found' + feature_path)
            continue
        print('Generation' + feature_path)
        hourly_counts = data.groupby(
            ['userID', 'clickDay', 'clickHour', col]).size().reset_index(
            ).rename(columns={0: 'user_' + col + '_click_hour'})
        dump_pickle(hourly_counts, feature_path)
def gen_positionID_cvr_smooth(test_day):
    """
    * Trick: apply Bayesian smoothing to the per-positionID CVR computed
      over the window of days strictly before ``test_day``.

    :param test_day: day whose history (clickDay < test_day) is used
    :return: None -- result pickled to feature_data_path
    """
    feature_path = feature_data_path + 'positionID_cvr_smooth_day_' + str(
        test_day) + '.pkl'
    if os.path.exists(feature_path):
        print('found ' + feature_path)
    else:
        print('generating ' + feature_path)
        data = load_pickle(raw_data_path + 'train.pkl')
        data = addTime(data)
        positionID_cvr = data[data.clickDay < test_day]
        I = positionID_cvr.groupby('positionID')['label'].size().reset_index(
        )  # size() behaves like count(): the number of impressions
        I.columns = ['positionID', 'I']
        C = positionID_cvr.groupby('positionID')['label'].sum().reset_index(
        )  # labels are 0/1 clicks, so sum() yields the click count
        C.columns = ['positionID', 'C']
        positionID_cvr = pd.concat([I, C['C']], axis=1)
        hyper = BayesianSmoothing(1, 1)
        hyper.update(positionID_cvr['I'].values, positionID_cvr['C'].values,
                     10000,
                     0.00000001)  # impressions, clicks, iterations, tolerance
        alpha = hyper.alpha
        beta = hyper.beta
        # Smooth the CVR with the fitted alpha and beta priors.
        positionID_cvr['positionID_cvr_smooth'] = (
            positionID_cvr['C'] + alpha) / (positionID_cvr['I'] + alpha + beta)
        dump_pickle(positionID_cvr[['positionID', 'positionID_cvr_smooth']],
                    feature_path)
def gen_ID_global_sum_count(
        last_day=27,
        stats_features=('positionID', 'creativeID', 'appID', 'adID',
                        'userID')):
    '''
    Global impression counts per ID feature over train + test, restricted
    to clicks on or before ``last_day``.

    :param last_day: last clickDay (inclusive) to include
    :param stats_features: iterable of ID columns to count (tuple default
        avoids the mutable-default-argument pitfall; it is only iterated)
    :return: None -- one pickle per feature is written to feature_data_path
    '''
    train = load_pickle(raw_data_path + 'train.pkl')
    test = load_pickle(raw_data_path + 'test.pkl')
    data = train.append(test)
    data = addTime(data)
    data = data[data.clickDay <= last_day]
    del train, test
    gc.collect()
    data = addAd(data)
    data = addPosition(data)
    data = addAppCategories(data)
    for feature in tqdm(stats_features):
        feature_path = feature_data_path + 'global_count_' + feature + \
            '_lastday' + str(last_day) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
            # Fix: skip cached features. The original had this `continue`
            # commented out, so every cached file was regenerated anyway.
            continue
        print('generating ' + feature_path)
        # Data comes from train.pkl/test.pkl (impression + click log);
        # the size() aggregation therefore counts impressions.
        feature_count_sum = pd.DataFrame(
            data.groupby(feature).size()).reset_index().rename(
                columns={0: feature + '_sum_count'})
        dump_pickle(feature_count_sum, feature_path)
def gen_hist_cvr_smooth(start_day, end_day, key, alpha=0.25):
    '''
    Laplace-smoothed historical CVR of ``key`` for each day in
    [start_day, end_day], computed from all clicks strictly before that day.

    :param start_day: first clickDay (inclusive)
    :param end_day: last clickDay (inclusive)
    :param key: ID column the CVR is computed for
    :param alpha: Laplace smoothing pseudo-count
    :return: None -- one pickle per day is written to feature_data_path
    '''
    train_data = load_pickle(raw_data_path + 'train.pkl')
    test_data = load_pickle(raw_data_path + 'test.pkl')
    data = train_data.append(test_data)
    del train_data, test_data
    gc.collect()
    data = addTime(data)
    data = addAd(data)
    data = addPosition(data)
    for day in tqdm(np.arange(start_day, end_day + 1)):
        feature_path = feature_data_path + key + '_histcvr_smooth_day_' + str(
            day) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            dfCvr = data[data.clickDay < day]
            dfCvr = pd.get_dummies(dfCvr, columns=['label'], prefix='label')
            dfCvr = dfCvr.groupby([key], as_index=False).sum()
            # Fix: the denominator must be the total event count
            # label_1 + label_0. The original summed label_0 twice, which
            # skews the CVR; the later duplicate definition of this function
            # in the file already uses the corrected formula.
            dfCvr[key + '_cvr'] = (dfCvr['label_1'] + alpha) / (
                dfCvr['label_1'] + dfCvr['label_0'] + alpha * 2)
            sub_data = pd.merge(
                data.loc[data.clickDay == day, ['clickDay', key]],
                dfCvr[[key, key + '_cvr']], 'left', on=[key])
            sub_data.drop_duplicates(['clickDay', key], inplace=True)
            sub_data.sort_values(['clickDay', key], inplace=True)
            dump_pickle(sub_data[['clickDay', key, key + '_cvr']],
                        feature_path)
def gen_user_start_installed_cateA():
    '''
    Per user, aggregate the initially installed apps in each top-level app
    category. Join key: ['userID']
    :return: None -- one pickle per category is written to feature_data_path
    '''
    installs = load_pickle(raw_data_path + 'user_installedapps.pkl')
    categories = pd.read_csv(raw_data_path + 'app_categories.csv')
    # Collapse sub-categories (>100) to their top-level category code.
    categories['cate_a'] = categories.appCategory.apply(
        lambda c: c // 100 if c > 100 else c)
    installs = installs.merge(categories, on='appID', how='left')
    for cate in tqdm(categories.cate_a.unique()):
        feature_path = feature_data_path + 'user_start_installed_cate_' + str(
            cate) + '.pkl'
        if os.path.exists(feature_path):
            print('Found ' + feature_path)
            continue
        print('Generating ' + feature_path)
        column_name = 'user_start_install_cate_' + str(cate)
        per_user = installs[installs.cate_a == cate][['userID', 'cate_a']]
        per_user.rename(columns={'cate_a': column_name}, inplace=True)
        per_user = per_user.groupby('userID', as_index=False).sum()
        dump_pickle(per_user, feature_path)
def gen_user_group_install():
    '''
    Build a per-user-group app install count matrix.

    Users are grouped by (age bucket, gender, education); each output row
    is one group, with one column per appID holding the group's install
    count plus the group's describing attributes.
    :return: None -- result pickled to feature_data_path
    '''
    user_install = load_pickle(raw_data_path + 'user_installedapps.pkl')
    user_info = load_pickle(raw_data_path + 'user.pkl')
    # Bucket age into bands; labels=False yields integer bucket codes.
    user_info['age_cut_small'] = pd.cut(
        user_info['age'], bins=[-1, 0, 18, 25, 35, 45, 55, np.inf],
        labels=False)
    user_info['education_new'] = user_info['education']
    # Merge education level 7 into level 6.
    user_info.loc[user_info.education_new == 7, 'education_new'] = 6
    # Enumerate every distinct (age, gender, education) combination as a group.
    user_info_comb = user_info[
        ['age_cut_small', 'gender', 'education_new', ]].drop_duplicates()
    user_info_comb['user_group'] = np.arange(0, user_info_comb.shape[0])
    user_info = pd.merge(user_info, user_info_comb, 'left',
                         ['age_cut_small', 'gender', 'education_new', ])
    user_install = pd.merge(
        user_install,
        user_info[['userID', 'user_group', 'age_cut_small', 'gender',
                   'education_new', ]], 'left', 'userID')

    def update_dict(row, dic):
        # Side-effect accumulator: bump the install count for this row's app.
        dic[row['appID']] += 1

    user_group_install = None
    for i, u_g in tqdm(enumerate(user_install.user_group.unique())):
        sub_install = user_install[user_install.user_group == u_g]
        # One counter per known appID, initialised to zero.
        install_dict = dict((k, 0) for k in user_install.appID.unique())
        install_dict['user_group'] = u_g
        install_dict['age_cut_small'] = sub_install['age_cut_small'].iloc[0]
        install_dict['gender'] = sub_install['gender'].iloc[0]
        install_dict['education_new'] = sub_install['education_new'].iloc[0]
        # apply() is used purely for its side effect on install_dict.
        sub_install.apply(update_dict, args=(install_dict,), axis=1, )
        if user_group_install is None:
            user_group_install = pd.DataFrame(install_dict, index=[i, ])
        else:
            user_group_install = pd.concat(
                [user_group_install,
                 pd.DataFrame(install_dict, index=[i, ])])
    dump_pickle(user_group_install,
                feature_data_path + 'user_group_install.pkl')
def gen_CountVector_appCategory_user_action_hour():
    '''
    Per appCategory, count app installs in each hour of the day.
    Join key: ['appcategory']
    :return: None -- result pickled to raw_data_path
    '''
    """
    ** Trick: efficient timestamp handling
       - timestamps are kept at a fixed digit width (stated as 10 in the
         original note -- TODO confirm against the raw data)
       - day level:  data['day'] = data['clickTime'] // 1000000
       - hour level: data['hour'] = data['clickTime'] % 1000000 // 10000
    ** Trick: building hour-of-day features
       - one-hot the hour column with pd.get_dummies()
       - group by the target feature and sum()
       - cache the result with pickle
    """
    feature_path = raw_data_path + 'CountVector_appCategory_actionHour.pkl'
    if os.path.exists(feature_path):
        print('Found ' + feature_path)
    else:
        print('Generating ' + feature_path)
        user_action = pd.read_csv(raw_data_path + 'user_app_actions.csv')
        app_cate = pd.read_csv(raw_data_path + 'app_categories.csv')
        user_action = pd.merge(user_action, app_cate, 'left', 'appID')
        # Extract the hour (HH) from the DDHHMMSS-style timestamp.
        user_action['installHour'] = user_action[
            'installTime'] % 1000000 // 10000
        user_action = pd.get_dummies(
            user_action[['appCategory', 'installHour']],
            columns=['installHour'])
        user_action = user_action.groupby('appCategory', as_index=False).sum()
        dump_pickle(user_action, feature_path)
def gen_user_hist_install_cateA():
    '''
    For each user and each clickDay (17..31), accumulate the number of apps
    installed per top-level category strictly before that day, from the
    actions table. Join keys: ['userID', 'clickDay']
    :return: None -- result pickled to feature_data_path
    '''
    feature_path = feature_data_path + 'user_hist_install_cateA'
    # NOTE(review): every other feature file carries a '.pkl' suffix; this
    # one does not. Kept as-is so existing loaders keep working.
    if os.path.exists(feature_path):
        print('Found ' + feature_path)
    else:
        print('Generating ' + feature_path)
        user_action = pd.read_csv(raw_data_path + 'user_app_actions.csv')
        app_cate = pd.read_csv(raw_data_path + 'app_categories.csv')
        # Collapse sub-categories (>100) to their top-level category code.
        app_cate['cate_a'] = app_cate.appCategory.apply(
            lambda x: x // 100 if x > 100 else x)
        user_action = user_action.merge(app_cate[['appID', 'cate_a']],
                                        on='appID', how='left')
        # Fix: installTime is formatted DDHHMMSS (this file's hour
        # extraction uses installTime % 1000000 // 10000), so the day part
        # is installTime // 1000000. The previous // 1000 produced 5-digit
        # values that could never be < clickday (17..31), leaving the
        # accumulated counts permanently empty.
        user_action['installDay'] = user_action['installTime'] // 1000000
        user_action = pd.get_dummies(
            user_action[['userID', 'cate_a', 'installDay']],
            prefix='user_hist_install_cateA', columns=['cate_a'])
        stats_columns = ['user_hist_install_cateA_' + str(i)
                         for i in range(0, 6)]
        user_hist_install_cateA = None
        for clickday in tqdm(range(17, 32)):
            # Installs strictly before this click day, summed per user.
            last_day_acc_install = user_action[
                user_action.installDay < clickday][['userID'] + stats_columns]
            last_day_acc_install = last_day_acc_install.groupby(
                'userID', as_index=False).sum()
            last_day_acc_install['clickDay'] = clickday
            if user_hist_install_cateA is None:
                user_hist_install_cateA = last_day_acc_install
            else:
                user_hist_install_cateA = pd.concat(
                    [user_hist_install_cateA, last_day_acc_install], axis=0)
        dump_pickle(user_hist_install_cateA, feature_path)
def gen_CountVector_ID_user_clicks(ID_name,
                                   last_day=27,
                                   ID_describe_feature_names=[
                                       'age_cut', 'gender', 'education',
                                       'marriageStatus', 'haveBaby'
                                   ],
                                   drop_na=False):
    '''
    For a given ID column, build count vectors describing the users who
    clicked it, computed from the train and test tables.
    Join key: [ID_name]

    :param ID_name: ID column the counts are aggregated over
    :param last_day: last clickDay (inclusive) to include
    :param ID_describe_feature_names: user attributes to build vectors for
    :param drop_na: if True, drop the '_0' (missing-value) dummy column
    :return: None -- one pickle per attribute is written to feature_data_path
    '''
    train = load_pickle(raw_data_path + 'train.pkl')
    test = load_pickle(raw_data_path + 'test.pkl')
    data = train.append(test)
    data = addTime(data)
    data = data[data.clickDay <= last_day]
    data = addAd(data)
    data = addPosition(data)
    data = addAppCategories(data)
    data = data[['userID', ID_name]]
    user_info = pd.read_csv(raw_data_path + 'user.csv')
    # Bucket age; labels=False yields integer bucket codes.
    user_info['age_cut'] = pd.cut(user_info['age'],
                                  bins=[-1, 0, 18, 25, 35, 45, 55, np.inf],
                                  labels=False)
    # Merge education level 7 into level 6.
    user_info.loc[user_info.education == 7, 'education'] = 6
    # Province is the leading digits of the hometown/residence code.
    user_info['hometown_province'] = user_info['hometown'].apply(
        lambda x: x // 100)
    user_info['residence_province'] = user_info['residence'].apply(
        lambda x: x // 100)
    for feature in tqdm(ID_describe_feature_names):
        feature_path = feature_data_path + 'CountVector_' + ID_name + '_user_clicks_' + feature + '_lastday' + str(
            last_day) + '.pkl'
        if drop_na:
            feature_path += '.no_na'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
            continue
        print('generating ' + feature_path)
        prefix_name = ID_name + '_user_clicks_' + feature
        sub_user_info = pd.get_dummies(user_info[['userID', feature]],
                                       columns=[feature],
                                       prefix=prefix_name)
        if drop_na:
            # '_0' encodes the missing/unknown attribute value.
            sub_user_info.drop([prefix_name + '_0'], axis=1, inplace=True)
        data = pd.merge(data, sub_user_info, 'left', 'userID')
        dummy_features = sub_user_info.columns.tolist()
        dummy_features.remove('userID')
        ID_describe_feature = data[[ID_name] + dummy_features].groupby(
            [ID_name], as_index=False).sum()
        # Drop the dummy columns again so the next iteration starts clean.
        data.drop(dummy_features, axis=1, inplace=True)
        dump_pickle(ID_describe_feature, feature_path)
def csv_pkl(csv_name_without_suffix, protocol=None):
    '''
    Cache a raw CSV file as a pickle next to it (same basename, .pkl suffix).

    :param csv_name_without_suffix: file name without the '.csv' extension
    :param protocol: pickle protocol forwarded to dump_pickle
    :return: None
    '''
    pkl_path = raw_data_path + csv_name_without_suffix + '.pkl'
    if os.path.exists(pkl_path):
        print('found' + pkl_path)
        return
    print('generating' + pkl_path)
    frame = pd.read_csv(raw_data_path + csv_name_without_suffix + '.csv')
    dump_pickle(frame, pkl_path, protocol=protocol)
def generate_stats_feature():
    '''
    Concatenate train and test, add per-user click-count statistics
    (mean/max/min over days) and cache one row per userID.
    :return: None -- result pickled to feature_data_path
    '''
    feature_path = feature_data_path + 'UserClickStats.pkl'
    if os.path.exists(feature_path):
        print('Found', feature_path)
    else:
        train = load_pickle(raw_data_path + 'train.pkl')
        test = load_pickle(raw_data_path + 'test.pkl')
        data = train.append(test)
        del train, test
        gc.collect()
        data = addTime(data)
        data = addAd(data)
        data = addPosition(data)
        data = addAppCategories(data)
        data = add_user_day_click(data)
        data = add_user_day_click_count(
            data, feature_list=['camgaignID', 'adID', 'appID', 'sitesetID'])
        # data = add_user_day_hour_count(data)
        # NOTE: some statistics are intentionally left out of this list.
        feature_names = [
            'user_adID_click_day_mean',
            'user_adID_click_day_min', 'user_camgaignID_click_day_min',
            'user_appID_click_day_mean', 'user_appID_click_day_max',
            'user_appID_click_day_min', 'user_sitesetID_click_day_mean',
            'user_sitesetID_click_day_max', 'user_sitesetID_click_day_min',
            'user_click_day_mean', 'user_click_day_max', 'user_click_day_min'
        ]
        print('Generating', feature_path)
        columns_day = [
            'user_adID_click_day', 'user_camgaignID_click_day',
            'user_appID_click_day', 'user_sitesetID_click_day',
            'user_click_day'
        ]
        # columns_hour and sub_feature are currently unused below; kept for
        # reference alongside the commented-out hour-level code path.
        columns_hour = [
            'user_adID_click_hour', 'user_camgaignID_click_hour',
            'user_appID_click_hour', 'user_sitesetID_click_hour'
        ]
        sub_feature = ['userID', 'clickTime']
        # data = pd.concat([train[sub_feature+columns_day+columns_hour],test[sub_feature+columns_day+columns_hour]])
        for col in tqdm(columns_day):
            # Adds the mean/max/min statistic columns for this count column.
            data = gen_click_stats(data, col)
        # Keep one row per user with just the statistic columns.
        data = data[feature_names + ['userID']].drop_duplicates(['userID'])
        dump_pickle(data, feature_path)
def gen_global_index():
    '''
    Assign a running 'global_index' across the concatenated train + test
    rows and persist both splits as pickles.
    :return: None
    '''
    train = pd.read_csv(raw_data_path + 'train.csv')
    test = pd.read_csv(raw_data_path + 'test.csv')
    combined = train.append(test)
    combined['global_index'] = np.arange(0, combined.shape[0])
    # Split back at the original train/test boundary.
    n_train = train.shape[0]
    train = combined.iloc[:n_train, :]
    test = combined.iloc[n_train:, :]
    dump_pickle(train, raw_data_path + 'train.pkl')
    dump_pickle(test, raw_data_path + 'test.pkl')
def gen_CountVector_appID_user_installed(appID_describe_feature_names=[
        'age_cut', 'gender', 'education', 'marriageStatus', 'haveBaby',
        'hometown_province', 'residence_province'
]):
    '''
    Build count-vector descriptions of each appID from the install table:
    for every user attribute, the number of installing users per
    attribute value.

    :param appID_describe_feature_names: user attributes to describe appIDs by
    :return: None -- one pickle per attribute is written to feature_data_path
    '''
    user_install = load_pickle(raw_data_path + 'user_installedapps.pkl')
    user_info = pd.read_csv(raw_data_path + 'user.csv')
    # Bucket age; labels=False yields integer bucket codes.
    user_info['age_cut'] = pd.cut(user_info['age'],
                                  bins=[-1, 0, 18, 25, 35, 45, 55, 65,
                                        np.inf],
                                  labels=False)
    # Province is the leading digits of the hometown/residence code.
    user_info['hometown_province'] = user_info['hometown'].apply(
        lambda x: x // 100)
    user_info['residence_province'] = user_info['residence'].apply(
        lambda x: x // 100)
    for feature in tqdm(appID_describe_feature_names):
        feature_path = feature_data_path + 'CountVector_appID_user_installed_' + feature + '.pkl'
        if os.path.exists(feature_path):
            print('Found ' + feature_path)
        else:
            print('Generating ' + feature_path)
            # One-hot encode the attribute.
            sub_user_info = pd.get_dummies(user_info[['userID', feature]],
                                           columns=[feature],
                                           prefix='appID_installed_' +
                                           feature)
            user_install = pd.merge(user_install, sub_user_info,
                                    on='userID', how='left')
            dummy_features = sub_user_info.columns.tolist()
            dummy_features.remove('userID')
            app_describe_feature = None
            for dummy_feature in tqdm(dummy_features):
                app_feature_installed = user_install[[
                    'appID', dummy_feature
                ]].groupby('appID', as_index=False).sum()
                if app_describe_feature is None:
                    app_describe_feature = app_feature_installed
                else:
                    # Index-aligned concat: each groupby('appID') over the
                    # same table yields rows in the same appID order.
                    app_describe_feature = pd.concat([
                        app_describe_feature,
                        app_feature_installed[[dummy_feature]]
                    ], axis=1)
                # Drop the processed dummy column to keep user_install small.
                user_install.drop(dummy_feature, inplace=True, axis=1)
            dump_pickle(app_describe_feature, feature_path)
def addAd(data):
    '''
    Join the raw ad features onto ``data`` (left join on creativeID),
    caching ad.csv as a pickle on first use.

    :param data: DataFrame with a 'creativeID' column
    :return: DataFrame with the ad columns appended
    '''
    feature_path = raw_data_path + 'ad.pkl'
    # All columns of ad.csv.
    ad_cols = ['adID', 'camgaignID', 'creativeID', 'advertiserID', 'appID',
               'appPlatform']
    if not os.path.exists(feature_path):
        ad = pd.read_csv(raw_data_path + 'ad.csv')
        dump_pickle(ad, feature_path)
    else:
        ad = load_pickle(feature_path)
    return pd.merge(data, ad[ad_cols], on='creativeID', how='left')
def addPosition(data):
    '''
    Join the raw position features onto ``data`` (left join on positionID),
    caching position.csv as a pickle on first use.

    :param data: DataFrame with a 'positionID' column
    :return: DataFrame with the position columns appended
    '''
    feature_path = raw_data_path + 'position.pkl'
    pos_cols = ['positionID', 'sitesetID', 'positionType']
    if not os.path.exists(feature_path):
        position = pd.read_csv(raw_data_path + 'position.csv')
        dump_pickle(position, feature_path)
    else:
        position = load_pickle(feature_path)
    return pd.merge(data, position[pos_cols], on='positionID', how='left')
def gen_user_day_click():
    '''
    Count clicks per (clickDay, userID) over train + test and cache them.
    NOTE: this function is re-defined later in the file; the later
    definition is the one in effect at import time.
    :return: None
    '''
    feature_path = feature_data_path + 'user_day_clicks.pkl'
    if os.path.exists(feature_path):
        print('Found' + feature_path)
        return
    print('Generating' + feature_path)
    all_data = load_pickle(raw_data_path + 'train.pkl').append(
        load_pickle(raw_data_path + 'test.pkl'))
    all_data = addTime(all_data)
    counts = all_data.groupby(['clickDay', 'userID']).size()
    counts = pd.DataFrame(counts).reset_index().rename(
        columns={0: 'user_click_day'})
    dump_pickle(counts, feature_path)
def gen_app_start_installed():
    '''
    Count, per appID, the installs recorded before day one.
    Join key: ['appID']
    :return: None -- result pickled to feature_data_path
    '''
    feature_path = feature_data_path + 'app_start_installed.pkl'
    if os.path.exists(feature_path):
        print('Found:' + feature_path)
        return
    print('Generating ' + feature_path)
    installs = load_pickle(raw_data_path + 'user_installedapps.pkl')
    counts = installs.groupby('appID').size().reset_index().rename(
        columns={0: 'app_start_install_num'})
    del installs
    gc.collect()
    dump_pickle(counts, feature_path)
def gen_CountVector_appCategory_user_action_hour():
    '''
    Per appCategory, count app installs in each hour of the day.
    Join key: ['appcategory']
    NOTE: this re-defines the function of the same name earlier in the file;
    this definition is the one in effect at import time.
    :return: None -- result pickled to raw_data_path
    '''
    feature_path = raw_data_path + 'CountVector_appCategory_actionHour.pkl'
    if os.path.exists(feature_path):
        print('Found ' + feature_path)
        return
    print('Generating ' + feature_path)
    actions = pd.read_csv(raw_data_path + 'user_app_actions.csv')
    categories = pd.read_csv(raw_data_path + 'app_categories.csv')
    actions = pd.merge(actions, categories, 'left', 'appID')
    # Extract the hour (HH) from the DDHHMMSS-style timestamp.
    actions['installHour'] = actions['installTime'] % 1000000 // 10000
    hourly = pd.get_dummies(actions[['appCategory', 'installHour']],
                            columns=['installHour'])
    hourly = hourly.groupby('appCategory', as_index=False).sum()
    dump_pickle(hourly, feature_path)
def gen_user_day_click():
    '''
    Count clicks per (clickDay, userID) over train + test and cache them.
    NOTE: this re-defines gen_user_day_click from earlier in the file;
    this definition is the one in effect at import time.
    :return: None
    '''
    feature_path = feature_data_path + 'user_day_clicks.pkl'
    if os.path.exists(feature_path):
        print('Found' + feature_path)
    else:
        print('Generating' + feature_path)
        train = load_pickle(raw_data_path + 'train.pkl')
        test = load_pickle(raw_data_path + 'test.pkl')
        all_data = train.append(test)
        all_data = addTime(all_data)  # add time-derived columns
        """
        .size(): like count(), tallies rows per group
        .reset_index().rename(columns={0:'user_click_day'}):
            names the aggregated count column 'user_click_day'
        """
        user_click_day = pd.DataFrame(
            all_data.groupby([
                'clickDay', 'userID'
            ]).size()).reset_index().rename(columns={0: 'user_click_day'})
        # Pickling plays the role of persisting/caching the result
        # (similar in spirit to Spark persistence).
        dump_pickle(user_click_day, feature_path)
def gen_hist_cvr_smooth(start_day, end_day, key, alpha=0.25):
    """
    * Trick: Laplace-smooth the historical CTR of ``key`` for every day
      in the [start_day, end_day] window, using clicks strictly before
      each day.
    NOTE: this re-defines gen_hist_cvr_smooth from earlier in the file;
    this definition is the one in effect at import time.
    """
    train_data = load_pickle(raw_data_path + 'train.pkl')
    test_data = load_pickle(raw_data_path + 'test.pkl')
    data = train_data.append(test_data)
    del train_data, test_data
    gc.collect()
    data = addTime(data)
    data = addAd(data)
    data = addPosition(data)
    ID_hist_cvr = None  # unused accumulator, kept as-is
    for day in tqdm(np.arange(start_day, end_day + 1)):
        feature_path = feature_data_path + key + '_histcvr_smooth_day_' + str(
            day) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            dfCvr = data[data.clickDay < day]
            dfCvr = pd.get_dummies(dfCvr, columns=['label'], prefix='label')
            dfCvr = dfCvr.groupby([key], as_index=False).sum()
            """
            Laplace smoothing:
                cvr = (clicks + alpha) / (total + 2 * alpha)
            see https://blog.csdn.net/bbbeoy/article/details/71249316
            """
            dfCvr[key + '_cvr'] = (dfCvr['label_1'] + alpha) / (
                dfCvr['label_1'] + dfCvr['label_0'] + alpha * 2)
            sub_data = pd.merge(data.loc[data.clickDay == day,
                                         ['clickDay', key]],
                                dfCvr[[key, key + '_cvr']],
                                'left',
                                on=[
                                    key,
                                ])
            sub_data.drop_duplicates(['clickDay', key], inplace=True)
            sub_data.sort_values(['clickDay', key], inplace=True)
            dump_pickle(sub_data[['clickDay', key, key + '_cvr']],
                        feature_path)
def gen_positionID_cvr_smooth(test_day):
    '''
    Bayesian-smoothed CVR per positionID over clicks strictly before
    ``test_day``, cached as a pickle.
    NOTE: this re-defines the function of the same name earlier in the file;
    this definition is the one in effect at import time.
    '''
    feature_path = feature_data_path + 'positionID_cvr_smooth_day_' + str(
        test_day) + '.pkl'
    if os.path.exists(feature_path):
        print('found ' + feature_path)
        return
    print('generating ' + feature_path)
    history = addTime(load_pickle(raw_data_path + 'train.pkl'))
    history = history[history.clickDay < test_day]
    grouped = history.groupby('positionID')['label']
    # Impressions per position (size) and clicks per position (sum of 0/1).
    shows = grouped.size().reset_index()
    shows.columns = ['positionID', 'I']
    clicks = grouped.sum().reset_index()
    clicks.columns = ['positionID', 'C']
    stats = pd.concat([shows, clicks['C']], axis=1)
    hyper = BayesianSmoothing(1, 1)
    hyper.update(stats['I'].values, stats['C'].values, 10000, 0.00000001)
    stats['positionID_cvr_smooth'] = (stats['C'] + hyper.alpha) / (
        stats['I'] + hyper.alpha + hyper.beta)
    dump_pickle(stats[['positionID', 'positionID_cvr_smooth']], feature_path)
def gen_demo_result():
    '''
    Write a demo submission file: test instanceIDs with the raw 'label'
    column renamed to 'prob'.
    :return: None
    '''
    test = pd.read_csv(raw_data_path + 'test.csv')
    test = test[['instanceID', 'label']]
    test.rename(columns={'label': 'prob'}, inplace=True)
    if not os.path.exists(result_path):
        os.mkdir(result_path)
    test.to_csv(result_path + 'demo_result.csv', index=False)


if __name__ == '__main__':
    # Build the global index first: it rewrites train.pkl/test.pkl that the
    # steps below depend on.
    gen_global_index()
    train = load_pickle(raw_data_path + 'train.pkl')
    train = train[train.clickTime >= 17000000]  # drop day-16 data
    dump_pickle(train, raw_data_path + 'train.pkl')
    # Cache every raw CSV as a pickle for faster subsequent loads.
    csv_pkl('ad')
    csv_pkl('position')
    csv_pkl('app_categories')
    csv_pkl('test')
    csv_pkl('user_app_actions')
    csv_pkl('user')
    # protocol=4 for the large install table
    csv_pkl('user_installedapps', protocol=4)
    gen_demo_result()
    if not os.path.exists(feature_data_path):
        os.mkdir(feature_data_path)
    if not os.path.exists(cache_pkl_path):
        os.mkdir(cache_pkl_path)