def gen_item_stats_feature(updata=False):
    feat_path = os.path.join(feats_root, 'item_click_stats.pkl')
    if os.path.exists(feat_path) and updata == False:
        print('Found ' + feat_path)
    else:
        dfal = get_nominal_dfal()
        dfal = add_item_total_da_click(dfal)
        dfal = add_item_da_feature_click(dfal)
        print('generating ' + feat_path)
        columns_da = list(
            filter(lambda x: x.endswith('_click_da'), dfal.columns.values))
        columns_ho = list(
            filter(lambda x: x.endswith('_click_ho'), dfal.columns.values))
        tbar = tqdm(columns_da)
        for col in tbar:
            tbar.set_description('add_item_click_stats ' + col)
            dfal = gen_item_click_stats(dfal, col)
        print('add_item_click_stats completed.')
        feat_names = list(
            filter(lambda x: '_click_da_' in x, dfal.columns.values))
        dfal = dfal[feat_names + ['item_id']].drop_duplicates(['item_id'])
        print('gen_item_stats_feature shape:', dfal.shape)
        dump_pickle(dfal, feat_path)
    print('gen_item_stats_feature completed.')
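# Hedged sketch (the helper name below is hypothetical, not part of this
# repo's API) of how the item stats dumped above could be joined back onto a
# frame that carries an 'item_id' column.
def add_item_stats_example(df):
    item_stats = load_pickle(os.path.join(feats_root, 'item_click_stats.pkl'))
    return df.merge(item_stats, how='left', on='item_id')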
def gen_level_aggs(col, updata=False):
    feat_path = os.path.join(feats_root, 'level_aggs_{}.pkl'.format(col))
    if os.path.exists(feat_path) and updata == False:
        print('Found ' + feat_path)
    else:
        print('Generating ' + feat_path)
        dfal = get_nominal_dfal()[[col, 'da'] + level_cols]
        dmax = dfal.da.max()
        dmin = dfal.da.min()
        level_agg = None
        for da in sorted(dfal.da.unique())[1:]:
            da_agg = None
            for win_das in [1, 2, 3]:
                if da - win_das < dmin:
                    continue
                agg = gen_level_agg_features(dfal, da, win_das, col)
                print('Generated {} {} {}'.format(col, da, win_das))
                if da_agg is None:
                    da_agg = agg
                else:
                    da_agg = da_agg.merge(agg, how='outer')
            if level_agg is None:
                level_agg = da_agg
            else:
                level_agg = pd.concat([level_agg, da_agg], axis=0)
        level_agg.fillna(0, inplace=True)
        level_agg, _ = reduce_mem_usage(level_agg)
        print(level_agg.shape)
        dump_pickle(level_agg, feat_path)
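# Hedged sketch of the day/window scheme driven by the loops above: for each
# day `da` and window length `win_das`, only rows from the `win_das` days
# strictly before `da` are aggregated, so a day never sees its own data.
# The simple count below is illustrative; the real gen_level_agg_features
# (defined elsewhere in this repo) computes the actual level aggregates.
def window_count_example(df, da, win_das, col):
    win = df.loc[(df.da >= da - win_das) & (df.da < da)]
    agg = win.groupby(col).size().reset_index()
    agg.columns = [col, '{}_win{}_count'.format(col, win_das)]
    agg['da'] = da
    return agg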
def gen_hist_cvr_smooth(start_da, end_da, key, alpha=0.25):
    dfal = get_nominal_dfal()
    dfal = dfal.loc[dfal.da <= end_da, [key, 'da', 'is_trade']]
    gc.collect()
    for da in tqdm(np.arange(start_da, end_da + 1)):
        feat_path = os.path.join(
            feats_root, key + '_hist_cvr_smooth_da_' + str(da) + '.pkl')
        if os.path.exists(feat_path):
            print('found ' + feat_path)
        else:
            print('generating ' + feat_path)
            # use only days strictly before `da`, then additively smooth the CVR
            dfcv = dfal.copy().loc[dfal.da < da]
            dfcv.is_trade = dfcv.is_trade.apply(int)
            dfcv = pd.get_dummies(dfcv, columns=['is_trade'], prefix='label')
            dfcv = dfcv.groupby([key], as_index=False).sum()
            dfcv[key + '_cvr'] = (dfcv['label_1'] + alpha) / (
                dfcv['label_0'] + dfcv['label_1'] + alpha * 2)
            result = pd.merge(
                dfal.loc[dfal.da == da, ['da', key]],
                dfcv.loc[:, [key, key + '_cvr']],
                'left',
                on=[key])
            result.drop_duplicates(['da', key], inplace=True)
            result.sort_values(['da', key], inplace=True)
            dump_pickle(result.loc[:, ['da', key, key + '_cvr']], feat_path)
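# The ratio above is additive (Laplace-style) smoothing of the historical CVR:
#     cvr = (trades + alpha) / (trades + non_trades + 2 * alpha)
# which pulls sparse ids toward 0.5 instead of letting them sit at 0 or 1.
# Worked instance, assuming the default alpha = 0.25: 1 trade out of 3 clicks
# gives (1 + 0.25) / (3 + 0.5) ~= 0.357 instead of the raw 0.333, and 0 trades
# out of 1 click gives 0.25 / 1.5 ~= 0.167 instead of 0.
def smoothed_cvr_example(trades, non_trades, alpha=0.25):
    """Standalone form of the smoothing used in gen_hist_cvr_smooth."""
    return (trades + alpha) / (trades + non_trades + 2 * alpha)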
def gen_shop_da_feature_click(updata=False):
    """Generate per-day click counts of each attribute within each shop."""
    dfal = get_nominal_dfal()
    stats_feat = [
        'item_category_list', 'item_brand_id', 'item_city_id',
        'user_gender_id', 'user_occupation_id', 'item_price_level',
        'item_sales_level', 'item_collected_level', 'item_pv_level',
        'user_age_level', 'user_star_level', 'context_page_id', 'item_id',
        'user_id'
    ]
    tbar = tqdm(stats_feat)
    for feat in tbar:
        feat_path = os.path.join(feats_root, 'shop_' + feat + '_click_da.pkl')
        if os.path.exists(feat_path) and updata == False:
            tbar.set_description('Found {:>60}'.format(
                os.path.basename(feat_path)))
        else:
            tbar.set_description('Generating {:>60}'.format(
                os.path.basename(feat_path)))
            shop_feat_click_da = dfal.groupby(
                ['shop_id', 'da', feat]).size().reset_index().rename(
                    columns={0: 'agg_shop_%s_click_da' % feat})
            dump_pickle(shop_feat_click_da, feat_path)
    print('gen_shop_da_feature_click completed.')
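# The counting idiom used above, shown on a hypothetical toy frame: group by
# (shop_id, da, attribute), count rows with .size(), then rename the unnamed
# `0` column that reset_index() leaves behind.
def click_count_example():
    toy = pd.DataFrame({
        'shop_id': [1, 1, 2],
        'da': [18, 18, 18],
        'item_brand_id': [7, 7, 9]
    })
    return toy.groupby(['shop_id', 'da', 'item_brand_id']).size() \
        .reset_index().rename(columns={0: 'agg_shop_item_brand_id_click_da'})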
def gen_user_total_da_click(update=False):
    dfal = get_nominal_dfal()
    feat_path = os.path.join(feats_root, 'user_total_click_da.pkl')
    if os.path.exists(feat_path) and update == False:
        print('Found ' + feat_path)
    else:
        print('Generating ' + feat_path)
        user_all_click_da = (dfal.groupby(['user_id', 'da']).size()
                             .reset_index()
                             .rename(columns={0: 'agg_user_total_click_da'}))
        dump_pickle(user_all_click_da, feat_path)
    print('gen_user_total_da_click completed.')
def gen_id_global_sum_count(
        last_da=23,
        stats_feats=[
            'item_id', 'shop_id', 'user_id', 'item_brand_id', 'item_city_id',
            'hm'
        ]):
    dfal = get_nominal_dfal()
    dfal = dfal.loc[dfal.da < last_da, stats_feats]
    for feat in tqdm(stats_feats):
        feat_path = os.path.join(
            feats_root,
            'global_count_' + feat + '_lastda' + str(last_da) + '.pkl')
        if os.path.exists(feat_path):
            print('found ' + feat_path)
        else:
            print('generating ' + feat_path)
            feat_count_sum = pd.DataFrame(
                dfal.groupby(feat).size()).reset_index().rename(
                    columns={0: 'agg_' + feat + '_sum_count'})
            dump_pickle(feat_count_sum, feat_path)
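# Note on leakage: the counts above use only rows with da < last_da, so the
# later days they are presumably joined onto (via add_global_count_sum) are
# not included in their own statistics. A hedged sketch of such a join, with
# a hypothetical helper name:
def add_global_count_example(df, feat, last_da=23):
    path = os.path.join(
        feats_root,
        'global_count_' + feat + '_lastda' + str(last_da) + '.pkl')
    return df.merge(load_pickle(path), how='left', on=feat)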
def gen_target_aggs(col, updata=False):
    feat_path = os.path.join(feats_root, 'target_aggs_{}.pkl'.format(col))
    if os.path.exists(feat_path) and updata == False:
        print('Found ' + feat_path)
    else:
        print('Generating ' + feat_path)
        dfal = get_nominal_dfal()[[col, 'da', 'is_trade']]
        dmax = dfal.da.max()
        dmin = dfal.da.min()
        for da in sorted(dfal.da.unique())[1:]:
            for win_das in [1, 2, 3]:
                if da - win_das < dmin:
                    continue
                dfal = gen_target_agg_features(dfal, da, win_das, col)
        dfal = dfal.loc[dfal.da > 17, :]
        dfal.drop(['is_trade'], inplace=True, axis=1)
        dfal.drop_duplicates([col, 'da'], inplace=True)
        dfal.fillna(0, inplace=True)
        dfal, _ = reduce_mem_usage(dfal)
        dump_pickle(dfal, feat_path)
def gen_item_ho_feature_click(updata=False):
    """Generate per-day, per-hour click counts of each attribute within each item."""
    dfal = get_nominal_dfal()
    stats_feat = [
        'shop_id', 'user_id', 'user_gender_id', 'user_occupation_id',
        'user_age_level', 'user_star_level', 'context_page_id',
        'shop_review_num_level', 'shop_star_level'
    ]
    tbar = tqdm(stats_feat)
    for feat in tbar:
        feat_path = os.path.join(feats_root, 'item_' + feat + '_click_ho.pkl')
        if os.path.exists(feat_path) and updata == False:
            tbar.set_description('Found {:>60}'.format(
                os.path.basename(feat_path)))
        else:
            tbar.set_description('Generating {:>60}'.format(
                os.path.basename(feat_path)))
            item_feat_click_ho = dfal.groupby(
                ['item_id', 'da', 'ho', feat]).size().reset_index().rename(
                    columns={0: 'agg_item_%s_click_ho' % feat})
            dump_pickle(item_feat_click_ho, feat_path)
    print('gen_item_ho_feature_click completed.')
def gen_final_dataset(tr_start_da, tr_end_da, te_da=24, updata=False):
    dfal = get_nominal_dfal()
    dfal = dfal.sort_values('dt')
    user_time_delta_feature = load_pickle(
        './feats/user_time_delta_feature.pkl')
    user_last_attrs_feature = load_pickle(
        './feats/user_last_attrs_feature.pkl')
    # concat along axis=1 aligns on the index, so the two feature pickles are
    # assumed to share dfal's row index
    dfal = pd.concat(
        [dfal, user_time_delta_feature, user_last_attrs_feature], axis=1)
    print(dfal.shape)
    dftr = dfal.loc[(dfal.da >= tr_start_da) & (dfal.da <= tr_end_da)]
    tr_dump_file = './cache/final_dataset_tr_{}_{}.h5'.format(
        tr_start_da, tr_end_da)
    dftr = gen_dataset(dftr, tr_dump_file, 'tr', tr_end_da, updata)
    dfte = dfal.loc[dfal.da == te_da]
    te_dump_file = './cache/final_dataset_te_{}.h5'.format(te_da)
    dfte = gen_dataset(dfte, te_dump_file, 'te', te_da, updata)
    del dfal
    gc.collect()
    return dftr, dfte
# NOTE: this later cell redefines gen_final_dataset and supersedes the version above.
def gen_final_dataset(tr_start_da, tr_end_da, te_da=24):
    tr_dump_file = './cache/final_dataset_tr_{}_{}.h5'.format(
        tr_start_da, tr_end_da)
    te_dump_file = './cache/final_dataset_te_{}.h5'.format(te_da)
    dftr = None
    dfte = None
    if os.path.exists(tr_dump_file):
        print('Found ' + tr_dump_file)
        store = pd.HDFStore(tr_dump_file, mode='r', complevel=9)
        dftr = store['dataset']
        store.close()
    elif dftr is None:
        dfal = get_nominal_dfal()
        dftr = dfal.loc[(dfal.da >= tr_start_da) & (dfal.da <= tr_end_da)]
        print('Generating Train Dataset...')
        ##################################################################
        # add user click
        #dftr = add_user_click_stats(dftr)
        #dftr = add_user_total_da_click(dftr)
        #dftr = add_user_da_feature_click(dftr)
        #dftr = add_user_ho_feature_click(dftr)
        # add item click
        #dftr = add_item_click_stats(dftr)
        #dftr = add_item_total_da_click(dftr)
        #dftr = add_item_da_feature_click(dftr)
        #dftr = add_item_ho_feature_click(dftr)
        # add shop click
        #dftr = add_shop_click_stats(dftr)
        #dftr = add_shop_total_da_click(dftr)
        #dftr = add_shop_da_feature_click(dftr)
        #dftr = add_shop_ho_feature_click(dftr)
        # add global count sum
        dftr = add_global_count_sum(dftr, tr_end_da)
        # add smooth cvr
        for c in tqdm(ordinal_cate_cols + nominal_cate_cols + identity_cols +
                      ['hm', 'mi'], desc='add_hist_cvr_smooth'):
            dftr = add_hist_cvr_smooth(dftr, c)
        print('add_hist_cvr_smooth completed')
        #for c in tqdm(['item_id', 'shop_id', 'user_id', 'item_brand_id', 'item_city_id', 'hm', 'mi'], desc='add_target_features'):
        #    dftr = add_target_features(dftr, c)
        #print('add_target_features completed')
        #for c in tqdm(nominal_cate_cols + ['hm', 'mi', 'ho'], desc='add_level_features'):
        #    dftr = add_level_features(dftr, c)
        #print('add_level_features completed')
        print(dftr.shape)
        store = pd.HDFStore(tr_dump_file, mode='w', complevel=9)
        store['dataset'] = dftr
        store.close()
        del dfal
        gc.collect()
        print('Generated Train Dataset')

    if os.path.exists(te_dump_file):
        print('Found ' + te_dump_file)
        store = pd.HDFStore(te_dump_file, mode='r', complevel=9)
        dfte = store['dataset']
        store.close()
    elif dfte is None:
        dfal = get_nominal_dfal()
        dfte = dfal.loc[dfal.da == te_da]
        ##################################################################
        print('Generating Test Dataset...')
        # add user click
        #dfte = add_user_click_stats(dfte)
        #dfte = add_user_total_da_click(dfte)
        #dfte = add_user_da_feature_click(dfte)
        #dfte = add_user_ho_feature_click(dfte)
        # add item click
        #dfte = add_item_click_stats(dfte)
        #dfte = add_item_total_da_click(dfte)
        #dfte = add_item_da_feature_click(dfte)
        #dfte = add_item_ho_feature_click(dfte)
        # add shop click
        #dfte = add_shop_click_stats(dfte)
        #dfte = add_shop_total_da_click(dfte)
        #dfte = add_shop_da_feature_click(dfte)
        #dfte = add_shop_ho_feature_click(dfte)
        # add global count sum
        dfte = add_global_count_sum(dfte, te_da)
        # add smooth cvr
        for c in tqdm(ordinal_cate_cols + nominal_cate_cols + identity_cols +
                      ['hm', 'mi'], desc='add_hist_cvr_smooth'):
            dfte = add_hist_cvr_smooth(dfte, c)
        print('add_hist_cvr_smooth completed')
        #for c in tqdm(['item_id', 'shop_id', 'user_id', 'item_brand_id', 'item_city_id', 'hm', 'mi'], desc='add_target_features'):
        #    dfte = add_target_features(dfte, c)
        #print('add_target_features completed')
        #for c in tqdm(['item_id', 'shop_id', 'user_id', 'item_brand_id', 'item_city_id', 'hm', 'mi', 'ho'], desc='add_level_features'):
        #    dfte = add_level_features(dfte, c)
        #print('add_level_features completed')
        print(dfte.shape)
        store = pd.HDFStore(te_dump_file, mode='w', complevel=9)
        store['dataset'] = dfte
        store.close()
        del dfal
        gc.collect()
        print('Generated Test Dataset')

    #dftr.drop(unused_cols, axis=1, inplace=True)
    #dfte.drop(unused_cols, axis=1, inplace=True)
    return dftr, dfte
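# A minimal sketch of reusing the HDF5 cache written above outside of
# gen_final_dataset; the default file name (days 18-23) is only an example.
def load_cached_dataset_example(path='./cache/final_dataset_tr_18_23.h5'):
    """Illustrative only; reads the 'dataset' key the function above writes."""
    store = pd.HDFStore(path, mode='r')
    df = store['dataset']
    store.close()
    return df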
# In[31]:

cat3 = cb.CatBoostClassifier(**best_cat_params)
cat3 = cat3.fit(X_tr, y_tr, eval_set=(X_va, y_va))

# In[32]:

y_hat_cat3 = cat3.predict_proba(X_te)[:, 1]

# In[33]:

verbose_feature_importance_cat(cat3, X_tr)

# In[35]:

dfal = get_nominal_dfal()
hat = dfal.loc[dfal.da == 24, ['instance_id']]
del dfal
gc.collect()

# In[36]:

hat.shape, y_hat_cat1.shape

# In[37]:

hat['lgb1'] = y_hat_lgb1
hat['lgb2'] = y_hat_lgb2
hat['cat1'] = y_hat_cat1
hat['cat2'] = y_hat_cat2
hat['cat3'] = y_hat_cat3
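# In[ ]:

# Hedged sketch of one way the five model columns collected above could be
# blended into a single score; the plain mean, the 'predicted_score' name and
# the commented-out submission format are assumptions, not taken from this
# notebook.
blend_cols = ['lgb1', 'lgb2', 'cat1', 'cat2', 'cat3']
hat['predicted_score'] = hat[blend_cols].mean(axis=1)
#hat[['instance_id', 'predicted_score']].to_csv('submission.txt', sep=' ', index=False)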