def my_eval(pre_y, valid_df, answer, phase=-1, print_mark=True):
    # build the submission csv
    valid_submit = utils.save_pre_as_submit_format_csv(valid_df, pre_y)
    submit_csv_path = utils.save(valid_submit, file_dir='./cache/tmp_phase_submit')

    # build the ground-truth csv
    valid_answer = valid_df.loc[:, ['user_id']].drop_duplicates(['user_id'], keep='first')
    # in answer the user column is unique and the item column is unique as well
    valid_answer = valid_answer.merge(answer, on='user_id', how='left')
    valid_answer_save_path = './cache/tmp_phase_submit/valid_answer.csv'
    valid_answer = valid_answer[['phase_id', 'user_id', 'item_id', 'item_deg']]
    valid_answer.to_csv(valid_answer_save_path, index=False, header=False)

    score, \
    ndcg_50_full, ndcg_50_half, \
    hitrate_50_full, hitrate_50_half = evaluate(submit_csv_path,
                                                valid_answer_save_path,
                                                recall_num=None)

    if print_mark:
        print(
            'phase:{}, score:{}, ndcg_50_full:{}, ndcg_50_half:{}, hitrate_50_full:{}, hitrate_50_half:{}'
            .format(phase, score, ndcg_50_full, ndcg_50_half,
                    hitrate_50_full, hitrate_50_half))

    return np.array(
        [score, ndcg_50_full, ndcg_50_half,
         hitrate_50_full, hitrate_50_half]).reshape(-1, )
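# A minimal sketch (not part of the original pipeline) showing how the 5-element
# metric array returned by my_eval can be accumulated over phases. The name
# `per_phase_inputs` and its (valid_df, valid_x, answer) layout are hypothetical.
def my_eval_all_phases(model, per_phase_inputs):
    total = np.zeros(5)
    for phase, (valid_df, valid_x, answer) in per_phase_inputs.items():
        # positive-class probability is used as the ranking score, as elsewhere in this file
        pre_y = model.predict_proba(valid_x)[:, 1]
        total += my_eval(pre_y, valid_df, answer, phase=phase)
    # summed score / ndcg_50_full / ndcg_50_half / hitrate_50_full / hitrate_50_half
    return total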
        'item_id': str})
        if conf.subsampling:
            recall_feature_df = recall_feature_df[
                recall_feature_df['user_id'].isin(
                    one_phase_recall_item_df['user_id'])]
        print('load recall features: phase:{} shape:{}'.format(
            phase, recall_feature_df.shape[0]))
    else:
        # featuring
        recall_feature_df = do_featuring(
            all_phase_click_no_qtime, recall_sample_df, hot_df,
            conf.process_num, item_txt_embedding_dim, is_recall=True,
            feature_caching_path=conf.recall_feature_path.format(phase))

    submit_x = recall_feature_df[recall_feature_df.columns.difference(
        ['user_id', 'item_id', 'label'])].values
    submit_pre_y = model.predict_proba(submit_x)[:, 1]

    submit = utils.save_pre_as_submit_format_csv(recall_sample_df, submit_pre_y)
    submit_all = submit_all.append(submit)

print('--------------------------- save prediction file --------------------------')
utils.save(submit_all, 50)

# todo analyze how click time is segmented across the different phases
# todo decide how to handle users whose click depth exceeds 10
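# Sketch for the first todo above (illustrative only): per-phase click-time ranges,
# to see how the phases partition the timeline. Guarded because the 'phase' and
# 'time' column names are assumptions about the click-log schema, not confirmed here.
if {'phase', 'time'}.issubset(all_phase_click_no_qtime.columns):
    phase_time_stats = all_phase_click_no_qtime.groupby('phase')['time'].agg(
        ['min', 'max', 'count'])
    print('per-phase click time ranges:\n{}'.format(phase_time_stats))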
print('------------------------ model training start time:{}'.format(time_str))
# submit = train_model_lgb(feature_all, recall_rate=hit_rate, hot_list=hot_list,
#                          valid=0.2, topk=50, num_boost_round=1, early_stopping_rounds=1)
# submit = train_model_rf(train_test, recall_rate=1, hot_list=hot_list, valid=0.2, topk=50)
model = rank_rf(train_x, train_y)
# model = rank_xgb(train_x, train_y)
# joblib.dump(model, './cache/rf.pkl')

print('------------------------ model validation ----------------------------')
time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
print(
    '------------------------ model validation part1: validation on the sampled set start time:{}'.format(
        time_str))
pre_y = model.predict_proba(valid_x)[:, 1]

# build the submission csv
valid_submit = save_pre_as_submit_format_csv(valid_data, pre_y)
submit_csv_path = utils.save(valid_submit, file_dir=temp_result_path)

# build the ground-truth csv
valid_answer = valid_data.loc[:, ['user_id']].drop_duplicates(
    ['user_id'], keep='first')
# in answer the user column is unique and the item column is unique as well
valid_answer = valid_answer.merge(debias_track_answer, on='user_id', how='left')
valid_answer_save_path = './cache/tmp_phase_submit/valid_answer.csv'
valid_answer = valid_answer[[
    'phase_id', 'user_id', 'item_id', 'item_deg'
]]
valid_answer.to_csv(valid_answer_save_path, index=False, header=False)

score, \