Пример #1
0
def my_eval(pre_y, valid_df, answer, phase=-1, print_mark=True):
    """Score validation predictions against the ground-truth answers.

    The predictions are converted to submit format and saved through
    ``utils``; the matching answers are written to
    ``./cache/tmp_phase_submit/valid_answer.csv``. Both file paths are then
    handed to ``evaluate`` and the five metrics it returns are collected.

    Args:
        pre_y: Predictions, passed together with ``valid_df`` to
            ``utils.save_pre_as_submit_format_csv``.
        valid_df: DataFrame containing at least a ``user_id`` column.
        answer: DataFrame joined on ``user_id``; it must supply the
            ``phase_id``, ``item_id`` and ``item_deg`` columns.
        phase: Label that only appears in the printed summary.
        print_mark: When truthy, print a one-line summary of the metrics.

    Returns:
        A flattened numpy array holding, in order: score, ndcg_50_full,
        ndcg_50_half, hitrate_50_full, hitrate_50_half.
    """
    submit_dir = './cache/tmp_phase_submit'
    truth_csv_path = './cache/tmp_phase_submit/valid_answer.csv'
    truth_columns = ['phase_id', 'user_id', 'item_id', 'item_deg']

    # Build the submit CSV from the predictions.
    submit_frame = utils.save_pre_as_submit_format_csv(valid_df, pre_y)
    submit_csv_path = utils.save(submit_frame, file_dir=submit_dir)

    # Build the truth CSV: one row per distinct validation user.
    distinct_users = valid_df.loc[:, ['user_id']].drop_duplicates(
        ['user_id'], keep='first')
    # Original author's note: in `answer` the user column is unique, and so
    # is the item column.
    truth_frame = distinct_users.merge(answer, on='user_id', how='left')
    truth_frame = truth_frame[truth_columns]
    truth_frame.to_csv(truth_csv_path, index=False, header=False)

    metrics = evaluate(submit_csv_path, truth_csv_path, recall_num=None)
    score, ndcg_50_full, ndcg_50_half, hitrate_50_full, hitrate_50_half = metrics

    if print_mark:
        template = 'phase:{}, score:{}, ndcg_50_full:{}, ndcg_50_half:{}, hitrate_50_full:{}, hitrate_50_half:{}'
        print(template.format(phase, score, ndcg_50_full, ndcg_50_half,
                              hitrate_50_full, hitrate_50_half))

    collected = np.array([
        score,
        ndcg_50_full,
        ndcg_50_half,
        hitrate_50_full,
        hitrate_50_half,
    ])
    return collected.reshape(-1)
Пример #2
0
                    'item_id': np.str
                })
            if conf.subsampling:
                recall_feature_df = recall_feature_df[
                    recall_feature_df['user_id'].isin(
                        one_phase_recall_item_df['user_id'])]
            print('load recall features: phase:{} shape:{}'.format(
                phase, recall_feature_df.shape[0]))
        else:
            # featuring
            recall_feature_df = do_featuring(
                all_phase_click_no_qtime,
                recall_sample_df,
                hot_df,
                conf.process_num,
                item_txt_embedding_dim,
                is_recall=True,
                feature_caching_path=conf.recall_feature_path.format(phase))

        submit_x = recall_feature_df[recall_feature_df.columns.difference(
            ['user_id', 'item_id', 'label'])].values
        submit_pre_y = model.predict_proba(submit_x)[:, 1]
        submit = utils.save_pre_as_submit_format_csv(recall_sample_df,
                                                     submit_pre_y)
        submit_all = submit_all.append(submit)

    print('--------------------------- 保存预测文件 --------------------------')
    utils.save(submit_all, 50)

    # todo 统计不同阶段时间分段情况
    # todo user点击深度超过10的情况怎么处理
Пример #3
0
        print('------------------------ 模型训练 start time:{}'.format(time_str))
        # submit = train_model_lgb(feature_all, recall_rate=hit_rate, hot_list=hot_list, valid=0.2, topk=50, num_boost_round=1, early_stopping_rounds=1)
        # submit = train_model_rf(train_test, recall_rate=1, hot_list=hot_list, valid=0.2, topk=50)
        model = rank_rf(train_x, train_y)
        # model = rank_xgb(train_x, train_y)
        # joblib.dump(model, './cache/rf.pkl')
        print('------------------------ 模型验证 ----------------------------')
        time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        print(
            '------------------------ 模型验证 part1:采样集合验证 start time:{}'.format(
                time_str))
        pre_y = model.predict_proba(valid_x)[:, 1]

        # 构造submit csv
        valid_submit = save_pre_as_submit_format_csv(valid_data, pre_y)
        submit_csv_path = utils.save(valid_submit, file_dir=temp_result_path)

        # 构造truth csv
        valid_answer = valid_data.loc[:, ['user_id']].drop_duplicates(
            ['user_id'], keep='first')
        # answer中user列唯一、item列也是唯一
        valid_answer = valid_answer.merge(debias_track_answer,
                                          on='user_id',
                                          how='left')
        valid_answer_save_path = './cache/tmp_phase_submit/valid_answer.csv'
        valid_answer = valid_answer[[
            'phase_id', 'user_id', 'item_id', 'item_deg'
        ]]
        valid_answer.to_csv(valid_answer_save_path, index=False, header=False)

        score, \