Example #1
import json

from trectools import TrecRun, TrecEval


def report_run_per_query(qrels,
                         run_file_name,
                         remove_docs_with_zero_score=False):
    """Yield one JSON line per query with bpref, NDCG@10 and NDCG@1000 for a run."""
    run = TrecRun(run_file_name)
    system = run.run_data['system'][0]
    if remove_docs_with_zero_score:
        # Drop documents whose retrieval score is not strictly positive.
        run.run_data = run.run_data[run.run_data['score'] > 0]

    trec_eval = TrecEval(run, qrels)

    # per_query=True returns a DataFrame indexed by query id.
    bpref = trec_eval.getBpref(per_query=True)
    ndcg_10 = trec_eval.getNDCG(depth=10, per_query=True)
    ndcg = trec_eval.getNDCG(per_query=True)

    ret = bpref.join(ndcg_10, on='query')
    ret = ret.join(ndcg, on='query')

    for query, r in ret.iterrows():
        yield json.dumps({
            'corpus': extract_corpus(run_file_name),  # project-specific helper
            'topic': query,
            'tag': system,
            "bpref": r['Bpref@1000'],
            "pseudoNDCG@10": r['NDCG@10'],
            "pseudoNDCG": r['NDCG@1000']
        })
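
A minimal usage sketch, assuming the qrels and run file paths below are placeholders and that the project-specific helper extract_corpus is defined elsewhere:

from trectools import TrecQrel

qrels = TrecQrel("qrels.txt")                             # hypothetical qrels file in TREC format
for line in report_run_per_query(qrels, "my-run.txt"):    # hypothetical run file
    print(line)                                           # one JSON object per query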
Example #2
import json

from trectools import TrecRun, TrecEval


def report_run(qrels, run_file_name, remove_docs_with_zero_score=False):
    """Return a JSON string with run-level bpref, NDCG@10 and NDCG@1000."""
    run = TrecRun(run_file_name)
    system = run.run_data['system'][0]
    if remove_docs_with_zero_score:
        # Drop documents whose retrieval score is not strictly positive.
        run.run_data = run.run_data[run.run_data['score'] > 0]

    trec_eval = TrecEval(run, qrels)

    ret = {
        'corpus': extract_corpus(run_file_name),    # project-specific helpers,
        'topics': extract_topics(run_file_name),    # assumed defined elsewhere
        'tag': system,
        "bpref": trec_eval.getBpref(),
        "pseudoNDCG@10": trec_eval.getNDCG(depth=10),
        "pseudoNDCG": trec_eval.getNDCG()
    }

    return json.dumps(ret)
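
A minimal usage sketch, again with placeholder file names and assuming extract_corpus and extract_topics are available in the project:

from trectools import TrecQrel

qrels = TrecQrel("qrels.txt")              # hypothetical qrels file
print(report_run(qrels, "my-run.txt"))     # single JSON object for the whole run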
Example #3
import sys

import pandas as pd
from trectools import TrecRun


def reciprocal_rank_fusion(trec_runs, k=60, max_docs=1000, output=sys.stdout):
    """
        Implements reciprocal rank fusion as defined in
        ``Reciprocal Rank Fusion Outperforms Condorcet and Individual Rank Learning Methods`` by Cormack, Clarke and Buettcher.

        Parameters:
            k: constant that dampens the contribution of lower-ranked documents. Default value is 60 (the value used in their paper).
            max_docs: maximum number of fused documents to keep per topic.
            output: a file pointer to write the results to (writing is currently disabled; the fused run is returned instead). sys.stdout is the default.
    """

    outputRun = TrecRun()
    rows = []
    topics = trec_runs[0].topics()

    for topic in sorted(topics):
        doc_scores = {}
        for r in trec_runs:
            docs_for_run = r.get_top_documents(topic, n=1000)

            # RRF score: sum over runs of 1 / (k + rank of the document in that run).
            for pos, docid in enumerate(docs_for_run, start=1):
                doc_scores[docid] = doc_scores.get(docid,
                                                   0.0) + 1.0 / (k + pos)

        # Keep the top max_docs documents for this topic, ties broken by docid.
        for rank, (docid, score) in enumerate(sorted(iter(doc_scores.items()),
                                                     key=lambda x:
                                                     (-x[1], x[0]))[:max_docs],
                                              start=1):
            # output.write("%s Q0 %s %d %f reciprocal_rank_fusion_k=%d\n" % (str(topic), docid, rank, score, k))
            rows.append((topic, "Q0", docid, rank, score,
                         "reciprocal_rank_fusion_k=%d" % k))

    df = pd.DataFrame(rows)
    df.columns = ["query", "q0", "docid", "rank", "score", "system"]
    df["q0"] = df["q0"].astype(str)  # np.str was removed in NumPy 1.24; plain str is equivalent
    outputRun.run_data = df.copy()

    return outputRun
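
A minimal usage sketch, assuming two hypothetical run files and a hypothetical qrels file; the fused result is a TrecRun whose run_data DataFrame can be evaluated like any other run:

from trectools import TrecRun, TrecQrel, TrecEval

runs = [TrecRun("run_bm25.txt"), TrecRun("run_dense.txt")]   # hypothetical run files
fused = reciprocal_rank_fusion(runs, k=60, max_docs=1000)
print(fused.run_data.head())                                 # fused ranking as a DataFrame
print(TrecEval(fused, TrecQrel("qrels.txt")).getNDCG(depth=10))  # evaluate the fused run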