Пример #1
0
    def __init__(self, split):
        """Load the per-split resources and keep them on the instance.

        The loaders are invoked in this order: ``load_query_group``,
        ``load_msmarco_simple_qrels``, ``load_queries`` and finally
        ``get_tokenizer``.

        Args:
            split: Identifier of the data split; forwarded unchanged to
                every loader.
        """
        self.query_group: List[List[QueryID]] = load_query_group(split)
        self.qrel: SimpleQrel = load_msmarco_simple_qrels(split)
        self.split = split
        # load_queries() output is materialised into a dict for lookups.
        # NOTE: the attribute name "queires" (sic) is kept as-is because
        # code outside this method may read it.
        self.queires = dict(load_queries(split))
        self.tokenizer = get_tokenizer()
Пример #2
0
    def __init__(self, split, load_candidate_doc_list_fn):
        """Load the per-split resources, including candidate documents.

        Args:
            split: Identifier of the data split; forwarded unchanged to
                every loader.
            load_candidate_doc_list_fn: Callable invoked as
                ``load_candidate_doc_list_fn(split)``; its result is stored
                as ``candidate_doc_d`` (annotated as a mapping from query
                id to a list of strings).
        """
        query_group: List[List[QueryID]] = load_query_group(split)
        candidate_docs_d: Dict[QueryID, List[str]] = load_candidate_doc_list_fn(split)
        qrel: SimpleQrel = load_msmarco_simple_qrels(split)

        self.split = split
        # NOTE: the attribute name "queires" (sic) is kept as-is because
        # code outside this method may read it.
        self.queires = dict(load_queries(split))
        self.query_group = query_group
        # The tokenizer is created exactly once; the original built it a
        # second time at the end and discarded the first instance.
        self.tokenizer = get_tokenizer()
        self.candidate_doc_d: Dict[QueryID, List[str]] = candidate_docs_d
        self.qrel = qrel
Пример #3
0
    def __init__(self, split):
        """Run the base initializer, then load this class's resources.

        Candidate documents come from ``load_candidate_doc_list_10``.

        Args:
            split: Identifier of the data split; forwarded unchanged to the
                base initializer and to every loader.
        """
        super(ProcessedResource10doc, self).__init__(split)
        # NOTE(review): the base __init__ is not visible here; if it already
        # sets split/queires/query_group/qrel/tokenizer, the loads below
        # repeat that work -- confirm before removing any of them.
        query_group: List[List[QueryID]] = load_query_group(split)
        candidate_docs_d: Dict[QueryID, List[str]] = load_candidate_doc_list_10(split)
        qrel: SimpleQrel = load_msmarco_simple_qrels(split)

        self.split = split
        # NOTE: the attribute name "queires" (sic) is kept as-is because
        # code outside this method may read it.
        self.queires = dict(load_queries(split))
        self.query_group = query_group
        # The tokenizer is created once here; the original repeated this
        # assignment at the end of the method.
        self.tokenizer = get_tokenizer()
        self.candidate_doc_d: Dict[QueryID, List[str]] = candidate_docs_d
        self.qrel = qrel
Пример #4
0
def main():
    """Re-score each query's documents with BM25 and write a ranked list.

    For every query in the grouped ranked list loaded from the working
    directory, the documents returned by ``load_per_query_docs`` are scored
    with ``bm25_module.score`` on "<title> <body>", sorted by descending
    score, and turned into ``TrecRankedListEntry`` objects with a zero-based
    rank. Processing stops once more than 10,000 entries have been
    collected; the collected entries are then written to the output path.
    """
    split = "dev"
    query_d = dict(load_queries(split))
    bm25_module = get_bm25_module()
    ranked_list_path = at_working_dir("msmarco-doc{}-top100".format(split))
    run_name = "BM25_df100"
    rlg = load_ranked_list_grouped(ranked_list_path)
    save_path = at_output_dir("ranked_list", "mmd_dev_{}.txt".format(run_name))
    te = TimeEstimator(100)
    max_entries = 100 * 100
    out_entries = []
    for query_id, entries in rlg.items():
        docs = load_per_query_docs(query_id, None)

        # Report ranked-list documents that were not among the loaded docs.
        # A set keeps the membership test O(1) per ranked entry.
        found_doc_ids = {d.doc_id for d in docs}
        num_missing = sum(1 for e in entries if e.doc_id not in found_doc_ids)
        if num_missing:
            print("{} docs not found".format(num_missing))

        query_text = query_d[QueryID(query_id)]

        # Score inline rather than through a nested ``score`` function: the
        # original loop variable of the same name rebound that function.
        scored_docs = [
            (doc, bm25_module.score(query_text, doc.title + " " + doc.body))
            for doc in docs
        ]
        scored_docs.sort(key=get_second, reverse=True)

        for rank, (doc, doc_score) in enumerate(scored_docs):
            out_entries.append(
                TrecRankedListEntry(query_id, doc.doc_id, rank, doc_score,
                                    run_name))
        te.tick()

        # Cap the amount of output; checked after each whole query so a
        # query's entries are never split.
        if len(out_entries) > max_entries:
            break

    write_trec_ranked_list_entry(out_entries, save_path)