Пример #1
0
def main():
    """Invert the scores of a ranked list and write the re-ranked result.

    Reads a grouped ranked list from the path in ``sys.argv[1]``, replaces
    every score ``s`` with ``1 - s``, re-sorts each query's entries by the new
    score in descending order, assigns ranks starting at 0, and writes all
    entries to the path in ``sys.argv[2]`` under the run name ``"Reverse"``.
    """
    input_path = sys.argv[1]
    save_path = sys.argv[2]
    run_name = "Reverse"
    grouped: Dict[str, List[TrecRankedListEntry]] = load_ranked_list_grouped(input_path)

    reversed_lists: Dict[str, List[TrecRankedListEntry]] = {}
    for qid, original_entries in grouped.items():
        # (query_id, doc_id, inverted score) triples, highest new score first.
        inverted = sorted(
            ((entry.query_id, entry.doc_id, 1 - entry.score)
             for entry in original_entries),
            key=lambda triple: triple[2],
            reverse=True,
        )
        reversed_lists[qid] = [
            TrecRankedListEntry(query_id, doc_id, rank, score, run_name)
            for rank, (query_id, doc_id, score) in enumerate(inverted)
        ]

    flat_entries: Iterable[TrecRankedListEntry] = flatten(reversed_lists.values())
    write_trec_ranked_list_entry(flat_entries, save_path)
Пример #2
0
def main(config):
    """Score evaluation candidates per query and write them as a ranked list.

    Args:
        config: Mapping with the keys ``'split'``, ``'top_k'``,
            ``'word_prob_path'``, ``'run_name'`` and ``'save_path'``.
            ``'top_k'`` must be 50 or 1000; it selects which candidate
            loader is used.

    Raises:
        ValueError: If ``config['top_k']`` is neither 50 nor 1000.
    """
    split = config['split']
    top_k = config['top_k']
    word_prob_path = config['word_prob_path']
    run_name = config['run_name']
    save_path = config['save_path']
    if top_k == 50:
        candidate_d: Dict[str, List[QCKCandidate]] = get_eval_candidates_as_qck(split)
    elif top_k == 1000:
        candidate_d: Dict[str, List[QCKCandidate]] = get_eval_candidates_1k_as_qck(split)
    else:
        # Raise explicitly rather than `assert False`: asserts are stripped
        # under `python -O`, which would leave `candidate_d` undefined.
        raise ValueError(
            "Unsupported top_k: {} (expected 50 or 1000)".format(top_k))

    per_query_infos: Dict[str, Dict[WordAsID, np.array]] = load_pickle_from(word_prob_path)

    all_ranked_list_entries = []

    for query_id, d in per_query_infos.items():
        # NOTE(review): meaning of the second Scorer argument is not visible
        # here - confirm against the Scorer definition.
        scorer = Scorer(d, True)
        candidates: List[QCKCandidate] = candidate_d[query_id]

        # (candidate id, score) pairs, best score first.
        entries = [(c.id, scorer.score(c.text)) for c in candidates]
        entries.sort(key=get_second, reverse=True)

        ranked_list_entries = scores_to_ranked_list_entries(entries, run_name, query_id)
        all_ranked_list_entries.extend(ranked_list_entries)

    write_trec_ranked_list_entry(all_ranked_list_entries, save_path)
Пример #3
0
def main():
    """Fuse two ranked lists per query and write the combined ranking.

    The JSON file named by ``sys.argv[1]`` must provide the keys
    ``first_list``, ``second_list``, ``run_name``, ``strategy``,
    ``save_path``, ``k1`` and ``k2``. Queries that appear only in the first
    list are copied through unchanged. Queries that appear only in the second
    list are reported with a warning and left out of the output.

    Raises:
        ValueError: If ``strategy`` is neither ``"reciprocal"`` nor
            ``"weighted_sum"`` when a query present in both lists is fused.
    """
    # Use a context manager so the config file handle is always closed.
    with open(sys.argv[1], "r") as config_file:
        run_config = json.load(config_file)

    l1: Dict[str, List[TrecRankedListEntry]] = load_ranked_list_grouped(run_config['first_list'])
    l2: Dict[str, List[TrecRankedListEntry]] = load_ranked_list_grouped(run_config['second_list'])
    run_name = run_config['run_name']
    strategy = run_config['strategy']
    save_path = run_config['save_path']
    k1 = run_config['k1']
    k2 = run_config['k2']

    for key in l2:
        if key not in l1:
            print("WARNING qid {} is not in the first list".format(key))

    # Build a fresh mapping instead of aliasing (and mutating) l1.
    new_entries: Dict[str, List[TrecRankedListEntry]] = {}
    for qid, entries1 in l1.items():
        if qid not in l2:
            new_entries[qid] = entries1
            continue

        entries2 = l2[qid]
        if strategy == "reciprocal":
            fused_scores = reciprocal_fusion(entries1, entries2, k1, k2)
        elif strategy == "weighted_sum":
            fused_scores = weighted_sum_fusion(entries1, entries2, k1, k2)
        else:
            # Explicit error: `assert False` would be stripped under -O.
            raise ValueError("Unknown fusion strategy: {!r}".format(strategy))
        new_entries[qid] = scores_to_ranked_list_entries(fused_scores, run_name, qid)

    flat_entries: Iterable[TrecRankedListEntry] = flatten(new_entries.values())
    write_trec_ranked_list_entry(flat_entries, save_path)
Пример #4
0
def save_bm25_as_trec_format():
    """Write BM25 predictions for the training claims as a ranked-list file.

    Loads the training claim ids and their claims, fetches 200 candidates per
    claim query, runs the BM25 prediction helper on them, converts the
    predictions with run name ``"bm25"`` and writes the result to
    ``<output_path>/ranked_list/bm25.txt``.
    """
    num_candidates = 200
    claim_ids: List[int] = list(load_train_claim_ids())
    claims = get_claims_from_ids(claim_ids)

    queries = claim_as_query(claims)
    candidates: List[Tuple[int, List[int]]] = get_eval_candidates_w_q_text(
        queries, num_candidates)

    bm25 = get_bm25_module()
    predictions = predict_by_bm25_from_candidate(
        bm25, claims, candidates, num_candidates)

    trec_entries = prediction_to_trec_format(predictions, "bm25")
    target_file = os.path.join(output_path, "ranked_list", "bm25.txt")
    write_trec_ranked_list_entry(trec_entries, target_file)
0
def save_to_common_path(pred_file_path, info_file_path, run_name, max_entry):
    """Summarize prediction scores and save them as ``<run_name>.txt``.

    The scores are summarized from ``info_file_path`` and ``pred_file_path``,
    converted to ranked-list predictions (``max_entry`` is forwarded to the
    conversion helper), and written under ``<output_path>/ranked_list``. That
    directory is passed to ``exist_or_mkdir`` before writing.
    """
    print("Reading from :", pred_file_path)
    predictions = scrore_d_to_trec_style_predictions(
        summarize_score(info_file_path, pred_file_path),
        run_name,
        max_entry,
    )

    target_dir = os.path.join(output_path, "ranked_list")
    exist_or_mkdir(target_dir)
    target_file = os.path.join(target_dir, run_name + ".txt")

    write_trec_ranked_list_entry(predictions, target_file)
    print("Saved at : ", target_file)
Пример #6
0
def main():
    """Truncate every query's ranked list to its first ``k`` entries.

    Command-line arguments: input ranked-list path, output path, and ``k``
    (parsed as an integer).
    """
    ranked_list_path = sys.argv[1]
    output_path = sys.argv[2]
    k = int(sys.argv[3])
    grouped: Dict[str, List[TrecRankedListEntry]] = load_ranked_list_grouped(
        ranked_list_path)

    # Keep the leading k entries of each query, in the grouped iteration order.
    truncated = [
        entry
        for per_query_entries in grouped.values()
        for entry in per_query_entries[:k]
    ]

    write_trec_ranked_list_entry(truncated, output_path)
Пример #7
0
def save_to_common_path(pred_file_path: str, info_file_path: str,
                        run_name: str, input_type: str, max_entry: int,
                        score_type: str, shuffle_sort: bool):
    """Summarize prediction scores into a ranked list and save it.

    The format handler for ``input_type`` supplies the mapping and the
    ``drop_kdp`` value used to load the combined info JSONs. The summarized
    ranked list is written to ``<output_path>/ranked_list/<run_name>.txt``.

    ``max_entry`` and ``shuffle_sort`` are accepted but not used here.
    """
    handler = get_format_handler(input_type)
    mapping = handler.get_mapping()
    drop_kdp = handler.drop_kdp()
    info: Dict = load_combine_info_jsons(info_file_path, mapping, drop_kdp)
    print("Info has {} entries".format(len(info)))

    predictions = summarize_score(info, pred_file_path, handler, score_type)

    target_dir = os.path.join(output_path, "ranked_list")
    exist_or_mkdir(target_dir)
    target_file = os.path.join(target_dir, run_name + ".txt")
    write_trec_ranked_list_entry(predictions, target_file)
    print("Saved at : ", target_file)
Пример #8
0
def main():
    """Merge two ranked lists, preferring the first one per query.

    Command-line arguments: first list path, second list path, output path.
    A query's entries come from the first list when it has that query;
    otherwise the entries from the second list are used.
    """
    first_list_path = sys.argv[1]
    second_list_path = sys.argv[2]
    save_path = sys.argv[3]
    print("Use {} if available, if not use {}".format(first_list_path,
                                                      second_list_path))
    primary: Dict[str, List[TrecRankedListEntry]] = load_ranked_list_grouped(
        first_list_path)
    fallback: Dict[str, List[TrecRankedListEntry]] = load_ranked_list_grouped(
        second_list_path)

    # The merged result is the first mapping itself, extended in place.
    merged: Dict[str, List[TrecRankedListEntry]] = primary
    for qid, fallback_entries in fallback.items():
        if qid in primary:
            continue
        merged[qid] = fallback_entries

    flat_entries: Iterable[TrecRankedListEntry] = flatten(merged.values())
    write_trec_ranked_list_entry(flat_entries, save_path)
Пример #9
0
def main():
    """Keep only the queries of the first list that also occur in the second.

    Command-line arguments: first list path, second list path, output path.
    The written entries always come from the first list; the second list is
    used only to decide which query ids are kept.
    """
    first_list_path = sys.argv[1]
    second_list_path = sys.argv[2]
    save_path = sys.argv[3]
    print("From {} select query that are in {}".format(first_list_path,
                                                       second_list_path))
    source: Dict[str, List[TrecRankedListEntry]] = load_ranked_list_grouped(
        first_list_path)
    selector: Dict[str, List[TrecRankedListEntry]] = load_ranked_list_grouped(
        second_list_path)

    selected: Dict[str, List[TrecRankedListEntry]] = {
        qid: entries for qid, entries in source.items() if qid in selector
    }

    flat_entries: Iterable[TrecRankedListEntry] = flatten(selected.values())
    write_trec_ranked_list_entry(flat_entries, save_path)
Пример #10
0
def main():
    """Re-rank the dev split's top-100 lists with BM25 and save the result.

    For every query in the loaded ranked list, the per-query documents are
    loaded, scored with the BM25 module on ``title + " " + body``, sorted by
    descending score and emitted with ranks starting at 0 under the run name
    ``"BM25_df100"``. The number of listed documents that could not be loaded
    is printed per query when it is non-zero.
    """
    split = "dev"
    query_d = dict(load_queries(split))
    bm25_module = get_bm25_module()
    ranked_list_path = at_working_dir("msmarco-doc{}-top100".format(split))
    run_name = "BM25_df100"
    rlg = load_ranked_list_grouped(ranked_list_path)
    save_path = at_output_dir("ranked_list", "mmd_dev_{}.txt".format(run_name))
    te = TimeEstimator(100)
    out_entries = []
    for query_id, entries in rlg.items():
        docs = load_per_query_docs(query_id, None)

        # A set gives O(1) membership tests; the original list lookup made
        # this check quadratic in the number of documents per query.
        found_doc_ids = {d.doc_id for d in docs}
        num_not_found = sum(
            1 for e in entries if e.doc_id not in found_doc_ids)
        if num_not_found:
            print("{} docs not found".format(num_not_found))

        query_text = query_d[QueryID(query_id)]

        # Scoring is inlined so that no helper named `score` can be shadowed
        # by the per-document score unpacked in the loop below.
        scored_docs = [
            (d, bm25_module.score(query_text, d.title + " " + d.body))
            for d in docs
        ]
        scored_docs.sort(key=get_second, reverse=True)

        out_entries.extend(
            TrecRankedListEntry(query_id, doc.doc_id, rank, doc_score,
                                run_name)
            for rank, (doc, doc_score) in enumerate(scored_docs)
        )
        te.tick()

        # Stop once more than 10,000 entries have been collected.
        # NOTE(review): looks like a deliberate sample-size cap - confirm.
        if len(out_entries) > 100 * 100:
            break

    write_trec_ranked_list_entry(out_entries, save_path)
Пример #11
0
def save_to_common_path(pred_file_path: str, info_file_path: str,
                        run_name: str, input_type: str, max_entry: int,
                        combine_strategy: str, score_type: str,
                        shuffle_sort: bool):
    """Build ranked-list predictions from a prediction file and save them.

    Loads the combined info JSONs using the format handler for
    ``input_type``, derives the score dictionary with ``combine_strategy`` and
    ``score_type``, converts it to ranked-list predictions (forwarding
    ``max_entry`` and ``shuffle_sort``), and writes the result to
    ``<output_path>/ranked_list/<run_name>.txt``.
    """
    tprint("Reading info...")
    handler = get_format_handler(input_type)
    mapping = handler.get_mapping()
    drop_kdp = handler.drop_kdp()
    info: Dict = load_combine_info_jsons(info_file_path, mapping, drop_kdp)
    tprint("Info has {} entries".format(len(info)))

    scores = get_score_d(pred_file_path, info, handler, combine_strategy,
                         score_type)
    predictions = scrore_d_to_trec_style_predictions(
        scores, run_name, max_entry, shuffle_sort)

    target_dir = os.path.join(output_path, "ranked_list")
    exist_or_mkdir(target_dir)
    target_file = os.path.join(target_dir, run_name + ".txt")
    write_trec_ranked_list_entry(predictions, target_file)
    tprint("Saved at : ", target_file)
Пример #12
0
def save_over_multiple_files(pred_file_list: List[str], info_file_path: str,
                             run_name: str, input_type: str, max_entry: int,
                             combine_strategy: str, score_type: str):
    """Combine the scores of several prediction files into one ranked list.

    The score dictionaries of the files in ``pred_file_list`` are merged in
    order with ``dict.update``, so a key that occurs in several files keeps
    the value from the last one. The merged scores are converted to
    ranked-list predictions and written to
    ``<output_path>/ranked_list/<run_name>.txt``.
    """
    handler = get_format_handler(input_type)
    mapping = handler.get_mapping()
    drop_kdp = handler.drop_kdp()
    info: Dict = load_combine_info_jsons(info_file_path, mapping, drop_kdp)
    print("Info has {} entries".format(len(info)))

    merged_scores = {}
    for single_pred_path in pred_file_list:
        merged_scores.update(
            get_score_d(single_pred_path, info, handler, combine_strategy,
                        score_type))

    predictions = scrore_d_to_trec_style_predictions(
        merged_scores, run_name, max_entry)

    target_dir = os.path.join(output_path, "ranked_list")
    exist_or_mkdir(target_dir)
    target_file = os.path.join(target_dir, run_name + ".txt")
    write_trec_ranked_list_entry(predictions, target_file)
    print("Saved at : ", target_file)
Пример #13
0
def main():
    """Write the retrieved candidates of the dev and test splits as ranked lists.

    For each split, every claim's candidates are emitted in their given order
    (rank = position, starting at 0) under the run name ``"es"``, and saved to
    ``<output_path>/ranked_list/pc_es_<split>.txt``. Each claim is asserted to
    have at most 50 candidates, and the candidate count is printed per claim.
    """
    run_name = "es"
    for split in ["dev", "test"]:
        claims = load_claims_for_sub_split(split)
        candidates_data: List[Tuple[Dict,
                                    List[Dict]]] = get_all_candidate(claims)

        flat_entries = []
        for claim, claim_candidates in candidates_data:
            assert len(claim_candidates) <= 50
            print(len(claim_candidates))
            query_id = str(claim["cId"])

            flat_entries.extend(
                TrecRankedListEntry(query_id, str(cand['pid']), rank,
                                    cand['score'], run_name)
                for rank, cand in enumerate(claim_candidates)
            )

        target_file = os.path.join(output_path, "ranked_list",
                                   "pc_es_{}.txt".format(split))
        write_trec_ranked_list_entry(flat_entries, target_file)
Пример #14
0
def main2():
    """Drop documents found by the second list from the first and third lists.

    Command-line arguments: three ranked-list paths followed by a qrels path.
    For each query of the first list, entries of the first and third lists
    whose document also appears in the second list for that query are
    removed, and the survivors are re-ranked from 0 in their existing order.
    The filtered lists are written to ``bm25equi_proposed.txt`` and
    ``bm25equi_bert.txt`` respectively.
    """
    rlg_proposed_tfidf = load_ranked_list_grouped(sys.argv[1])
    rlg_proposed_bm25 = load_ranked_list_grouped(sys.argv[2])
    rlg_bert_tfidf = load_ranked_list_grouped(sys.argv[3])
    qrel: QRelsDict = load_qrels_structured(sys.argv[4])

    def without_docs(entries, excluded_docs):
        # Keep entries outside `excluded_docs`, renumbering ranks from 0.
        kept = [e for e in entries if e.doc_id not in excluded_docs]
        return [
            TrecRankedListEntry(e.query_id, e.doc_id, new_rank, e.score,
                                e.run_name)
            for new_rank, e in enumerate(kept)
        ]

    proposed_out = []
    bert_out = []
    for q, proposed_entries in rlg_proposed_tfidf.items():
        bm25_entries = rlg_proposed_bm25[q]
        bert_entries = rlg_bert_tfidf[q]

        bm25_docs = {TrecRankedListEntry.get_doc_id(e) for e in bm25_entries}

        # The looked-up value is unused; the lookup is kept as in the
        # original so behaviour for queries absent from qrel is unchanged.
        qrel[q]

        proposed_out.extend(without_docs(proposed_entries, bm25_docs))
        bert_out.extend(without_docs(bert_entries, bm25_docs))

    write_trec_ranked_list_entry(proposed_out, "bm25equi_proposed.txt")
    write_trec_ranked_list_entry(bert_out, "bm25equi_bert.txt")
Пример #15
0
def save_to_common_path(run_name, score_d):
    """Convert ``score_d`` to ranked-list predictions and save them.

    The output file is ``<output_path>/perspective_ranked_list/<run_name>.txt``.
    """
    predictions = scrore_d_to_trec_style_predictions(score_d, run_name)
    file_name = run_name + ".txt"
    target_file = os.path.join(output_path, "perspective_ranked_list", file_name)
    write_trec_ranked_list_entry(predictions, target_file)