def main():
    """Invert the scores of a grouped ranked list and write the re-ranked result.

    Command-line arguments:
        sys.argv[1]: path of the ranked list to read.
        sys.argv[2]: path the rewritten ranked list is saved to.

    Every entry's score ``s`` becomes ``1 - s``. Within each query the entries
    are then ordered by the new score, highest first, and given ranks counted
    from 0. All output entries carry the run name "Reverse".
    """
    input_path, save_path = sys.argv[1], sys.argv[2]
    run_name = "Reverse"
    grouped: Dict[str, List[TrecRankedListEntry]] = load_ranked_list_grouped(input_path)

    def rerank_with_flipped_scores(entries):
        # Stable descending sort: ties keep the order they had in the input.
        flipped = sorted(
            ((e.query_id, e.doc_id, 1 - e.score) for e in entries),
            key=lambda triple: triple[2],
            reverse=True,
        )
        return [
            TrecRankedListEntry(query_id, doc_id, rank, score, run_name)
            for rank, (query_id, doc_id, score) in enumerate(flipped)
        ]

    new_entries: Dict[str, List[TrecRankedListEntry]] = {
        qid: rerank_with_flipped_scores(ranked_list)
        for qid, ranked_list in grouped.items()
    }
    flat_entries: Iterable[TrecRankedListEntry] = flatten(new_entries.values())
    write_trec_ranked_list_entry(flat_entries, save_path)
def main(config):
    """Score evaluation candidates per query and write them as a TREC ranked list.

    Args:
        config: mapping with the keys ``split``, ``top_k``, ``word_prob_path``,
            ``run_name`` and ``save_path``. ``top_k`` must be 50 or 1000; it
            selects which candidate loader is used.

    Raises:
        ValueError: if ``top_k`` is neither 50 nor 1000.
        KeyError: if a query id in the loaded word-probability data has no
            candidates.
    """
    split = config['split']
    top_k = config['top_k']
    word_prob_path = config['word_prob_path']
    run_name = config['run_name']
    save_path = config['save_path']

    if top_k == 50:
        candidate_d: Dict[str, List[QCKCandidate]] = get_eval_candidates_as_qck(split)
    elif top_k == 1000:
        candidate_d: Dict[str, List[QCKCandidate]] = get_eval_candidates_1k_as_qck(split)
    else:
        # An explicit raise (rather than `assert False`) still fires under
        # `python -O`, where assert statements are removed.
        raise ValueError(
            "Unsupported top_k: {!r} (expected 50 or 1000)".format(top_k))

    # NOTE(review): this unpickles the file at word_prob_path; only use it on
    # trusted files.
    per_query_infos: Dict[str, Dict[WordAsID, np.array]] = load_pickle_from(word_prob_path)

    all_ranked_list_entries = []
    for query_id, d in per_query_infos.items():
        scorer = Scorer(d, True)
        candidates: List[QCKCandidate] = candidate_d[query_id]
        entries = [(c.id, scorer.score(c.text)) for c in candidates]
        # Highest score first before converting to ranked-list entries.
        entries.sort(key=get_second, reverse=True)
        ranked_list_entries = scores_to_ranked_list_entries(entries, run_name, query_id)
        all_ranked_list_entries.extend(ranked_list_entries)

    write_trec_ranked_list_entry(all_ranked_list_entries, save_path)
def main():
    """Fuse two ranked lists query-by-query and save the combined ranking.

    The JSON file named by ``sys.argv[1]`` must provide the keys
    ``first_list``, ``second_list``, ``run_name``, ``strategy``, ``save_path``,
    ``k1`` and ``k2``. ``strategy`` is either "reciprocal" or "weighted_sum".

    Only queries present in the first list are written. A query that is
    missing from the second list keeps its first-list entries unchanged; a
    query that appears only in the second list triggers a warning and is
    dropped.

    Raises:
        ValueError: if a query shared by both lists has to be fused and
            ``strategy`` is not one of the supported names.
    """
    # Context manager so the config file handle is closed deterministically.
    with open(sys.argv[1], "r") as config_file:
        run_config = json.load(config_file)

    l1: Dict[str, List[TrecRankedListEntry]] = load_ranked_list_grouped(run_config['first_list'])
    l2: Dict[str, List[TrecRankedListEntry]] = load_ranked_list_grouped(run_config['second_list'])
    run_name = run_config['run_name']
    strategy = run_config['strategy']
    save_path = run_config['save_path']
    k1 = run_config['k1']
    k2 = run_config['k2']

    for key in l2:
        if key not in l1:
            print("WARNING qid {} is not in the first list".format(key))

    # Build a fresh dict instead of aliasing (and mutating) l1 while iterating it.
    new_entries: Dict[str, List[TrecRankedListEntry]] = {}
    for qid, entries1 in l1.items():
        if qid not in l2:
            new_entries[qid] = entries1
            continue

        entries2 = l2[qid]
        if strategy == "reciprocal":
            fused_scores = reciprocal_fusion(entries1, entries2, k1, k2)
        elif strategy == "weighted_sum":
            fused_scores = weighted_sum_fusion(entries1, entries2, k1, k2)
        else:
            # Explicit raise: `assert False` would be stripped under `python -O`.
            raise ValueError("Unknown fusion strategy: {!r}".format(strategy))
        new_entries[qid] = scores_to_ranked_list_entries(fused_scores, run_name, qid)

    flat_entries: Iterable[TrecRankedListEntry] = flatten(new_entries.values())
    write_trec_ranked_list_entry(flat_entries, save_path)
def save_bm25_as_trec_format():
    """Run BM25 prediction over the training claims and save it in TREC format.

    Candidates are fetched for each claim (used as a query) with ``top_k``
    fixed at 200. The predictions are written under the run name "bm25" to
    ``<output_path>/ranked_list/bm25.txt``.
    """
    top_k = 200
    claim_ids: List[int] = list(load_train_claim_ids())
    claims = get_claims_from_ids(claim_ids)

    queries = claim_as_query(claims)
    candidate_dict: List[Tuple[int, List[int]]] = get_eval_candidates_w_q_text(queries, top_k)

    bm25_module = get_bm25_module()
    pred = predict_by_bm25_from_candidate(bm25_module, claims, candidate_dict, top_k)

    save_path = os.path.join(output_path, "ranked_list", "bm25.txt")
    write_trec_ranked_list_entry(prediction_to_trec_format(pred, "bm25"), save_path)
def save_to_common_path(pred_file_path, info_file_path, run_name, max_entry):
    """Summarize prediction scores and save them as ``<run_name>.txt``.

    Args:
        pred_file_path: path of the prediction file to read.
        info_file_path: path of the info file passed to ``summarize_score``.
        run_name: run name for the ranked list; also the output file stem.
        max_entry: forwarded to ``scrore_d_to_trec_style_predictions``.

    The file is written into ``<output_path>/ranked_list``, which is created
    if it does not exist yet.
    """
    print("Reading from :", pred_file_path)
    scores = summarize_score(info_file_path, pred_file_path)
    trec_entries = scrore_d_to_trec_style_predictions(scores, run_name, max_entry)

    target_dir = os.path.join(output_path, "ranked_list")
    exist_or_mkdir(target_dir)
    target_path = os.path.join(target_dir, run_name + ".txt")

    write_trec_ranked_list_entry(trec_entries, target_path)
    print("Saved at : ", target_path)
def main():
    """Truncate every query's ranked list to its first ``k`` entries.

    Command-line arguments:
        sys.argv[1]: path of the ranked list to read.
        sys.argv[2]: path the truncated ranked list is written to.
        sys.argv[3]: ``k``, parsed as an integer.
    """
    ranked_list_path = sys.argv[1]
    output_path = sys.argv[2]
    k = int(sys.argv[3])

    grouped: Dict[str, List[TrecRankedListEntry]] = load_ranked_list_grouped(
        ranked_list_path)
    # Entries keep their per-query order; only the slice beyond k is dropped.
    truncated = [entry
                 for entries in grouped.values()
                 for entry in entries[:k]]
    write_trec_ranked_list_entry(truncated, output_path)
def save_to_common_path(pred_file_path: str,
                        info_file_path: str,
                        run_name: str,
                        input_type: str,
                        max_entry: int,
                        score_type: str,
                        shuffle_sort: bool):
    """Build a ranked list from a prediction file and save it as ``<run_name>.txt``.

    Args:
        pred_file_path: path of the prediction file to summarize.
        info_file_path: location of the info JSONs to load and combine.
        run_name: used as the output file stem.
        input_type: selects the format handler via ``get_format_handler``.
        max_entry: accepted for signature compatibility; not used here.
        score_type: forwarded to ``summarize_score``.
        shuffle_sort: accepted for signature compatibility; not used here.

    The file is written into ``<output_path>/ranked_list``, which is created
    if it does not exist yet.
    """
    handler = get_format_handler(input_type)
    info: Dict = load_combine_info_jsons(info_file_path,
                                         handler.get_mapping(),
                                         handler.drop_kdp())
    print("Info has {} entries".format(len(info)))

    trec_entries = summarize_score(info, pred_file_path, handler, score_type)

    target_dir = os.path.join(output_path, "ranked_list")
    exist_or_mkdir(target_dir)
    target_path = os.path.join(target_dir, run_name + ".txt")
    write_trec_ranked_list_entry(trec_entries, target_path)
    print("Saved at : ", target_path)
def main():
    """Merge two ranked lists, preferring the first list for each query.

    Command-line arguments:
        sys.argv[1]: path of the preferred ranked list.
        sys.argv[2]: path of the fallback ranked list.
        sys.argv[3]: path the merged ranked list is written to.

    A query's entries come from the first list whenever that list contains
    the query; queries found only in the second list are appended after them.
    """
    first_list_path, second_list_path, save_path = sys.argv[1], sys.argv[2], sys.argv[3]
    print("Use {} if available, if not use {}".format(first_list_path, second_list_path))

    preferred: Dict[str, List[TrecRankedListEntry]] = load_ranked_list_grouped(
        first_list_path)
    fallback: Dict[str, List[TrecRankedListEntry]] = load_ranked_list_grouped(
        second_list_path)

    merged: Dict[str, List[TrecRankedListEntry]] = dict(preferred)
    for qid, entries in fallback.items():
        # setdefault only inserts when the preferred list lacks this query.
        merged.setdefault(qid, entries)

    flat_entries: Iterable[TrecRankedListEntry] = flatten(merged.values())
    write_trec_ranked_list_entry(flat_entries, save_path)
def main():
    """Keep only the first list's queries that also appear in the second list.

    Command-line arguments:
        sys.argv[1]: path of the ranked list whose entries are kept.
        sys.argv[2]: path of the ranked list that defines the allowed queries.
        sys.argv[3]: path the filtered ranked list is written to.
    """
    first_list_path, second_list_path, save_path = sys.argv[1], sys.argv[2], sys.argv[3]
    print("From {} select query that are in {}".format(first_list_path, second_list_path))

    source: Dict[str, List[TrecRankedListEntry]] = load_ranked_list_grouped(
        first_list_path)
    allowed: Dict[str, List[TrecRankedListEntry]] = load_ranked_list_grouped(
        second_list_path)

    # Entries always come from the first list; the second list is only a filter.
    selected: Dict[str, List[TrecRankedListEntry]] = {
        qid: entries for qid, entries in source.items() if qid in allowed
    }

    flat_entries: Iterable[TrecRankedListEntry] = flatten(selected.values())
    write_trec_ranked_list_entry(flat_entries, save_path)
def main():
    """Re-rank the dev-split documents of each query with BM25 and save the result.

    For every query in the loaded ranked list, the query's documents are
    loaded, scored with the BM25 module against ``title + " " + body``, and
    sorted by score (highest first). Ranks are counted from 0 and entries
    carry the run name "BM25_df100". A message is printed when some document
    ids from the ranked list are absent from the loaded documents.

    Processing stops once more than 100 * 100 entries have been collected.
    NOTE(review): this looks like a deliberate cap on the number of processed
    queries -- confirm before relying on the output being complete.
    """
    split = "dev"
    run_name = "BM25_df100"
    query_d = dict(load_queries(split))
    bm25_module = get_bm25_module()
    ranked_list_path = at_working_dir("msmarco-doc{}-top100".format(split))
    rlg = load_ranked_list_grouped(ranked_list_path)
    save_path = at_output_dir("ranked_list", "mmd_dev_{}.txt".format(run_name))

    te = TimeEstimator(100)
    out_entries = []
    for query_id, entries in rlg.items():
        docs = load_per_query_docs(query_id, None)

        # A set gives O(1) membership checks; the original scanned a list
        # once per ranked-list entry.
        found_doc_ids = {d.doc_id for d in docs}
        num_not_found = sum(1 for e in entries if e.doc_id not in found_doc_ids)
        if num_not_found:
            print("{} docs not found".format(num_not_found))

        query_text = query_d[QueryID(query_id)]
        scored_docs = [
            (doc, bm25_module.score(query_text, doc.title + " " + doc.body))
            for doc in docs
        ]
        scored_docs.sort(key=get_second, reverse=True)

        # `doc_score` instead of `score`: the original rebound the name of its
        # nested scoring function to a float inside this loop.
        out_entries.extend(
            TrecRankedListEntry(query_id, doc.doc_id, rank, doc_score, run_name)
            for rank, (doc, doc_score) in enumerate(scored_docs)
        )

        te.tick()
        if len(out_entries) > 100 * 100:
            break

    write_trec_ranked_list_entry(out_entries, save_path)
def save_to_common_path(pred_file_path: str,
                        info_file_path: str,
                        run_name: str,
                        input_type: str,
                        max_entry: int,
                        combine_strategy: str,
                        score_type: str,
                        shuffle_sort: bool):
    """Convert a prediction file into a TREC ranked list saved as ``<run_name>.txt``.

    Args:
        pred_file_path: path of the prediction file.
        info_file_path: location of the info JSONs to load and combine.
        run_name: run name for the ranked list; also the output file stem.
        input_type: selects the format handler via ``get_format_handler``.
        max_entry: forwarded to ``scrore_d_to_trec_style_predictions``.
        combine_strategy: forwarded to ``get_score_d``.
        score_type: forwarded to ``get_score_d``.
        shuffle_sort: forwarded to ``scrore_d_to_trec_style_predictions``.

    The file is written into ``<output_path>/ranked_list``, which is created
    if it does not exist yet. Progress messages are emitted through ``tprint``.
    """
    tprint("Reading info...")
    handler = get_format_handler(input_type)
    info: Dict = load_combine_info_jsons(info_file_path,
                                         handler.get_mapping(),
                                         handler.drop_kdp())
    tprint("Info has {} entries".format(len(info)))

    scores = get_score_d(pred_file_path, info, handler, combine_strategy, score_type)
    trec_entries = scrore_d_to_trec_style_predictions(scores, run_name, max_entry, shuffle_sort)

    target_dir = os.path.join(output_path, "ranked_list")
    exist_or_mkdir(target_dir)
    target_path = os.path.join(target_dir, run_name + ".txt")
    write_trec_ranked_list_entry(trec_entries, target_path)
    tprint("Saved at : ", target_path)
def save_over_multiple_files(pred_file_list: List[str],
                             info_file_path: str,
                             run_name: str,
                             input_type: str,
                             max_entry: int,
                             combine_strategy: str,
                             score_type: str):
    """Combine scores from several prediction files into one TREC ranked list.

    Args:
        pred_file_list: prediction files, processed in the given order.
        info_file_path: location of the info JSONs to load and combine.
        run_name: run name for the ranked list; also the output file stem.
        input_type: selects the format handler via ``get_format_handler``.
        max_entry: forwarded to ``scrore_d_to_trec_style_predictions``.
        combine_strategy: forwarded to ``get_score_d``.
        score_type: forwarded to ``get_score_d``.

    Score dictionaries are merged with ``dict.update``, so when two files
    produce the same key the later file's value wins. The result is written
    into ``<output_path>/ranked_list``, which is created if it does not exist.
    """
    handler = get_format_handler(input_type)
    info: Dict = load_combine_info_jsons(info_file_path,
                                         handler.get_mapping(),
                                         handler.drop_kdp())
    print("Info has {} entries".format(len(info)))

    merged_scores = {}
    for pred_file_path in pred_file_list:
        merged_scores.update(
            get_score_d(pred_file_path, info, handler, combine_strategy, score_type))

    trec_entries = scrore_d_to_trec_style_predictions(merged_scores, run_name, max_entry)

    target_dir = os.path.join(output_path, "ranked_list")
    exist_or_mkdir(target_dir)
    target_path = os.path.join(target_dir, run_name + ".txt")
    write_trec_ranked_list_entry(trec_entries, target_path)
    print("Saved at : ", target_path)
def main():
    """Export the candidates of the "dev" and "test" claims as TREC ranked lists.

    For each split, every claim's candidates become ranked-list entries: the
    claim's ``cId`` is the query id, the candidate's ``pid`` is the document
    id, the rank is the candidate's position (counted from 0) and the score is
    the candidate's ``score``. The run name is "es". One file per split is
    written to ``<output_path>/ranked_list/pc_es_<split>.txt``.

    Each claim is asserted to have at most 50 candidates, and its candidate
    count is printed.
    """
    run_name = "es"
    for split in ["dev", "test"]:
        claims = load_claims_for_sub_split(split)
        candidates_data: List[Tuple[Dict, List[Dict]]] = get_all_candidate(claims)

        flat_entries = []
        for claim, candidates in candidates_data:
            assert len(candidates) <= 50
            print(len(candidates))
            query_id = str(claim["cId"])
            flat_entries.extend(
                TrecRankedListEntry(query_id, str(cand['pid']), rank, cand['score'], run_name)
                for rank, cand in enumerate(candidates)
            )

        file_name = "pc_es_{}.txt".format(split)
        save_path = os.path.join(output_path, "ranked_list", file_name)
        write_trec_ranked_list_entry(flat_entries, save_path)
def main2():
    """Drop documents retrieved by the second ranked list from two other lists.

    Command-line arguments:
        sys.argv[1]: first ranked list to filter (written to
            "bm25equi_proposed.txt").
        sys.argv[2]: ranked list whose documents are removed from the others.
        sys.argv[3]: second ranked list to filter (written to
            "bm25equi_bert.txt").
        sys.argv[4]: qrels file.

    For each query of the first list, entries whose document also appears in
    the sys.argv[2] list for that query are dropped and the survivors are
    re-ranked from 0, keeping their original score and run name.
    """
    rlg_proposed_tfidf = load_ranked_list_grouped(sys.argv[1])
    rlg_proposed_bm25 = load_ranked_list_grouped(sys.argv[2])
    rlg_bert_tfidf = load_ranked_list_grouped(sys.argv[3])
    qrel: QRelsDict = load_qrels_structured(sys.argv[4])

    def drop_and_rerank(entries, excluded_doc_ids):
        kept = (e for e in entries if e.doc_id not in excluded_doc_ids)
        return [
            TrecRankedListEntry(e.query_id, e.doc_id, idx, e.score, e.run_name)
            for idx, e in enumerate(kept)
        ]

    flat_etr1 = []
    flat_etr3 = []
    for q, entries1 in rlg_proposed_tfidf.items():
        entries2 = rlg_proposed_bm25[q]
        entries3 = rlg_bert_tfidf[q]
        docs2 = set(map(TrecRankedListEntry.get_doc_id, entries2))

        # The looked-up value is unused; the access is kept so a query that is
        # missing from the qrels still fails here, as before.
        _ = qrel[q]

        flat_etr1.extend(drop_and_rerank(entries1, docs2))
        flat_etr3.extend(drop_and_rerank(entries3, docs2))

    write_trec_ranked_list_entry(flat_etr1, "bm25equi_proposed.txt")
    write_trec_ranked_list_entry(flat_etr3, "bm25equi_bert.txt")
def save_to_common_path(run_name, score_d):
    """Write ``score_d`` as a TREC ranked list named ``<run_name>.txt``.

    Args:
        run_name: run name for the ranked list; also the output file stem.
        score_d: scores passed to ``scrore_d_to_trec_style_predictions``.

    The file goes to ``<output_path>/perspective_ranked_list``. The directory
    is not created here.
    """
    trec_entries = scrore_d_to_trec_style_predictions(score_d, run_name)
    file_name = run_name + ".txt"
    target_path = os.path.join(output_path, "perspective_ranked_list", file_name)
    write_trec_ranked_list_entry(trec_entries, target_path)