def main(): print("using second score as judgments") input_path = at_data_dir("clueweb", "2009.prels.1-50") raw_entries = [] for line in open(input_path, "r"): query_id, doc_id, s1, s2, s3 = line.split() maybe_relevance = int(s1) maybe_relevance2 = int(s2) some_float = float(s3) e = TrecRelevanceJudgementEntry(query_id, doc_id, maybe_relevance2) raw_entries.append(e) save_path = at_data_dir("clueweb", "2009.qrel_test.2.txt") write_trec_relevance_judgement(raw_entries, save_path)
def main():
    # Join MS MARCO documents with their relevant passages and print each
    # document split into the text before, inside, and after the passage span.
    todo: List[Tuple[QueryID, MSMarcoDoc]] = get_todo()
    msmarco_passage_qrel_path = at_data_dir("msmarco", "qrels.train.tsv")
    passage_qrels: QRelsDict = load_qrels_structured(msmarco_passage_qrel_path)
    try:
        passage_dict = load_from_pickle("msmarco_passage_doc_analyze_passage_dict")
    except FileNotFoundError:
        passage_dict = load_passage_dict(todo, passage_qrels)
    doc_queries = dict(load_train_queries())
    itr: Iterable[Tuple[str, MSMarcoDoc, JoinedPassage]] = join_doc_passage(todo, passage_qrels, passage_dict)
    for qid, doc, passage in itr:
        query_text = doc_queries[QueryID(qid)]
        print('query', qid, query_text)
        prev = doc.body[:passage.loc]
        passage_text = passage.text
        tail = doc.body[passage.loc + len(passage_text):]
        print("-----")
        print(prev)
        print(">>>")
        print(passage_text)
        print("<<<")
        print(tail)
        print("-----")
def main():
    # Dump claims with their gold perspective clusters to a tab-separated file,
    # claims with the most perspectives first.
    pc_data: List[Dict] = load_claim_perspective_pair()
    pc_data.sort(key=lambda e: len(e['perspectives']), reverse=True)
    gold_d: Dict[int, List[PerspectiveCluster]] = load_perspectrum_golds()
    out_f = open(at_data_dir("perspective", "claims_and_perspective.txt"), "w", encoding="utf-8")
    for e in pc_data:
        cid = e['cId']
        if not gold_d[cid]:
            continue
        text = e['text']
        rows = []
        row = [str(cid), text]
        rows.append(row)
        for pc in gold_d[cid]:
            rows.append([pc.stance_label_3, pc.stance_label_5])
            for pid in pc.perspective_ids:
                row = [perspective_getter(pid)]
                rows.append(row)
            rows.append([])
        for row in rows:
            out_f.write("\t".join(row) + "\n")
        out_f.write("\n\n\n")
def get_passage_dict(passage_ids_to_find):
    # Scan the MS MARCO passage collection once and keep only the requested ids.
    passage_ids_to_find = set(passage_ids_to_find)  # set membership is O(1)
    msmarco_passage_corpus_path = at_data_dir("msmarco", "collection.tsv")
    passage_dict = {}
    with open(msmarco_passage_corpus_path, 'r', encoding='utf8') as f:
        for line in f:
            passage_id, text = line.split("\t")
            if passage_id in passage_ids_to_find:
                passage_dict[passage_id] = text
    return passage_dict
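# Usage sketch (illustrative only; the id values below are placeholders, not ids
# checked against the actual collection). get_passage_dict returns a
# {passage_id: passage_text} mapping restricted to the requested ids.
def demo_get_passage_dict():
    wanted_ids = {"0", "1", "2"}  # placeholder passage ids
    passage_dict = get_passage_dict(wanted_ids)
    for pid, text in passage_dict.items():
        print(pid, text[:80])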
def main():
    pc_data: List[Dict] = load_claim_perspective_pair()
    out_f = open(at_data_dir("perspective", "claims.txt"), "w")
    for e in pc_data:
        cid = e['cId']
        text = e['text']
        row = [str(cid), text]
        out_f.write("\t".join(row) + "\n")
def measure_msmarco_passage():
    # Estimate the average passage length (in whitespace tokens) from the first
    # ~10000 passages of the MS MARCO collection.
    msmarco_passage_corpus_path = at_data_dir("msmarco", "collection.tsv")
    l_list = []
    with open(msmarco_passage_corpus_path, 'r', encoding='utf8') as f:
        for line in f:
            passage_id, text = line.split("\t")
            tokens = text.split()
            l_list.append(len(tokens))
            if len(l_list) > 10000:
                break
    print(average(l_list))
def load_queries(year_list: Iterable[int]) -> List[TrecQuery]:
    all_queries = []
    for year in year_list:
        query_path = at_data_dir("clueweb", "{}.topics.xml".format(year))
        xml = load_xml(query_path)
        root_tag = xml.tag
        assert str(year) in root_tag
        for topic in xml:
            qid = topic.attrib['number']
            query_type = topic.attrib['type']
            keyword_query = topic.find('query').text
            desc_query = topic.find('description').text
            query = TrecQuery(qid, query_type, keyword_query, desc_query)
            all_queries.append(query)
    return all_queries
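# Usage sketch (illustrative): load_queries takes an iterable of TREC Web Track
# years and parses the corresponding {year}.topics.xml files. The attribute
# names printed below assume TrecQuery stores its constructor arguments under
# the same names, which is not verified here.
def demo_load_queries():
    for q in load_queries([2009, 2010]):
        print(q.qid, q.query_type, q.keyword_query)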
def main():
    # Score a single run against the gold terms with per-problem precision,
    # recall, and F1, then print the macro averages.
    data_name = sys.argv[1]
    gold = load(at_data_dir("genex", "{}_gold.txt".format(data_name)), 999)
    run1 = load(sys.argv[2], 3)

    def common(pred, gold):
        return list([t for t in pred if t in gold])

    d1 = NamedAverager()
    for idx, (t1, t_gold) in enumerate(zip(run1, gold)):
        c1 = common(t1, t_gold)
        p1 = len(c1) / len(t1)
        r1 = len(c1) / len(t_gold)
        f1 = get_f1(p1, r1)
        d1['prec'].append(p1)
        d1['recall'].append(r1)
        d1['f1'].append(f1)
    print(d1.get_average_dict())
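# For reference, a minimal sketch of the harmonic-mean F1 that get_f1 is assumed
# to compute from the precision/recall pair above; the project's real helper may
# differ, e.g. in how it handles the all-zero case.
def get_f1_sketch(prec, recall):
    if prec + recall == 0:
        return 0.0
    return 2 * prec * recall / (prec + recall)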
def main(): split = "train" resource = ProcessedResource10docMulti(split) query_group: List[List[QueryID]] = load_query_group(split) msmarco_passage_qrel_path = at_data_dir("msmarco", "qrels.train.tsv") passage_qrels: QRelsDict = load_qrels_structured(msmarco_passage_qrel_path) qids = query_group[0] qids = qids[:100] pickle_name = "msmarco_passage_doc_analyze_passage_dict_evidence_loc" try: passage_dict = load_from_pickle(pickle_name) except FileNotFoundError: print("Reading passages...") passage_dict = get_passages(qids, passage_qrels) save_to_pickle(passage_dict, pickle_name) def get_rel_doc_id(qid): if qid not in resource.get_doc_for_query_d(): raise KeyError for doc_id in resource.get_doc_for_query_d()[qid]: label = resource.get_label(qid, doc_id) if label: return doc_id raise KeyError def translate_token_idx_to_sent_idx(stemmed_body_tokens_list, loc_in_body): acc = 0 for idx, tokens in enumerate(stemmed_body_tokens_list): acc += len(tokens) if loc_in_body < acc: return idx return -1 pc_tokenize = PCTokenizer() bert_tokenizer = get_tokenizer() for qid in qids: try: doc_id = get_rel_doc_id(qid) stemmed_tokens_d = resource.get_stemmed_tokens_d(qid) stemmed_title_tokens, stemmed_body_tokens_list = stemmed_tokens_d[doc_id] rel_passages = list([passage_id for passage_id, score in passage_qrels[qid].items() if score]) success = False found_idx = -1 for rel_passage_id in rel_passages: passage_text = passage_dict[rel_passage_id].strip() passage_tokens = pc_tokenize.tokenize_stem(passage_text) stemmed_body_tokens_flat = lflatten(stemmed_body_tokens_list) n, log = lcs(passage_tokens, stemmed_body_tokens_flat, True) if len(passage_tokens) > 4 and n > len(passage_tokens) * 0.7 and n > 0: success = True _, loc_in_body = log[0] sent_idx = translate_token_idx_to_sent_idx(stemmed_body_tokens_list, loc_in_body) prev = stemmed_body_tokens_flat[:loc_in_body] loc_by_bert_tokenize = len(bert_tokenizer.tokenize(" ".join(prev))) print(sent_idx, loc_in_body, loc_by_bert_tokenize, len(stemmed_body_tokens_list)) found_idx = sent_idx if not success: print("Not found. doc_lines={} passage_len={}".format(len(stemmed_body_tokens_list), len(passage_tokens))) except KeyError: pass
def load_qrels_for(year) -> QRelsDict:
    qrel_path = at_data_dir("clueweb", "{}.qrels.txt".format(year))
    return load_qrels_structured(qrel_path)
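# Usage sketch (illustrative): QRelsDict is assumed to map query id to a
# {doc_id: relevance label} dict, matching how passage_qrels[qid].items() is
# used above; count_relevant is a hypothetical helper, not part of the repo.
def count_relevant(year) -> int:
    qrels = load_qrels_for(year)
    return sum(1 for per_query in qrels.values()
               for label in per_query.values() if label > 0)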
def main(): # run1 = load(at_output_dir("genex", "textrank.txt"), 3) # run2 = load(at_output_dir("genex", "textrank-ts.txt"), 3) problems = load_as_tokens("tdlt") run1 = load(at_output_dir("genex", "1"), 3) run2 = load(at_output_dir("genex", "2_ts"), 3) gold = load(at_data_dir("genex", "tdlt_gold.txt"), 999) def common(pred, gold): return list([t for t in pred if t in gold]) n_correct_1 = 0 n_correct_2 = 0 d1 = NamedAverager() d2 = NamedAverager() for idx, (t1, t2, t_gold, problem) in enumerate(zip(run1, run2, gold, problems)): c1 = common(t1, t_gold) c2 = common(t2, t_gold) p1 = len(c1) / len(t1) r1 = len(c1) / len(t_gold) f1 = get_f1(p1, r1) d1['prec'].append(p1) d1['recall'].append(r1) d1['f1'].append(f1) p2 = len(c2) / len(t2) r2 = len(c2) / len(t_gold) f2 = get_f1(p2, r2) d2['prec'].append(p2) d2['recall'].append(r2) d2['f1'].append(f2) n_correct_1 += len(c1) n_correct_2 += len(c2) if len(c1) != len(c2): print() print(">> Problem ", idx) print("textrank :", c1) print("textrank-ts :", c2) q_match = len(common(problem.query, problem.doc)) n_q = len(problem.query) if len(c1) < len(c2): d2['q_match_rate'].append(q_match / n_q) else: d1['q_match_rate'].append(q_match / n_q) print('query: ', problem.query) print("matching query terms: ", common(problem.query, problem.doc)) print('doc: ', " ".join(problem.doc)) print("{} vs {}".format(n_correct_1, n_correct_2)) print(d1.get_average_dict()) print(d2.get_average_dict()) print(d1.avg_dict['q_match_rate'].history) print(d2.avg_dict['q_match_rate'].history)