def qk_candidate_gen(q_res_path: str, doc_score_path, split, config) -> List[Tuple[QCKQuery, List[KDP]]]:
    queries: List[QCKQuery] = get_qck_queries(split)
    num_jobs = d_n_claims_per_split2[split]
    score_d = load_doc_scores(doc_score_path, num_jobs)

    tprint("loading ranked list")
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    query_ids = list(ranked_list.keys())
    query_ids.sort()
    print("num queries", len(query_ids))
    q_id_to_job_id = {q_id: job_id for job_id, q_id in enumerate(query_ids)}

    print("Pre loading docs")
    top_n = config['top_n']
    out_qk: List[Tuple[QCKQuery, List[KnowledgeDocumentPart]]] = []
    all_doc_parts = 0
    ticker = TimeEstimator(len(queries))
    for q in queries:
        # Sort this query's documents by score and keep the top_n doc ids.
        job_id: int = q_id_to_job_id[q.query_id]
        entries: List = score_d[job_id]
        entries.sort(key=get_second, reverse=True)
        doc_ids = left(entries)
        doc_ids = doc_ids[:top_n]
        # Preload the tokenized documents, then split them into fixed-size parts.
        preload_man.preload(TokenizedCluewebDoc, doc_ids)
        docs = iterate_docs(doc_ids)
        doc_part_list: List[KDP] = iterate_document_parts(
            docs, config['window_size'], config['step_size'], 20)
        all_doc_parts += len(doc_part_list)
        out_qk.append((q, doc_part_list))
        ticker.tick()
    return out_qk

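# Invocation sketch for the function above. The paths, split name, and config values
# are hypothetical and only mirror the keys the function actually reads ('top_n',
# 'window_size', 'step_size'); they are not taken from the source.
#
#   config = {'top_n': 10, 'window_size': 300, 'step_size': 300}
#   qk = qk_candidate_gen("q_res_dev.txt", "doc_scores_dev", "dev", config)
#   # qk is a list of (QCKQuery, List[KDP]) pairs, one entry per query.
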
def sentence_payload_gen(q_res_path: str, top_n, data_id_man: DataIDManager):
    print("loading ranked list")
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    qid_list = list(ranked_list.keys())
    qid_list = qid_list[:10]
    ranked_list = {k: ranked_list[k] for k in qid_list}
    print("Pre loading docs")
    preload_docs(ranked_list, top_n)
    entries: List[Tuple[str, bool, int]] = []

    def enum_sentence(tokens) -> Iterator[str]:
        text = " ".join(tokens)
        sents = sent_tokenize(text)
        yield from sents

    ticker = TimeEstimator(len(ranked_list))
    for qid in ranked_list:
        q_res: List[SimpleRankedListEntry] = ranked_list[qid]
        docs = iterate_docs(q_res, top_n)
        for doc in docs:
            for sent_idx, sent in enumerate(enum_sentence(doc.tokens)):
                info = {
                    'doc_id': doc.doc_id,
                    'sent_idx': sent_idx,
                    'sentence': sent
                }
                data_id = data_id_man.assign(info)
                e = sent, True, data_id
                entries.append(e)
        ticker.tick()
    return entries

def a_relevant_candidate(save_name, q_res_path, claims):
    top_n = 10
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    all_passages = []
    entries = []
    all_docs = 0
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_text = c['text']

        def get_passage_score(dummy):
            return 0

        passages: List[Tuple[List[str], float]] = iterate_passages(q_res, top_n, get_passage_score)
        all_docs += len(passages)
        all_passages.extend(passages)
        entries.append((c, passages))

    print("{} claims. {} docs ".format(len(claims), all_docs))
    data = entries, all_passages
    save_to_pickle(data, save_name)

def main(config):
    # select claims
    # load relevant documents
    # remove duplicates
    q_res_path = config['q_res_path']
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    claims = get_all_claims()
    claim_d = claims_to_dict(claims)

    keys = list(ranked_list.keys())
    keys.sort()

    num_doc_per_query = 10
    url_prefix = "http://localhost:36559/document?identifier="
    rows = []
    for query_id in keys[:10]:
        entries: List[SimpleRankedListEntry] = ranked_list[query_id]
        entries = entries[:num_doc_per_query * 3]
        doc_ids: List[str] = remove_duplicate(list([e.doc_id for e in entries]))
        claim = claim_d[int(query_id)]
        s = "{} : {}".format(query_id, claim)
        rows.append([Cell(s)])
        for doc_id in doc_ids[:num_doc_per_query]:
            url = url_prefix + doc_id
            s = "<a href=\"{}\">{}</a>".format(url, doc_id)
            rows.append([Cell(s)])

    html = HtmlVisualizer("claim_docs_urls.html")
    html.write_table(rows)

def main(config):
    # select claims
    # load relevant documents
    # remove duplicates
    q_res_path = config['q_res_path']
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    query_text_d = json.load(open(config['query_text_d']))
    save_name = config['save_path']

    keys = list(ranked_list.keys())
    keys.sort()

    num_doc_per_query = 10
    url_prefix = "http://localhost:36559/document?identifier="
    rows = []
    for query_id in keys[:100]:
        entries: List[SimpleRankedListEntry] = ranked_list[query_id]
        entries = entries[:num_doc_per_query * 3]
        doc_ids: List[str] = list([e.doc_id for e in entries])
        query_text = query_text_d[query_id]
        s = "{} : {}".format(query_id, query_text)
        rows.append([Cell(s)])
        for doc_id in doc_ids[:num_doc_per_query]:
            url = url_prefix + doc_id
            s = "<a href=\"{}\">{}</a>".format(url, doc_id)
            rows.append([Cell(s)])

    html = HtmlVisualizer(save_name)
    html.write_table(rows)

def insert_ranked_list_from_path(file_path: FilePath, q_config_id: str):
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(file_path)
    for query_id in ranked_list:
        q_res_id: QueryResultID = QueryResultID("{}_{}".format(query_id, q_config_id))
        insert_ranked_list(q_res_id, ranked_list[query_id])

def load_all_ranked_list(ranked_list_save_root, disk_name):
    d = {}
    for idx in train_query_indices:
        file_name = "{}_{}.txt".format(disk_name, idx)
        file_path = os.path.join(ranked_list_save_root, file_name)
        d.update(load_galago_ranked_list(file_path))
    return d

def __init__(self, q_res_path, config, top_n):
    self.config = config
    self.top_n = top_n
    print("loading ranked list")
    self.ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    print("Ranked list loaded for {} queries".format(len(self.ranked_list)))
    print("Pre loading docs")
    preload_docs(self.ranked_list, top_n)

def __init__(self, q_res_path, top_n, window_size):
    self.robust_tokens: Dict[str, List[str]] = load_robust_tokens_for_predict()
    self.ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    self.top_n = top_n
    self.window_size = window_size

def a_relevant(save_name, q_res_path, claims):
    top_n = 10
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)

    # Build a per-claim language model and a background LM over all claims;
    # passages are scored by the average log-odds of their (stemmed) terms.
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    stopwords = load_stopwords_for_query()
    alpha = 0.5
    tokenizer = PCTokenizer()
    all_passages = []
    entries = []
    num_pos_sum = 0
    num_pos_exists = 0
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)
        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)
        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)
        num_pos = len(lfilter(lambda x: x[1] > 0, passages))
        num_pos_sum += num_pos
        if num_pos > 0:
            num_pos_exists += 1

        all_passages.extend(passages)
        entries.append((c, passages))

    print("{} claims. {} docs on {} claims".format(len(claims), num_pos_sum, num_pos_exists))
    data = entries, all_passages
    save_to_pickle(data, save_name)

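# A minimal, self-contained sketch of the claim-LM passage scoring used in a_relevant above.
# These helpers are simplified stand-ins for the project's smooth/get_lm_log/subtract and
# exist only to illustrate the math; they are not the project's API, and stemming is omitted.
import math
from collections import Counter
from typing import Iterable, List


def log_odds(topic_lm: Counter, bg_lm: Counter, alpha: float) -> Counter:
    # log P(t | smoothed topic LM) - log P(t | background LM), per term.
    return Counter({t: math.log(alpha * topic_lm[t] + (1 - alpha) * p_bg) - math.log(p_bg)
                    for t, p_bg in bg_lm.items() if p_bg > 0})


def passage_log_odds_score(tokens: List[str], log_odd: Counter, stopwords: Iterable[str]) -> float:
    # Average per-token log-odds; stopwords contribute 0, and an empty passage scores 0.
    scores = [0.0 if t in stopwords else log_odd[t] for t in tokens]
    return sum(scores) / len(scores) if scores else 0.0
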
def a_relevant():
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    claims = claims[:10]
    top_n = 100
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    stopwords = load_stopwords_for_query()
    alpha = 0.7
    tokenizer = PCTokenizer()
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        # Load the top_n documents; missing documents are kept as None so ranks stay aligned.
        docs = []
        for i in range(top_n):
            try:
                doc = load_doc(q_res[i].doc_id)
                docs.append(doc)
            except KeyError:
                docs.append(None)

        print(c['text'])
        rows = []
        for rank, doc in enumerate(docs):
            if doc is None:
                rows.append((rank, "-", "-"))
                continue
            scores = get_doc_score(doc, get_passage_score)
            avg_score = average(scores)
            max_score = max(scores)
            rows.append((rank, avg_score, max_score))
        print_table(rows)

def __init__(self, doc_ids, judgement_path, query):
    print("DataSample init")
    # load doc_list
    # self.doc_ids = load_doc_list(doc_id_path)
    # load query-judgement ranked list
    self.q_group = load_galago_ranked_list(judgement_path)
    self.doc_ids = list(doc_ids)
    self.query = query
    self.n_sample_ranked = 5
    self.n_sample_not_ranked = 3

def __init__(self, q_res_path, query_d: Dict[int, str], out_dir):
    self.ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    query_ids = list(self.ranked_list.keys())
    query_ids.sort()
    self.job_id_to_q_id = {job_id: q_id for job_id, q_id in enumerate(query_ids)}
    self.query_d: Dict[int, str] = query_d
    self.tokenizer = get_tokenizer()
    self.max_seq_length = 512
    self.out_dir = out_dir
    self.info_out_dir = out_dir + "_info"
    exist_or_mkdir(self.info_out_dir)

def main():
    file_path = sys.argv[1]
    top_n = int(sys.argv[2])
    save_path = sys.argv[3]
    ranked_list_d: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(file_path)

    def get_head(l: List):
        return l[:top_n]

    new_ranked_list = dict_value_map(get_head, ranked_list_d)
    write_ranked_list_from_s(new_ranked_list, save_path)

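# Invocation sketch for the main() above (the script and file names are placeholders,
# not from the source):
#   python truncate_ranked_list.py q_res_full.txt 100 q_res_top100.txt
# This reads a Galago ranked list, keeps the top 100 entries per query, and writes the
# truncated list with write_ranked_list_from_s.
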
def a_relevant():
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    stopwords = load_stopwords_for_query()
    alpha = 0.3
    tokenizer = PCTokenizer()
    all_passages = []
    entries = []
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)
        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)
        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])
        base = average(scores)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)
        all_passages.extend(passages)
        a_rel_passages = lfilter(lambda x: x[1] > 0, passages)
        entries.append((c, a_rel_passages))

    data = entries, all_passages
    save_to_pickle(data, "pc_train_a_passages")

def load_ranked_list(relevance_list_path):
    all_ranked_list = {}
    for file_path in get_dir_files(relevance_list_path):
        file_name = os.path.basename(file_path)
        ranked_list_d = load_galago_ranked_list(file_path)
        queries = ranked_list_d.keys()
        any_query = list(queries)[0]
        ranked_list = ranked_list_d[any_query]
        all_ranked_list[file_name] = ranked_list
    return all_ranked_list

def show_missing():
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    claims = claims[:10]
    top_n = 100
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    report_missing(claims, ranked_list, top_n)

def main(): split = "train" subjectivity_path = sys.argv[1] q_res_path = sys.argv[2] ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path) # load LM claim_lms: List[ClaimLM] = build_gold_lms_for_sub_split(split) bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms)) log_bg_lm = get_lm_log(bg_lm) alpha = 0.1 stopwords = load_stopwords_for_query() # load subjectivity predictions. subj_d: Dict[str, Tuple[int, int]] = load_subjectivity(subjectivity_path) doc_ids = subj_d.keys() preload_man.preload(TokenizedCluewebDoc, doc_ids) tokenizer = PCTokenizer() lm_scores = [] rates = [] num_subj_list = [] num_sent_list = [] for claim_lm in claim_lms: qid = str(claim_lm.cid) log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha)) log_odd: Counter = subtract(log_topic_lm, log_bg_lm) def get_passage_score(p): def get_score(t): if t in stopwords: return 0 return log_odd[tokenizer.stemmer.stem(t)] return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0 for entry in ranked_list[qid]: if entry.doc_id in subj_d: tokens = load_doc(entry.doc_id) assert type(tokens[0]) == str lm_score = get_passage_score(tokens) num_subj, num_sent = subj_d[entry.doc_id] rate = num_subj / num_sent lm_scores.append(lm_score) rates.append(rate) num_subj_list.append(num_subj) num_sent_list.append(num_sent) print("lm scores correlation with ") print("rates: ", pearsonr(lm_scores, rates)) print("num subj: ", pearsonr(lm_scores, num_subj_list)) print("num sent: ", pearsonr(lm_scores, num_sent_list))
def main(config):
    def get_worker(out_dir):
        writer = Writer(max_seq_length=config['max_seq_length'], reverse=config['reverse'])
        return KDPParaWorker(config, writer, out_dir)

    q_res_path = config['q_res_path']
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    num_job = len(ranked_list) - 1
    runner = JobRunner(job_man_dir, num_job, config['job_name'], get_worker)
    runner.auto_runner()

def verify_ranked_list(out_path, queries):
    n_query = len(queries)
    file_name = os.path.basename(out_path)
    ranked_list_d = load_galago_ranked_list(out_path)
    if len(ranked_list_d) < n_query:
        print("{} has only {} queries, expected {}".format(
            file_name, len(ranked_list_d), n_query))

    found_query_ids = set(ranked_list_d.keys())
    queries_d = dict(lmap(lambda x: (x["number"], x["text"]), queries))
    expected_query_ids = lmap(lambda x: x["number"], queries)
    not_found_query_ids = list(
        [t for t in expected_query_ids if t not in found_query_ids])
    for query_id in not_found_query_ids:
        print("Not found: ", queries_d[query_id])

def main(): train_queries, test_queries = get_query_split() out_dir = pjoin(output_path, "eHealth") exist_or_mkdir(out_dir) ranked_list_path = FilePath( "/mnt/nfs/work3/youngwookim/data/CLEF_eHealth_working/ranked_list_filtered" ) ranked_list: RankedListDict = load_galago_ranked_list(ranked_list_path) qrels = load_clef_qrels() new_d = {} for query in test_queries: new_d[query.qid] = ranked_list[query.qid] save_path = os.path.join(out_dir, 'test_baseline.list') write_ranked_list_from_d(new_d, save_path)
def qk_candidate_gen(q_res_path: str, queries: List[QCKQuery], top_n, config) -> List[Tuple[QCKQuery, List[KDP]]]:
    print("loading ranked list")
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    print("Pre loading docs")
    preload_docs(ranked_list, top_n)
    entries: List[Tuple[QCKQuery, List[KnowledgeDocumentPart]]] = []
    all_doc_parts = 0
    ticker = TimeEstimator(len(queries))
    for q in queries:
        q_res: List[SimpleRankedListEntry] = ranked_list[q.query_id]
        doc_part_list = enum_doc_parts_from_ranked_list(config, q_res, top_n)
        all_doc_parts += len(doc_part_list)
        entries.append((q, doc_part_list))
        ticker.tick()
    return entries

def __init__(self, config, writer, out_dir):
    q_res_path = config['q_res_path']
    self.top_n = config['top_n']
    self.num_sent = config['num_sent']
    self.max_seq_length = config['max_seq_length']
    self.ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    self.cids = lmap(int, self.ranked_list.keys())
    self.pid_dict = first_pid_as_rep()
    self.out_dir = out_dir
    self.writer = writer

def work(q_res_path, save_name):
    ranked_list_d = load_galago_ranked_list(q_res_path)
    window_size = 10
    stemmer = CacheStemmer()
    print(q_res_path)
    ticker = TimeEstimator(len(ranked_list_d))
    r = []
    for claim_id, ranked_list in ranked_list_d.items():
        ticker.tick()
        doc_ids = list([e.doc_id for e in ranked_list])
        print("1")
        counter = build_co_occurrence(get_tokens_form_doc_ids(doc_ids), window_size, stemmer)
        print("2")
        r.append((claim_id, counter))

    save_to_pickle(r, save_name)

def load_multiple_ranked_list(dir_path, get_key_from_name):
    files = get_dir_files(dir_path)
    data = []
    for file_path in files:
        name = os.path.basename(file_path)
        ranked_list_d = load_galago_ranked_list(file_path)
        for query, ranked_list in ranked_list_d.items():
            data.append((name, ranked_list))

    new_d = {}
    key_fn = lambda x: get_key_from_name(x[0])
    for key, sub_data in group_by(data, key_fn).items():
        ranked_list = right(sub_data)
        new_d[key] = merge_ranked_list_list(ranked_list)
    return new_d

def main(): train_queries, test_queries = get_query_split() out_dir = pjoin(output_path, "eHealth") exist_or_mkdir(out_dir) train_save_path = pjoin(out_dir, "tfrecord_train") test_save_path = pjoin(out_dir, "tfrecord_test") ranked_list_path = FilePath( os.path.join(output_path, "eHealth", "bm25_filtered.list")) ranked_list: RankedListDict = load_galago_ranked_list(ranked_list_path) qrels = load_clef_qrels() train_info = write_tfrecord(ranked_list, train_queries, qrels, train_save_path) save_to_pickle(train_info, "eHealth_train_info") test_info = write_tfrecord(ranked_list, test_queries, qrels, test_save_path) save_to_pickle(test_info, "eHealth_test_info")
def do_datagen(d_ids, q_res_path, save_name):
    claims: List[Dict] = get_claims_from_ids(d_ids)
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    claim_lms = build_gold_lms(claims)
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    alpha = 0.1
    max_seq_length = 512
    generator = get_generator(max_seq_length, bg_lm, alpha)
    out_dir = os.path.join(env_data_dir, save_name)
    exist_or_mkdir(out_dir)
    for claim_lm in claim_lms:
        print(claim_lm.cid)
        records: List[Record] = generator(claim_lm, ranked_list[str(claim_lm.cid)])
        output_path = os.path.join(out_dir, str(claim_lm.cid))
        write_records(records, max_seq_length, output_path)

def main():
    queries = load_queries()
    bm25_path = pjoin(cord_working_dir, "youngwoo_bm25_query")
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(bm25_path)
    out_path = os.path.join(cord_working_dir, "tfrecord_2_4")
    max_seq_length = 512
    meta_data: List[Dict] = read_csv_as_dict(meta_data_path)
    text_dict = {}
    for e in meta_data:
        text_dict[e[str_cord_uid]] = e[str_title] + " " + e[str_abstract]

    def get_text_from_doc_id(doc_id: str) -> str:
        return text_dict[doc_id]

    data_info_save_name = "data_info_save"
    tf_record_gen(ranked_list, queries, get_text_from_doc_id, out_path,
                  max_seq_length, data_info_save_name)

def work():
    q_config_id = Q_CONFIG_ID_BM25_UKP
    ranked_list_save_root = get_ranked_list_save_dir(q_config_id)
    doc_ids = set()
    ticker = TimeEstimator(num_query_file)
    for i in range(num_query_file):
        file_name = FileName("{}_{}.txt".format(index_name_list[0], str(i)))
        ranked_list_path = pjoin(ranked_list_save_root, file_name)
        rl: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(ranked_list_path)
        for key, value in rl.items():
            for entry in value[:100]:
                doc_ids.add(entry.doc_id)
        ticker.tick()

    f = open("{}_uniq_100".format(q_config_id), "w")
    for doc_id in doc_ids:
        f.write("{}\n".format(doc_id))
    f.close()

def write_csv(config):
    # select claims
    # load relevant documents
    # remove duplicates
    q_res_path = config['q_res_path']
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    claims = get_all_claims()
    claim_d = claims_to_dict(claims)

    keys = list(ranked_list.keys())
    keys.sort()

    num_doc_per_query = 10
    url_prefix = "http://gosford.cs.umass.edu:36559/document?identifier="
    rows = []
    header = ["claim"] + ["url{}".format(i) for i in range(1, num_doc_per_query + 1)]
    rows.append(header)
    for query_id in keys[:10]:
        entries: List[SimpleRankedListEntry] = ranked_list[query_id]
        entries = entries[:num_doc_per_query * 3]
        doc_ids: List[str] = remove_duplicate(list([e.doc_id for e in entries]))
        claim = claim_d[int(query_id)]
        urls = []
        for doc_id in doc_ids[:num_doc_per_query]:
            url = url_prefix + doc_id
            urls.append(url)
        assert len(urls) == num_doc_per_query
        row = [claim] + urls
        rows.append(row)

    save_path = os.path.join(output_path, "claim10_train.csv")
    f = open(save_path, "w")
    csv_writer = csv.writer(f)
    csv_writer.writerows(rows)
    f.close()
