def evidence_list_to_text(cursor, evidences, contain_head=True, id_tokenized=False):
    """Flatten a list of (doc_id, line_num) evidence pairs into one text string,
    prefixing each new document (except line 0) with its tokenized title and '<t>'."""
    current_evidence_text = []
    evidences = sorted(evidences, key=lambda x: (x[0], x[1]))

    cur_head = 'DO NOT INCLUDE THIS FLAG'

    for doc_id, line_num in evidences:
        _, e_text, _ = fever_db.get_evidence(cursor, doc_id, line_num)

        if contain_head and cur_head != doc_id:
            cur_head = doc_id

            if not id_tokenized:
                doc_id_natural_format = fever_db.convert_brc(doc_id).replace('_', ' ')
                t_doc_id_natural_format = ' '.join(easy_tokenize(doc_id_natural_format))
            else:
                t_doc_id_natural_format = common.doc_id_to_tokenized_text(doc_id)

            if line_num != 0:
                current_evidence_text.append(f"{t_doc_id_natural_format} <t>")

        # Important change (July 16): this append was moved one line down, out of the head block.
        current_evidence_text.append(e_text)

    return ' '.join(current_evidence_text)

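# Minimal usage sketch for evidence_list_to_text (not part of the pipeline).
# The (doc_id, line_num) pairs are hypothetical and must exist in the FEVER
# sentence database; fever_db.get_cursor() is assumed here as the way to open
# that database (adjust to however your checkout opens it).
def _demo_evidence_list_to_text():
    cursor = fever_db.get_cursor()  # assumed helper
    evidences = [('Barack_Obama', 3), ('Barack_Obama', 0), ('Hawaii', 2)]
    # Evidence is re-sorted by (doc_id, line_num); each new document (except
    # line 0) is prefixed with its tokenized title and the '<t>' marker.
    text = evidence_list_to_text(cursor, evidences, contain_head=True,
                                 id_tokenized=False)
    print(text)
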
def convert_to_normalized_format(cursor, e_list, contain_head=True):
    """Convert a list of evidence groups into one flattened text string per group,
    prefixing each new document (except line 0) with its tokenized title and ' .'."""
    r_list = []
    for evidences in e_list:
        current_evidence = []
        cur_head = 'DO NOT INCLUDE THIS FLAG'

        # This sorting of all evidences is important: it groups lines by document.
        evidences = sorted(evidences, key=lambda x: (x[0], x[1]))

        for doc_id, line_num in evidences:
            _, e_text, _ = fever_db.get_evidence(cursor, doc_id, line_num)

            if contain_head and cur_head != doc_id:
                cur_head = doc_id
                doc_id_natural_format = fever_db.convert_brc(doc_id).replace('_', ' ')
                t_doc_id_natural_format = ' '.join(easy_tokenize(doc_id_natural_format))

                if line_num != 0:
                    current_evidence.append(f"{t_doc_id_natural_format} .")

            current_evidence.append(e_text)

        r_list.append(' '.join(current_evidence))

    return r_list

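# Hedged sketch: unlike evidence_list_to_text, convert_to_normalized_format works
# on a *list of evidence groups* (one group per claim) and returns one flattened
# string per group, using ' .' after the title instead of '<t>'. Doc ids below
# are placeholders; fever_db.get_cursor() is assumed as above.
def _demo_convert_to_normalized_format():
    cursor = fever_db.get_cursor()  # assumed helper
    e_list = [
        [('Barack_Obama', 0), ('Barack_Obama', 3)],   # evidence for claim 1
        [('Hawaii', 2)],                               # evidence for claim 2
    ]
    for text in convert_to_normalized_format(cursor, e_list, contain_head=True):
        print(text)
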
def sample_for_verifiable(cursor, e_list, contain_head=True):
    """Same flattening as convert_to_normalized_format, but the evidence order
    within each group is kept as given (no sorting)."""
    r_list = []
    for evidences in e_list:
        current_evidence = []
        cur_head = 'DO NOT INCLUDE THIS FLAG'

        for doc_id, line_num in evidences:
            _, e_text, _ = fever_db.get_evidence(cursor, doc_id, line_num)

            if contain_head and cur_head != doc_id:
                cur_head = doc_id
                doc_id_natural_format = fever_db.convert_brc(doc_id).replace('_', ' ')
                t_doc_id_natural_format = ' '.join(easy_tokenize(doc_id_natural_format))

                if line_num != 0:
                    current_evidence.append(f"{t_doc_id_natural_format} .")

            current_evidence.append(e_text)

        r_list.append(' '.join(current_evidence))

    return r_list

def pageview_spiral_aside_rule(self):
    """Re-rank each 'aside' docid group by Wikipedia pageview count and replace the
    priorities with decayed scores 1.0, 0.8, 0.6, ... (floored at 0)."""
    if not hasattr(self, 'wiki_pv'):
        print("Reload wiki pageview dict")
        self.wiki_pv = WikiPageviews()

    item = self.item
    changed = False

    for key, group_prio_docids in item['structured_docids_aside'].items():
        group_docids = [it[0] for it in group_prio_docids]
        if len(group_docids) > 1:
            changed = True
            all_scores = map(lambda x: self.wiki_pv[fever_db.convert_brc(x)], group_docids)
            all_scores = np.array(list(all_scores))
            prios = np.argsort(all_scores)[::-1]  # most-viewed first

            new_gpd = []
            for i, p in enumerate(prios):
                new_gpd.append((group_prio_docids[p][0], max(1.0 - i * 0.2, 0)))
            item['structured_docids_aside'][key] = new_gpd

    if changed:
        finded_keys = item['structured_docids_aside'].values()
        finded_keys = set([i for ii in finded_keys for i in ii]) \
            if len(finded_keys) > 0 else set(finded_keys)
        item['prioritized_docids_aside'] = list(finded_keys)

    return self

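# Stand-alone sketch of the "spiral" re-scoring used above: documents in a group
# are ranked by pageview count and re-assigned the decayed priorities
# 1.0, 0.8, 0.6, ... (floored at 0). The counts below are made up.
def _demo_spiral_scores():
    pageview_counts = np.array([120, 45000, 3, 900])
    order = np.argsort(pageview_counts)[::-1]  # indices, most-viewed first
    scores = {int(idx): max(1.0 - rank * 0.2, 0)
              for rank, idx in enumerate(order)}
    print(scores)  # {1: 1.0, 3: 0.8, 0: 0.6, 2: 0.4}
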
def tokenize_doc_id(doc_id, tokenizer):
    """Tokenize a wiki doc id into its natural-language title form, returning the
    token list and lemmas. The tokenizer is expected to be a CoreNLP-style tokenizer
    (e.g. built with annotators=['pos', 'lemma', 'ner'])."""
    doc_id_natural_format = fever_db.convert_brc(doc_id).replace('_', ' ')
    tokenized_doc_id = e_tokenize(doc_id_natural_format, tokenizer)
    t_doc_id_natural_format = tokenized_doc_id.words()
    lemmas = tokenized_doc_id.lemmas()
    return t_doc_id_natural_format, lemmas

def parse_doc_id(doc_id, tokenizer=None):
    doc_id = convert_brc(doc_id)
    doc_id = doc_id.replace('_', ' ')
    tokens = None
    lemmas = None
    if tokenizer is not None:
        tok_r = tokenizer.tokenize(doc_id)
        tokens = tok_r.words()
        lemmas = tok_r.lemmas()
    return tokens, lemmas

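# Usage sketch for parse_doc_id. Without a tokenizer it only normalizes the id
# and returns (None, None); with a CoreNLP-style tokenizer (anything exposing
# tokenize().words() and .lemmas(), as used elsewhere in this module) it also
# returns tokens and lemmas. The tokenizer construction below is an assumption
# and is left commented out.
def _demo_parse_doc_id():
    tokens, lemmas = parse_doc_id('Barack_Obama_-LRB-disambiguation-RRB-')
    print(tokens, lemmas)  # None, None: no tokenizer supplied
    # tok = CoreNLPTokenizer(annotators=['pos', 'lemma', 'ner'])  # assumed setup
    # tokens, lemmas = parse_doc_id('Barack_Obama', tokenizer=tok)
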
def pageview_analysis():
    """Interactive analysis entry point: compare pageview counts of ground-truth
    evidence documents against retrieved documents, then drop into an IPython shell."""
    from chaonan_src._doc_retrieval.item_rules import ItemRuleBuilder
    from chaonan_src._utils.doc_utils import read_jsonl
    from utils.fever_db import convert_brc

    wiki_pv = WikiPageviews()
    d_list = read_jsonl("../../../results/doc_retri/docretri.titlematch/dev.jsonl")

    gt_evidences, pre_evidences = [], []
    for item in d_list:
        gt_evidences.extend(ItemRuleBuilder.get_all_docid_in_evidence(item['evidence']))
        pre_evidences.extend([it[0] for it in item['prioritized_docids']])
    gt_evidences = set(gt_evidences)
    pre_evidences = set(pre_evidences)

    gt_count = [wiki_pv[convert_brc(it)] for it in gt_evidences]
    pre_count = [wiki_pv[convert_brc(it)] for it in pre_evidences]

    # Inspect gt_count / pre_count interactively, then hard-exit.
    from IPython import embed
    embed()
    import os
    os._exit(1)

def did_to_keys(doc_id, tokenizer=None):
    doc_id = convert_brc(doc_id)
    doc_id = doc_id.replace('_', ' ')
    id_keys = []
    # id_keys.append(doc_id)
    lemmas = None
    entities = None
    if tokenizer is not None:
        tok_r = tokenizer.tokenize(doc_id)
        to_key = ' '.join(tok_r.words())
        id_keys.append(to_key)
        lemmas = tok_r.lemmas()
        entities = tok_r.entity_groups()
    return list(set(id_keys)), lemmas, entities

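# Sketch for did_to_keys: it converts a wiki doc id into de-duplicated lookup
# keys (the whitespace-joined token string), plus lemmas and entity groups when
# a tokenizer is supplied; without one it returns ([], None, None). The doc id
# and the commented tokenizer line are illustrative assumptions.
def _demo_did_to_keys():
    print(did_to_keys('New_York_City'))  # ([], None, None)
    # tok = CoreNLPTokenizer(annotators=['pos', 'lemma', 'ner'])  # assumed setup
    # keys, lemmas, entities = did_to_keys('New_York_City', tokenizer=tok)
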
def convert_to_formatted_sent(zipped_s_id_list, evidence_set, contain_head=True, id_tokenized=True):
    """Turn (sentence, sid) pairs into labeled sentence items. sid has the form
    '<doc_id>(-.-)<line_number>'; the selection label is 'true'/'false' against
    evidence_set, or 'hidden' when evidence_set is None."""
    sent_list = []
    for sent, sid in zipped_s_id_list:
        sent_item = dict()

        cur_sent = sent
        doc_id, ln = sid.split('(-.-)')[0], int(sid.split('(-.-)')[1])

        if contain_head:
            if not id_tokenized:
                doc_id_natural_format = fever_db.convert_brc(doc_id).replace('_', ' ')
                t_doc_id_natural_format = ' '.join(easy_tokenize(doc_id_natural_format))
            else:
                t_doc_id_natural_format = common.doc_id_to_tokenized_text(doc_id)

            # Prepend the title unless this is line 0 or the title already appears in the sentence.
            if ln != 0 and t_doc_id_natural_format.lower() not in sent.lower():
                cur_sent = f"{t_doc_id_natural_format} <t> " + sent

            sent_item['text'] = cur_sent
            sent_item['sid'] = doc_id + c_scorer.SENT_LINE + str(ln)
            # sid is '[doc_id]<SENT_LINE>[line_number]'
            if evidence_set is not None:
                if (doc_id, ln) in evidence_set:
                    sent_item['selection_label'] = "true"
                else:
                    sent_item['selection_label'] = "false"
            else:
                sent_item['selection_label'] = "hidden"

            sent_list.append(sent_item)
        else:
            # Without the head rule the item is appended empty, matching the original behaviour.
            sent_list.append(sent_item)

    return sent_list

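# Hedged usage sketch for convert_to_formatted_sent. The sentences, doc ids, and
# evidence pairs below are made up; with id_tokenized=True the doc ids must be
# resolvable by common.doc_id_to_tokenized_text in your setup.
def _demo_convert_to_formatted_sent():
    zipped = [
        ('He was born in Honolulu .', 'Barack_Obama(-.-)3'),
        ('Honolulu is the capital of Hawaii .', 'Hawaii(-.-)2'),
    ]
    evidence_set = {('Barack_Obama', 3)}  # only the first sentence is gold evidence
    for si in convert_to_formatted_sent(zipped, evidence_set,
                                        contain_head=True, id_tokenized=True):
        print(si.get('sid'), si.get('selection_label'))
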
def pageview_rule(self):
    """Assign high priority to frequently viewed pages: within each docid group,
    the raw Wikipedia pageview count becomes the document's priority score."""
    if not hasattr(self, 'wiki_pv'):
        print("Reload wiki pageview dict")
        self.wiki_pv = WikiPageviews()

    item = self.item

    for key, group_prio_docids in item['structured_docids'].items():
        group_docids = [it[0] for it in group_prio_docids]
        all_scores = map(lambda x: self.wiki_pv[convert_brc(x)], group_docids)
        all_scores = np.array(list(all_scores))
        prios = np.argsort(all_scores)[::-1]  # most-viewed first

        new_gpd = []
        for i, p in enumerate(prios):
            new_gpd.append((group_prio_docids[p][0], int(all_scores[p])))
        item['structured_docids'][key] = new_gpd

    try:
        finded_keys = item['structured_docids'].values()
        finded_keys = set([i for ii in finded_keys for i in ii]) \
            if len(finded_keys) > 0 else set(finded_keys)
        item['prioritized_docids'] = list(finded_keys)
    except Exception as e:
        # Debugging hook: drop into an interactive shell if the flattening fails.
        from IPython import embed
        embed()
        import os
        os._exit(1)

    return self

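# Stand-alone sketch of the flattening step above: after re-scoring, every
# (doc_id, score) pair from all key groups is collected into one de-duplicated
# list for item['prioritized_docids']. The data below is made up.
def _demo_flatten_structured_docids():
    structured = {
        'barack obama': [('Barack_Obama', 45000)],
        'hawaii': [('Hawaii', 900), ('Hawaii_-LRB-island-RRB-', 120)],
    }
    flattened = set(pair for group in structured.values() for pair in group)
    print(list(flattened))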