def convert_to_formatted_sent(zipped_s_id_list, evidence_set, contain_head=True, id_tokenized=True):
    sent_list = []
    for sent, sid in zipped_s_id_list:
        sent_item = dict()

        cur_sent = sent
        doc_id, ln = sid.split('(-.-)')[0], int(sid.split('(-.-)')[1])

        if contain_head:
            if not id_tokenized:
                doc_id_natural_format = fever_db.convert_brc(doc_id).replace('_', ' ')
                t_doc_id_natural_format = ' '.join(easy_tokenize(doc_id_natural_format))
            else:
                t_doc_id_natural_format = common.doc_id_to_tokenized_text(doc_id)

            # Prepend the page title unless this is the first line of the page
            # (which already starts with it) or the title already appears in the sentence.
            if ln != 0 and t_doc_id_natural_format.lower() not in sent.lower():
                cur_sent = f"{t_doc_id_natural_format} <t> " + sent

        # These fields are set whether or not the head is prepended.
        sent_item['text'] = cur_sent
        sent_item['sid'] = doc_id + c_scorer.SENT_LINE + str(ln)
        # sid is '[doc_id]<SENT_LINE>[line_number]'

        if evidence_set is not None:
            if (doc_id, ln) in evidence_set:
                sent_item['selection_label'] = "true"
            else:
                sent_item['selection_label'] = "false"
        else:
            sent_item['selection_label'] = "hidden"

        sent_list.append(sent_item)

    return sent_list
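
# Usage sketch (hypothetical data) of the shapes convert_to_formatted_sent
# expects and returns. Assumptions: sids are '[doc_id](-.-)[line_number]'
# strings as parsed above, evidence_set holds gold (doc_id, line_number)
# tuples, and common.doc_id_to_tokenized_text('Tokyo') returns 'Tokyo'.
def _example_convert_to_formatted_sent():
    zipped_s_id_list = [
        ("Tokyo is the capital of Japan .", "Tokyo(-.-)0"),
        ("The city hosted the 1964 Summer Olympics .", "Tokyo(-.-)5"),
    ]
    evidence_set = {("Tokyo", 5)}  # gold evidence: line 5 of the 'Tokyo' page
    return convert_to_formatted_sent(zipped_s_id_list, evidence_set,
                                     contain_head=True, id_tokenized=True)
    # Each returned item looks like:
    # {'text': 'Tokyo <t> The city hosted the 1964 Summer Olympics .',
    #  'sid': 'Tokyo' + c_scorer.SENT_LINE + '5',
    #  'selection_label': 'true'}
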
def evidence_list_to_text_list(cursor, evidences, contain_head=True, id_tokenized=False):
    # One text per evidence, so that len(evidences) == len(text_list).
    current_evidence_text_list = []
    evidences = sorted(evidences, key=lambda x: (x[0], x[1]))

    cur_head = 'DO NOT INCLUDE THIS FLAG'

    for doc_id, line_num in evidences:
        _, e_text, _ = fever_db.get_evidence(cursor, doc_id, line_num)

        cur_text = ""

        if contain_head and cur_head != doc_id:
            cur_head = doc_id

            if not id_tokenized:
                doc_id_natural_format = fever_db.convert_brc(doc_id).replace('_', ' ')
                t_doc_id_natural_format = ' '.join(easy_tokenize(doc_id_natural_format))
            else:
                t_doc_id_natural_format = common.doc_id_to_tokenized_text(doc_id)

            if line_num != 0:
                cur_text = f"{t_doc_id_natural_format} <t> "

        # Important change (July 16): append the text after (optionally)
        # prefixing the head, rather than appending e_text directly.
        cur_text = cur_text + e_text
        current_evidence_text_list.append(cur_text)

    assert len(evidences) == len(current_evidence_text_list)
    return current_evidence_text_list
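
# Sketch (hypothetical page and lines) of the head-prefixing behaviour above:
# cur_head tracks the last page whose title was emitted, so when consecutive
# evidence lines come from the same page only the first one gets the
# '<title> <t> ' prefix, and line 0 never does (it already opens with the title).
def _example_evidence_text_list():
    # evidences = [('Tokyo', 5), ('Tokyo', 7)] would yield something like:
    return [
        "Tokyo <t> The city hosted the 1964 Summer Olympics .",  # first line from the page
        "It also hosted the 2020 Summer Olympics .",             # same page, no prefix
    ]
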
def select_sent_with_prob_for_eval_list(input_file, additional_file, prob_dict_file, tokenized=False, pipeline=False, is_demo=False):
    """
    Select sentences using the upstream sentence-retrieval results and attach
    the retrieval probability to each evidence text.

    :param input_file: This should be the file with 5 sentences selected.
    :return:
    """
    cursor = fever_db.get_cursor()

    if isinstance(additional_file, list):
        additional_d_list = additional_file
    else:
        additional_d_list = load_data(additional_file)

    additional_data_dict = dict()
    for add_item in additional_d_list:
        additional_data_dict[add_item['id']] = add_item

    d_list = input_file

    for item in tqdm(d_list):
        e_list = additional_data_dict[item['id']]['predicted_sentids']
        if not pipeline:
            assert additional_data_dict[item['id']]['label'] == item['label']
            assert additional_data_dict[item['id']]['verifiable'] == item['verifiable']
            assert additional_data_dict[item['id']]['id'] == item['id']

        pred_evidence_list = []
        for i, cur_e in enumerate(e_list):
            doc_id = cur_e.split(c_scorer.SENT_LINE)[0]
            ln = int(cur_e.split(c_scorer.SENT_LINE)[1])  # Important bug fix: July 21
            pred_evidence_list.append((doc_id, ln))

        pred_evidence = check_sentences.Evidences(pred_evidence_list)

        evidence_text_list = evidence_list_to_text_list(cursor, pred_evidence,
                                                        contain_head=True, id_tokenized=tokenized)

        evidences = sorted(pred_evidence, key=lambda x: (x[0], x[1]))
        item_id = int(item['id'])

        evidence_text_list_with_prob = []
        for text, (doc_id, ln) in zip(evidence_text_list, evidences):
            ssid = (item_id, doc_id, int(ln))
            if ssid not in prob_dict_file:
                print("Some sentence pairs don't have 'prob'.")
                prob = 0.5
            else:
                prob = prob_dict_file[ssid]['prob']
                assert item['claim'] == prob_dict_file[ssid]['claim']

            evidence_text_list_with_prob.append((text, prob))

        if not tokenized:
            item['claim'] = ' '.join(easy_tokenize(item['claim']))

        item['evid'] = evidence_text_list_with_prob
        item['predicted_evidence'] = convert_evidence2scoring_format(e_list)
        item['predicted_sentids'] = e_list
        # This change needs to be saved.
        # item['predicted_label'] = additional_data_dict[item['id']]['label']

    cursor.close()
    return d_list
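
# The sentence-score lookup consumed above is a plain dict keyed by
# ssid = (item_id, doc_id, line_num), whose values carry the upstream retrieval
# probability plus the claim string used in the sanity-check assert.
# A hypothetical entry (ids and texts invented for illustration):
_example_prob_dict = {
    (137334, "Tokyo", 5): {
        "prob": 0.93,
        "claim": "Tokyo hosted the 1964 Summer Olympics .",
    },
}
# Pairs missing from the dict fall back to a neutral prob of 0.5, as in the loop above.
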
def adv_simi_sample_with_prob_v1_1(input_file, additional_file, prob_dict_file, tokenized=False):
    cursor = fever_db.get_cursor()
    d_list = load_data(input_file)

    if isinstance(additional_file, list):
        additional_d_list = additional_file
    else:
        additional_d_list = load_data(additional_file)

    additional_data_dict = dict()
    for add_item in additional_d_list:
        additional_data_dict[add_item['id']] = add_item

    sampled_data_list = []
    count = 0

    for item in tqdm(d_list):
        # e_list = check_sentences.check_and_clean_evidence(item)
        sampled_e_list, flags = sample_additional_data_for_item_v1_1(item, additional_data_dict)

        for i, (sampled_evidence, flag) in enumerate(zip(sampled_e_list, flags)):
            # Do not deep-copy the item here; this might change in the future
            # for error analysis.
            new_item = dict()

            evidence_text_list = evidence_list_to_text_list(cursor, sampled_evidence,
                                                            contain_head=True, id_tokenized=tokenized)

            evidences = sorted(sampled_evidence, key=lambda x: (x[0], x[1]))
            item_id = int(item['id'])

            evidence_text_list_with_prob = []
            for text, (doc_id, ln) in zip(evidence_text_list, evidences):
                ssid = (int(item_id), doc_id, int(ln))
                if ssid not in prob_dict_file:
                    count += 1
                    print("Some sentence pairs don't have 'prob'.")
                    prob = 0.5
                else:
                    prob = prob_dict_file[ssid]['prob']
                    assert item['claim'] == prob_dict_file[ssid]['claim']

                evidence_text_list_with_prob.append((text, prob))

            new_item['id'] = str(item['id']) + '#' + str(i)

            if tokenized:
                new_item['claim'] = item['claim']
            else:
                new_item['claim'] = ' '.join(easy_tokenize(item['claim']))

            new_item['evid'] = evidence_text_list_with_prob
            new_item['verifiable'] = item['verifiable']
            new_item['label'] = item['label']

            sampled_data_list.append(new_item)

    cursor.close()
    print(count)  # number of sentence pairs that were missing 'prob'
    return sampled_data_list
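
# Shape of one sampled training item produced above (all values hypothetical).
# 'id' is the original claim id with a '#<sample index>' suffix, and 'evid'
# pairs each (optionally title-prefixed) evidence text with its upstream
# retrieval probability.
_example_sampled_item = {
    "id": "137334#0",
    "claim": "Tokyo hosted the 1964 Summer Olympics .",
    "evid": [("Tokyo <t> The city hosted the 1964 Summer Olympics .", 0.93)],
    "verifiable": "VERIFIABLE",
    "label": "SUPPORTS",
}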