def expand_from_preext_sent_rule(self):
    if not hasattr(self, 'cursor'):
        self.cursor = fever_db.get_cursor()

    if not hasattr(self, 'preext_sent_dict'):
        d_list = load_data(config.RESULT_PATH /
                           "sent_retri_nn/2018_07_17_16-34-19_r/dev_scale(0.1).jsonl")
        self.preext_sent_dict = {item['id']: item for item in d_list}

    item = self.item
    # if len(item['prioritized_docids']) < 5:
    new_pdocids = copy(item['prioritized_docids'])
    sent_ids = self.preext_sent_dict[item['id']]['predicted_sentids']

    for sent_id in sent_ids:
        docid, sent_ind = sent_id.split('<SENT_LINE>')
        sent_ind = int(sent_ind)
        id_list, sent_list, sent_links = \
            fever_db.get_evidence(self.cursor, docid, sent_ind)
        sent_links = json.loads(sent_links)
        all_links = np.array(sent_links)
        all_links = all_links.reshape(-1, 2)[:, 1]
        all_links = list(map(reverse_convert_brc, all_links))
        new_pdocids.extend([(id_link, 1.0) for id_link in all_links])

    item['prioritized_docids'] = new_pdocids
    return self
def expand_from_preext_sent_rule(self):
    if not hasattr(self, 'cursor'):
        self.cursor = fever_db.get_cursor()

    if not hasattr(self, 'preext_sent_dict'):
        d_list = read_jsonl(config.RESULT_PATH /
                            "sent_retri_nn/2018_07_17_16-34-19_r/train_scale(0.1).jsonl")
        self.preext_sent_dict = {item['id']: item for item in d_list}

    item = self.item
    # if len(item['prioritized_docids']) < 5:
    new_pdocids = []
    structured_docids_sent = {}
    sent_ids = self.preext_sent_dict[item['id']]['scored_sentids']

    for sent_id, score, probability in sent_ids:
        docid, sent_ind = sent_id.split('<SENT_LINE>')
        sent_ind = int(sent_ind)
        id_list, sent_list, sent_links = \
            fever_db.get_evidence(self.cursor, docid, sent_ind)
        sent_links = json.loads(sent_links)
        all_links = np.array(sent_links)
        all_links = all_links.reshape(-1, 2)[:, 1]
        all_links = list(map(fever_db.reverse_convert_brc, all_links))
        all_links = list(map(lambda x: x.replace(' ', '_'), all_links))
        prio_docids = [(id_link, score) for id_link in all_links]
        new_pdocids.extend(prio_docids)
        structured_docids_sent.update({sent_id: prio_docids})

    item['prioritized_docids_sent'] = new_pdocids
    item['structured_docids_sent'] = structured_docids_sent
    return self
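# A minimal sketch (synthetic values, not taken from the database) of the data shape
# produced above, assuming a hypothetical sentence id 'Some_Page<SENT_LINE>3' with
# score 2.5 whose text links to two other pages:
#
#   item['structured_docids_sent'] == {
#       'Some_Page<SENT_LINE>3': [('Linked_Page_A', 2.5), ('Linked_Page_B', 2.5)],
#   }
#   item['prioritized_docids_sent'] == [('Linked_Page_A', 2.5), ('Linked_Page_B', 2.5)]
#
# i.e. every page hyperlinked from a retrieved evidence sentence inherits that
# sentence's retrieval score.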
def utest_check_sentence_lines():
    sent_number_counter = Counter()
    number_list = []
    db_cursor = fever_db.get_cursor()
    # d_list = load_data("/Users/Eason/RA/FunEver/results/doc_retri/2018_07_04_21:56:49_r/dev.jsonl")
    d_list = load_data(
        "/Users/Eason/RA/FunEver/results/doc_retri/2018_07_04_21:56:49_r/train.jsonl")

    for item in tqdm(d_list):
        p_docids = item['predicted_docids']
        current_sent_list = []
        for doc_id in p_docids:
            # get_all_sent_by_doc_id returns (sentence_list, id_list); keep the sentences.
            r_list, _ = fever_db.get_all_sent_by_doc_id(db_cursor, doc_id)
            current_sent_list.extend(r_list)

        sent_number_counter.update([len(current_sent_list)])
        number_list.append(len(current_sent_list))
        # print(current_sent_list)

    print(len(number_list))
    print('Mean:', np.mean(number_list))
    print('Max:', np.max(number_list))
    print('Min:', np.min(number_list))
    print('Std:', np.std(number_list))
    print(sent_number_counter)
def initialize(self):
    print('Data reader initialization ...')
    self.cursor = fever_db.get_cursor()

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')
    }
    self.fever_data_reader = SSelectorReader(token_indexers=token_indexers,
                                             lazy=cfg.lazy)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT
                                               / 'vocab_cache'
                                               / 'nli_basic')

    # This is important: register the selection labels in the vocabulary.
    ns = 'selection_labels'
    vocab.add_token_to_namespace('true', namespace=ns)
    vocab.add_token_to_namespace('false', namespace=ns)
    vocab.add_token_to_namespace('hidden', namespace=ns)
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace=ns)

    # Label value
    vocab.get_index_to_token_vocabulary(ns)

    self.vocab = vocab
    self.weight_dict = weight_dict
    self.initialized = True
def adv_sample_v1_0(input_file, additional_file, tokenized=False):
    cursor = fever_db.get_cursor()
    d_list = load_data(input_file)

    if isinstance(additional_file, list):
        additional_d_list = additional_file
    else:
        additional_d_list = load_data(additional_file)

    additional_data_dict = dict()
    for add_item in additional_d_list:
        additional_data_dict[add_item['id']] = add_item

    sampled_data_list = []

    for item in tqdm(d_list):
        # e_list = check_sentences.check_and_clean_evidence(item)
        sampled_e_list, flags = sample_additional_data_for_item_v1_0(
            item, additional_data_dict)
        # print(flags)

        for i, (sampled_evidence, flag) in enumerate(zip(sampled_e_list, flags)):
            # Do not copy, might change in the future for error analysis
            # new_item = copy.deepcopy(item)
            new_item = dict()
            # print(new_item['claim'])
            # print(e_list)
            # print(sampled_evidence)
            # print(flag)
            evidence_text = evidence_list_to_text(cursor,
                                                  sampled_evidence,
                                                  contain_head=True,
                                                  id_tokenized=tokenized)

            new_item['id'] = str(item['id']) + '#' + str(i)
            if tokenized:
                new_item['claim'] = item['claim']
            else:
                new_item['claim'] = ' '.join(easy_tokenize(item['claim']))
            new_item['evid'] = evidence_text
            new_item['verifiable'] = item['verifiable']
            new_item['label'] = item['label']

            # print("C:", new_item['claim'])
            # print("E:", new_item['evid'])
            # print("L:", new_item['label'])
            # print()
            sampled_data_list.append(new_item)

    cursor.close()
    return sampled_data_list
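# A minimal usage sketch for adv_sample_v1_0 (file names below are hypothetical
# placeholders, not paths from the repo). Each returned item carries 'id', 'claim',
# 'evid', 'verifiable', and 'label', with ids suffixed '#0', '#1', ... for the
# evidence sets sampled per claim.
#
# sampled_list = adv_sample_v1_0("tokenized_train.jsonl",
#                                "upstream_sent_retrieval_train.jsonl",
#                                tokenized=True)
# print("Sampled claim/evidence pairs:", len(sampled_list))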
def disambiguous_from_preext_sent_rule(self):
    if not hasattr(self, 'cursor'):
        self.cursor = fever_db.get_cursor()

    if not hasattr(self, 'preext_sent_dict'):
        d_list = read_jsonl(config.RESULT_PATH /
                            "sent_retri_nn/2018_07_17_16-34-19_r/train_sent.jsonl")
        self.preext_sent_dict = {item['id']: item for item in d_list}

    item = self.item
    if len(item['prioritized_docids']) > 60:
        sent_ids = self.preext_sent_dict[item['id']]['']

    return self
def select_sent_for_eval(input_file, additional_file, tokenized=False):
    """
    This method selects sentences using the upstream sentence retrieval results.
    :param input_file: This should be the file with 5 sentences selected.
    :return:
    """
    cursor = fever_db.get_cursor()

    if isinstance(additional_file, list):
        additional_d_list = additional_file
    else:
        additional_d_list = load_data(additional_file)

    additional_data_dict = dict()
    for add_item in additional_d_list:
        additional_data_dict[add_item['id']] = add_item

    d_list = load_data(input_file)

    for item in tqdm(d_list):
        e_list = additional_data_dict[item['id']]['predicted_sentids']
        assert additional_data_dict[item['id']]['label'] == item['label']
        assert additional_data_dict[item['id']]['id'] == item['id']
        assert additional_data_dict[item['id']]['verifiable'] == item['verifiable']

        pred_evidence_list = []
        for i, cur_e in enumerate(e_list):
            doc_id = cur_e.split(c_scorer.SENT_LINE)[0]
            ln = int(cur_e.split(c_scorer.SENT_LINE)[1])  # Important changes Bugs: July 21
            pred_evidence_list.append((doc_id, ln))

        pred_evidence = check_sentences.Evidences(pred_evidence_list)
        evidence_text = evidence_list_to_text(cursor,
                                              pred_evidence,
                                              contain_head=True,
                                              id_tokenized=tokenized)

        if not tokenized:
            item['claim'] = ' '.join(easy_tokenize(item['claim']))

        item['evid'] = evidence_text
        item['predicted_evidence'] = convert_evidence2scoring_format(e_list)
        item['predicted_sentids'] = e_list
        # This change needs to be saved.
        # item['predicted_label'] = additional_data_dict[item['id']]['label']

    return d_list
def if_idf_select_sentence():
    db_cursor = fever_db.get_cursor()
    loaded_path = "/Users/Eason/RA/FunEver/results/doc_retri/2018_07_04_21:56:49_r/dev.jsonl"
    d_list = load_data(loaded_path)
    # d_list = load_data("/Users/Eason/RA/FunEver/results/doc_retri/2018_07_04_21:56:49_r/train.jsonl")

    for item in tqdm(d_list):
        p_docids = item['predicted_docids']
        cleaned_claim = ' '.join(easy_tokenize(item['claim']))
        # print(cleaned_claim)

        current_sent_list = []
        current_id_list = []
        for doc_id in p_docids:
            r_list, id_list = fever_db.get_all_sent_by_doc_id(db_cursor, doc_id)
            current_sent_list.extend(r_list)
            current_id_list.extend(id_list)

        Args = namedtuple('Args', 'ngram hash_size num_workers')
        args = Args(2, int(8192), 4)
        ranker = OnlineTfidfDocRanker(args, args.hash_size, args.ngram,
                                      current_sent_list)
        selected_index, selected_score = ranker.closest_docs(cleaned_claim, k=5)

        selected_sent_id = []
        for ind in selected_index:
            current_selected = current_id_list[ind]
            doc_id, ln = current_selected.split('(-.-)')
            # ln = int(ln)
            # selected_sent_id.append([doc_id, ln])
            selected_sent_id.append(doc_id + c_scorer.SENT_LINE + ln)

        item['predicted_sentids'] = selected_sent_id

    eval_mode = {'check_sent_id_correct': True, 'standard': False}
    print(c_scorer.fever_score(d_list, d_list, mode=eval_mode, verbose=False))

    out_fname = config.RESULT_PATH / "sent_retri" / f"{utils.get_current_time_str()}_r" / "dev.jsonl"
    save_intermidiate_results(d_list, out_filename=out_fname,
                              last_loaded_path=loaded_path)
def tf_idf_rank(args, top_k=5):
    dev_path = config.PRO_ROOT / \
        'results_old/doc_retri/docretri.basic.nopageview/dev.jsonl'
    cursor = get_cursor()
    d_list = read_jsonl(dev_path)
    d_list_test = d_list

    for i, item in enumerate(spcl(d_list_test)):
        all_sent = []
        all_ids = [it[0] for it in item['prioritized_docids']]
        try:
            for doc_id in all_ids:
                r_list, _ = get_all_sent_by_doc_id(cursor, doc_id,
                                                   with_h_links=False)
                all_sent.append(' '.join(r_list))
            ranker = OnlineTfidfDocRanker(args, args.hash_size, args.ngram,
                                          all_sent)
        except Exception as e:
            if i - 1 >= 0:
                print(f'Early quit at {i-1} because of {e}')
                save_path = config.RESULT_PATH / \
                    'doc_retri/docretri.tfidfrank/' \
                    f'dev_quit_dump_{uuid4()}.json'
                DocRetrievalExperiment.dump_results(d_list_test[:i], save_path)
            raise e

        rank_ind, rank_score = \
            ranker.closest_docs(' '.join(item['claim_tokens']), k=100)
        id_score_dict = {docid: 0 for docid in all_ids}
        id_score_dict.update({all_ids[ri]: rs
                              for ri, rs in zip(rank_ind, rank_score)})
        item['prioritized_docids'] = [(k, v) for k, v in id_score_dict.items()]
        item['predicted_docids'] = \
            list(set([k for k, v
                      in sorted(item['prioritized_docids'],
                                key=lambda x: (-x[1], x[0]))][:top_k]))

    save_path = config.RESULT_PATH / 'doc_retri/docretri.tfidfrank/dev.json'
    DocRetrievalExperiment.dump_results(d_list_test, save_path)
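# A hedged sketch of how tf_idf_rank might be invoked. It reuses the Args namedtuple
# pattern from if_idf_select_sentence above, since tf_idf_rank only reads
# args.hash_size and args.ngram; whether those particular values suit this ranking
# pass is an assumption, not something the original code states.
#
# Args = namedtuple('Args', 'ngram hash_size num_workers')
# tf_idf_rank(Args(2, int(8192), 4), top_k=5)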
def expand_from_doc_rule(self):
    """Current method: if 'prioritized_docids' is too short, expand every found ID
    by extracting the document, finding some highly-scored (currently tf-idf scored)
    sentences, finding the links in them, and appending the linked documents.

    Discussions on some variations
    ------------------------------
    1. Can use other types of sentence similarity score
    2. Can (kind of) combine sentence score into priority
    3. Matches appearing first can have a higher score propagated
    """
    if not hasattr(self, 'cursor'):
        self.cursor = fever_db.get_cursor()

    item = self.item
    if len(item['prioritized_docids']) < 2:
        # print(f"Query tf-idf... because length={len(item['prioritized_docids'])}")
        new_pdocids = copy(item['prioritized_docids'])
        for docid, priority in item['prioritized_docids']:
            # print(f"Query tf-idf for {docid}")
            sent_list, id_list, sent_links = \
                fever_db.get_all_sent_by_doc_id(self.cursor, docid,
                                                with_h_links=True)
            # indexes, scores = \
            #     self.sent_sim.preceding_sent_similarity(sent_list, item['claim'])
            indexes, scores = \
                self.sent_sim.tfidf_similarity(sent_list, item['claim'])
            high_tfidf_indexes = indexes[scores > 3.0]

            if len(high_tfidf_indexes) > 0:
                all_links = np.array(sent_links)[high_tfidf_indexes]
                all_links = [ii for i in all_links for ii in i]  # flatten links
                all_links = np.array(all_links)
                all_links = all_links.reshape(-1, 2)[:, 1]
                all_links = list(map(reverse_convert_brc, all_links))
                new_pdocids.extend([(id_link, 1.0 * priority)
                                    for id_link in all_links])

        item['prioritized_docids'] = new_pdocids
    return self
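# A small illustrative example (synthetic data, not from the database) of the link
# flattening used above. The per-sentence hyperlinks appear to be stored as
# [anchor_text, target_page] pairs, so reshape(-1, 2)[:, 1] keeps only the targets:
#
# links_for_two_sents = [[['tenth', 'Tenth_Doctor'], ['actor', 'Actor']],
#                        [['BBC', 'BBC']]]
# flat = [pair for sent in links_for_two_sents for pair in sent]
# targets = np.array(flat).reshape(-1, 2)[:, 1]
# # targets -> ['Tenth_Doctor', 'Actor', 'BBC']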
def fever_app(caller):
    logger = logging.getLogger()
    dictConfig({
        'version': 1,
        'formatters': {'default': {
            'format': '[%(asctime)s] %(levelname)s in %(module)s: %(message)s',
        }},
        'handlers': {'wsgi': {
            'class': 'logging.StreamHandler',
            'stream': 'ext://sys.stderr',
            'formatter': 'default'
        }},
        'root': {
            'level': 'INFO',
            'handlers': ['wsgi']
        },
        'allennlp': {
            'level': 'INFO',
            'handlers': ['wsgi']
        },
    })

    logger.info("Set up flask app")

    nn_doc_retri_threshold = 0.00001
    top_k = 100
    nn_doc_top_k = 10

    sent_prob_for_2doc = 0.1
    sent_topk_for_2doc = 5
    sentence_retri_1_scale_prob = 0.05
    sentence_retri_2_scale_prob = 0.9
    sent_retri_2_top_k = 1
    enhance_retri_1_scale_prob = -1

    def predict_pipeline(claims):
        # Step 1: Tokenization
        logger.info('Step 1')
        logger.info('Start: ' + str(datetime.datetime.now().time()))
        tokenized_list = []
        for idx, claim in enumerate(claims):
            claim_tok = ' '.join(tok.tokenize(text_clean.normalize(claim["claim"])).words())
            item_tokenized = {'id': idx, 'claim': claim_tok}
            tokenized_list.append(item_tokenized)
        logger.info('End: ' + str(datetime.datetime.now().time()))

        # Step 2: 1st Doc retrieval
        logger.info('Step 2')
        logger.info('Start: ' + str(datetime.datetime.now().time()))
        for item in tokenized_list:
            item_doc_retrieval = item
            item_rb.first_only_rules(item_doc_retrieval)
            item_doc_retrieval['predicted_docids'] = list(
                set([k for k, v in sorted(item_doc_retrieval['prioritized_docids'],
                                          key=lambda x: (-x[1], x[0]))][:top_k]))
        doc_retrieval_list = tokenized_list
        item_remove_old_rule(doc_retrieval_list)
        item_resorting(doc_retrieval_list)

        nn_doc_list = nn_doc_model.pipeline_function_list(doc_retrieval_list,
                                                          doc_retrieval_model,
                                                          vocab, cursor)
        enforce_disabuigation_into_retrieval_result_v2(nn_doc_list,
                                                       doc_retrieval_list,
                                                       prob_sh=nn_doc_retri_threshold)
        logger.info('End: ' + str(datetime.datetime.now().time()))

        # Step 3: 1st Sentence selection
        logger.info('Step 3')
        logger.info('Start: ' + str(datetime.datetime.now().time()))
        dev_sent_list_1_e0 = simple_nnmodel.pipeline_first_sent_selection_list(
            tokenized_list, doc_retrieval_list, sent_selector_model, vocab,
            top_k=nn_doc_top_k, cursor=cursor)
        dev_sent_list_1_e1 = simple_nnmodel.pipeline_first_sent_selection_list(
            tokenized_list, doc_retrieval_list, sent_selector_model_1, vocab,
            top_k=nn_doc_top_k, cursor=cursor)
        dev_sent_list_1_e2 = simple_nnmodel.pipeline_first_sent_selection_list(
            tokenized_list, doc_retrieval_list, sent_selector_model_2, vocab,
            top_k=nn_doc_top_k, cursor=cursor)
        dev_sent_list_1 = merge_sent_results([dev_sent_list_1_e0,
                                              dev_sent_list_1_e1,
                                              dev_sent_list_1_e2])

        filtered_dev_instance_1_for_doc2 = simi_sampler.threshold_sampler_insure_unique_list(
            tokenized_list, dev_sent_list_1, sent_prob_for_2doc,
            top_n=sent_topk_for_2doc)
        dev_sent_1_list = simi_sampler.threshold_sampler_insure_unique_list(
            doc_retrieval_list, dev_sent_list_1, sentence_retri_1_scale_prob,
            top_n=sent_topk_for_2doc)
        logger.info('End: ' + str(datetime.datetime.now().time()))

        # Step 4: 2nd Doc retrieval
        logger.info('Step 4')
        logger.info('Start: ' + str(datetime.datetime.now().time()))
        item_rb.preext_sent_dict = {item['id']: item
                                    for item in filtered_dev_instance_1_for_doc2}
        for item in dev_sent_1_list:
            item_rb.second_only_rules(item)
            pids = [it[0] for it in item['prioritized_docids']]
            item['prioritized_docids_aside'] = [it for it in item['prioritized_docids_aside']
                                                if it[0] not in pids]
            porg = set([k for k, v in sorted(item['prioritized_docids'],
                                             key=lambda x: (-x[1], x[0]))][:top_k])
            paside = set([k for k, v in sorted(item['prioritized_docids_aside'],
                                               key=lambda x: (-x[1], x[0]))][:top_k])
            item['predicted_docids'] = list(porg | paside)
            item['predicted_docids_origin'] = list(porg)
            item['predicted_docids_aside'] = list(paside)
        logger.info('End: ' + str(datetime.datetime.now().time()))

        # Step 5: 2nd Sentence selection
        logger.info('Step 5')
        logger.info('Start: ' + str(datetime.datetime.now().time()))
        dev_sent_list_2 = get_score_multihop_list(tokenized_list, dev_sent_1_list,
                                                  sent_selector_2_model, vocab, cursor)
        logger.info('End: ' + str(datetime.datetime.now().time()))

        # Step 6: NLI
        logger.info('Step 6')
        logger.info('Start: ' + str(datetime.datetime.now().time()))
        sentence_retri_nli_scale_prob = 0.1
        sent_select_results_list_1 = simi_sampler.threshold_sampler_insure_unique_list(
            tokenized_list, dev_sent_list_1, sentence_retri_nli_scale_prob, top_n=5)
        nli_results = mesim_wn_simi_v1_2.pipeline_nli_run_list(
            tokenized_list, sent_select_results_list_1,
            [dev_sent_list_1, dev_sent_list_2], nli_model, vocab,
            dev_fever_data_reader, cursor)
        delete_unused_evidence(nli_results)

        nli_results = simi_sampler.threshold_sampler_insure_unique_merge(
            nli_results, dev_sent_list_2, sentence_retri_2_scale_prob,
            top_n=5, add_n=sent_retri_2_top_k)
        delete_unused_evidence(nli_results)

        nli_results = simi_sampler.threshold_sampler_insure_unique_merge(
            nli_results, dev_sent_list_1, enhance_retri_1_scale_prob,
            top_n=100, add_n=100)
        delete_unused_evidence(nli_results)

        predictions = []
        for final_item in nli_results:
            sentences = []
            for evidence in final_item['predicted_evidence']:
                sentences.append([evidence[0], evidence[1]])
            prediction = final_item['predicted_label'].upper()
            predictions.append({"predicted_label": prediction,
                                "predicted_evidence": sentences})
        logger.info('End: ' + str(datetime.datetime.now().time()))

        return predictions

    cursor = fever_db.get_cursor()
    tok = CoreNLPTokenizer(annotators=['pos', 'lemma'])
    item_rb = ItemRuleBuilderSpiral(tokenizer=tok, cursor=cursor)
    p_dict = wn_persistent_api.persistence_load()

    model_path_dict = {
        'sselector': config.DATA_ROOT / 'models/sent_selector',
        'sselector_1': config.DATA_ROOT / 'models/sent_selector_1',
        'sselector_2': config.DATA_ROOT / 'models/sent_selector_2',
        'nn_doc_selector': config.DATA_ROOT / 'models/nn_doc_selector',
        'no_doc_nli': config.DATA_ROOT / 'models/nli',
    }

    # Preload the NN models
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")

    doc_retrieval_model = nn_doc_model.Model(weight=weight_dict['glove.840B.300d'],
                                             vocab_size=vocab.get_vocab_size('tokens'),
                                             embedding_dim=300, max_l=160,
                                             num_of_class=2)
    load_model(doc_retrieval_model, model_path_dict['nn_doc_selector'], device)

    sent_selector_model = simple_nnmodel.Model(weight=weight_dict['glove.840B.300d'],
                                               vocab_size=vocab.get_vocab_size('tokens'),
                                               embedding_dim=300, max_l=300,
                                               num_of_class=2)
    load_model(sent_selector_model, model_path_dict['sselector'], device)

    sent_selector_model_1 = simple_nnmodel.Model(weight=weight_dict['glove.840B.300d'],
                                                 vocab_size=vocab.get_vocab_size('tokens'),
                                                 embedding_dim=300, max_l=300,
                                                 num_of_class=2)
    load_model(sent_selector_model_1, model_path_dict['sselector_1'], device)

    sent_selector_model_2 = simple_nnmodel.Model(weight=weight_dict['glove.840B.300d'],
                                                 vocab_size=vocab.get_vocab_size('tokens'),
                                                 embedding_dim=300, max_l=300,
                                                 num_of_class=2)
    load_model(sent_selector_model_2, model_path_dict['sselector_2'], device)

    sent_selector_2_model = simple_nnmodel.Model(weight=weight_dict['glove.840B.300d'],
                                                 vocab_size=vocab.get_vocab_size('tokens'),
                                                 embedding_dim=300, max_l=300,
                                                 num_of_class=2)
    load_model(sent_selector_2_model, model_path_dict['sselector'], device)

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    dev_fever_data_reader = WNSIMIReader(token_indexers=token_indexers, lazy=True,
                                         wn_p_dict=p_dict, max_l=420)

    nli_model = mesim_wn_simi_v1_2.Model(
        rnn_size_in=(1024 + 300 + dev_fever_data_reader.wn_feature_size,
                     1024 + 450 + dev_fever_data_reader.wn_feature_size),
        rnn_size_out=(450, 450),
        weight=weight_dict['glove.840B.300d'],
        vocab_size=vocab.get_vocab_size('tokens'),
        mlp_d=900, embedding_dim=300, max_l=400)
    load_model(nli_model, model_path_dict['no_doc_nli'], device)

    logger.info('Finished loading models.')

    return caller(predict_pipeline)
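# A minimal sketch of the caller contract assumed by fever_app: predict_pipeline
# receives a list of {"claim": ...} dicts and returns, in order, dicts with
# "predicted_label" and "predicted_evidence". The caller below is hypothetical and
# not the submission harness used in the original system; the claim text is a
# made-up example.
#
# def run_once(predict):
#     claims = [{"claim": "Nikolaj Coster-Waldau worked with the Fox Broadcasting Company."}]
#     for pred in predict(claims):
#         print(pred["predicted_label"], pred["predicted_evidence"])
#
# fever_app(run_once)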
def get_additional_list(tokenized_data_file, additional_data_file,
                        item_key='prioritized_docids_aside', top_k=6):
    """
    This method selects all the sentences from the upstream doc retrieval
    and labels the correct evidence as true.
    :param item_key: The key that specifies the additional prioritized document ids.
    :param tokenized_data_file: Remember this is tokenized data with the original
        format containing 'evidence'.
    :param additional_data_file: This is the data after document retrieval.
        This file needs to contain the *"predicted_docids"* field.
    :return:
    """
    cursor = fever_db.get_cursor()
    d_list = load_jsonl(tokenized_data_file)
    additional_d_list = load_jsonl(additional_data_file)
    additional_data_dict = dict()

    for add_item in additional_d_list:
        additional_data_dict[int(add_item['id'])] = add_item

    full_data_list = []

    for item in tqdm(d_list):
        doc_ids_p_list = additional_data_dict[int(item['id'])][item_key]
        doc_ids = list(
            set([k for k, v in sorted(doc_ids_p_list,
                                      key=lambda x: (-x[1], x[0]))][:top_k]))

        # if not pred:
        #     if item['evidence'] is not None:
        #         e_list = utils.check_sentences.check_and_clean_evidence(item)
        #         all_evidence_set = set(itertools.chain.from_iterable(
        #             [evids.evidences_list for evids in e_list]))
        #     else:
        #         all_evidence_set = None
        #     # print(all_evidence_set)
        #     r_list = []
        #     id_list = []
        #     if all_evidence_set is not None:
        #         for doc_id, ln in all_evidence_set:
        #             _, text, _ = fever_db.get_evidence(cursor, doc_id, ln)
        #             r_list.append(text)
        #             id_list.append(doc_id + '(-.-)' + str(ln))
        # else:

        # If pred, then reset to not containing ground truth evidence.
        all_evidence_set = None
        r_list = []
        id_list = []

        for doc_id in doc_ids:
            cur_r_list, cur_id_list = fever_db.get_all_sent_by_doc_id(
                cursor, doc_id, with_h_links=False)
            # Merging to data list and removing duplicate
            for i in range(len(cur_r_list)):
                if cur_id_list[i] in id_list:
                    continue
                else:
                    r_list.append(cur_r_list[i])
                    id_list.append(cur_id_list[i])

        assert len(id_list) == len(set(id_list))  # check duplicate
        assert len(r_list) == len(id_list)

        zipped_s_id_list = list(zip(r_list, id_list))
        # Sort using id
        # sorted(evidences_set, key=lambda x: (x[0], x[1]))
        zipped_s_id_list = sorted(zipped_s_id_list,
                                  key=lambda x: (x[1][0], x[1][1]))

        all_sent_list = convert_to_formatted_sent(zipped_s_id_list,
                                                  all_evidence_set,
                                                  contain_head=True,
                                                  id_tokenized=True)
        cur_id = item['id']
        for i, sent_item in enumerate(all_sent_list):
            # selection_id is '[item_id]<##>[doc_id]<SENT_LINE>[line_number]'
            sent_item['selection_id'] = str(cur_id) + "<##>" + str(sent_item['sid'])
            sent_item['query'] = item['claim']
            full_data_list.append(sent_item)

    return full_data_list
def select_sent_with_prob_for_eval_list(input_file, additional_file, prob_dict_file,
                                        tokenized=False, pipeline=False, is_demo=False):
    """
    This method selects sentences using the upstream sentence retrieval results.
    :param input_file: This should be the file with 5 sentences selected.
    :return:
    """
    cursor = fever_db.get_cursor()

    if isinstance(additional_file, list):
        additional_d_list = additional_file
    else:
        additional_d_list = load_data(additional_file)

    additional_data_dict = dict()
    for add_item in additional_d_list:
        additional_data_dict[add_item['id']] = add_item

    d_list = input_file

    for item in tqdm(d_list):
        e_list = additional_data_dict[item['id']]['predicted_sentids']
        if not pipeline:
            assert additional_data_dict[item['id']]['label'] == item['label']
            assert additional_data_dict[item['id']]['verifiable'] == item['verifiable']
            assert additional_data_dict[item['id']]['id'] == item['id']

        pred_evidence_list = []
        for i, cur_e in enumerate(e_list):
            doc_id = cur_e.split(c_scorer.SENT_LINE)[0]
            ln = int(cur_e.split(c_scorer.SENT_LINE)[1])  # Important changes Bugs: July 21
            pred_evidence_list.append((doc_id, ln))

        pred_evidence = check_sentences.Evidences(pred_evidence_list)
        evidence_text_list = evidence_list_to_text_list(cursor,
                                                        pred_evidence,
                                                        contain_head=True,
                                                        id_tokenized=tokenized)

        evidences = sorted(pred_evidence, key=lambda x: (x[0], x[1]))
        item_id = int(item['id'])

        evidence_text_list_with_prob = []
        for text, (doc_id, ln) in zip(evidence_text_list, evidences):
            ssid = (item_id, doc_id, int(ln))
            if ssid not in prob_dict_file:
                print("Some sentence pair doesn't have 'prob'.")
                prob = 0.5
            else:
                prob = prob_dict_file[ssid]['prob']
                assert item['claim'] == prob_dict_file[ssid]['claim']

            evidence_text_list_with_prob.append((text, prob))

        if not tokenized:
            item['claim'] = ' '.join(easy_tokenize(item['claim']))

        item['evid'] = evidence_text_list_with_prob
        item['predicted_evidence'] = convert_evidence2scoring_format(e_list)
        item['predicted_sentids'] = e_list
        # This change needs to be saved.
        # item['predicted_label'] = additional_data_dict[item['id']]['label']

    return d_list
def adv_simi_sample_with_prob_v1_1(input_file, additional_file, prob_dict_file,
                                   tokenized=False):
    cursor = fever_db.get_cursor()
    d_list = load_data(input_file)

    if isinstance(additional_file, list):
        additional_d_list = additional_file
    else:
        additional_d_list = load_data(additional_file)

    additional_data_dict = dict()
    for add_item in additional_d_list:
        additional_data_dict[add_item['id']] = add_item

    sampled_data_list = []
    count = 0

    for item in tqdm(d_list):
        # e_list = check_sentences.check_and_clean_evidence(item)
        sampled_e_list, flags = sample_additional_data_for_item_v1_1(
            item, additional_data_dict)
        # print(flags)

        for i, (sampled_evidence, flag) in enumerate(zip(sampled_e_list, flags)):
            # Do not copy, might change in the future for error analysis
            # new_item = copy.deepcopy(item)
            new_item = dict()
            # print(new_item['claim'])
            # print(e_list)
            # print(sampled_evidence)
            # print(flag)
            evidence_text_list = evidence_list_to_text_list(
                cursor, sampled_evidence,
                contain_head=True, id_tokenized=tokenized)

            evidences = sorted(sampled_evidence, key=lambda x: (x[0], x[1]))
            item_id = int(item['id'])

            evidence_text_list_with_prob = []
            for text, (doc_id, ln) in zip(evidence_text_list, evidences):
                ssid = (int(item_id), doc_id, int(ln))
                if ssid not in prob_dict_file:
                    count += 1
                    print("Some sentence pair doesn't have 'prob'.")
                    prob = 0.5
                else:
                    prob = prob_dict_file[ssid]['prob']
                    assert item['claim'] == prob_dict_file[ssid]['claim']

                evidence_text_list_with_prob.append((text, prob))

            new_item['id'] = str(item['id']) + '#' + str(i)

            if tokenized:
                new_item['claim'] = item['claim']
            else:
                new_item['claim'] = ' '.join(easy_tokenize(item['claim']))

            new_item['evid'] = evidence_text_list_with_prob
            new_item['verifiable'] = item['verifiable']
            new_item['label'] = item['label']

            # print("C:", new_item['claim'])
            # print("E:", new_item['evid'])
            # print("L:", new_item['label'])
            # print()
            sampled_data_list.append(new_item)

    cursor.close()
    print(count)
    return sampled_data_list
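# A hedged sketch of the prob_dict_file structure the two functions above rely on:
# it is keyed by (claim_id, doc_id, line_number) and holds the upstream sentence
# probability plus the claim text used in the consistency assert. The entry below
# is synthetic, purely to show the shape.
#
# prob_dict_file = {
#     (137334, 'Fox_Broadcasting_Company', 0): {
#         'prob': 0.97,
#         'claim': 'Nikolaj Coster-Waldau worked with the Fox Broadcasting Company .',
#     },
# }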
d_list = common.load_jsonl(
    "/home/easonnie/projects/FunEver/results/doc_retri_bls/docretri.basic.nopageview/dev.jsonl")
# d_list = common.load_jsonl("/Users/Eason/RA/FunEver/results/doc_retri_bls/docretri.pageview/dev.jsonl")

# filtered_list = []
# for item in d_list:
#     if filter_contain_parenthese(item):
#         if filter_contain_parenthese_valid(item):
#             filtered_list.append(item)
# d_list = filtered_list

pos_count = 0
neg_count = 0
cursor = fever_db.get_cursor()
p_list, n_list = [], []
# inference_list = []

# train_list = sample_disamb_training(d_list, cursor, sample_ratio=1.0)
# print("Length:", len(train_list))

# for item in d_list:
#     positive_list, negative_list = disabuigation_training_build(item, cursor, contain_first_sentence=True)
#     p_list.extend(positive_list)
#     n_list.extend(negative_list)

# for item in d_list:
#     inference_list.extend(inference_build(item, cursor, contain_first_sentence=False))

# inference_list = sample_disamb_inference(d_list, cursor)

train_list = sample_disamb_training_v0(d_list, cursor, only_found=False)
def train_fever_v1():
    num_epoch = 10
    seed = 12
    batch_size = 128
    dev_batch_size = 128
    # experiment_name = "simple_nn_doc_first_sent"
    experiment_name = "simple_nn_doc"
    lazy = True
    torch.manual_seed(seed)
    contain_first_sentence = False
    pn_ratio = 1.0
    # keep_neg_sample_prob = 0.4
    # sample_prob_decay = 0.05

    dev_upstream_file = config.RESULT_PATH / "doc_retri_bls/docretri.basic.nopageview/dev.jsonl"
    train_upstream_file = config.RESULT_PATH / "doc_retri_bls/docretri.basic.nopageview/train.jsonl"

    dev_data_list = common.load_jsonl(dev_upstream_file)

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    train_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy, max_l=180)
    # dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=False)
    dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy, max_l=180)

    cursor = fever_db.get_cursor()
    complete_upstream_dev_data = disamb.sample_disamb_inference(
        common.load_jsonl(dev_upstream_file), cursor,
        contain_first_sentence=contain_first_sentence)
    print("Dev size:", len(complete_upstream_dev_data))
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)
    dev_biterator = BasicIterator(batch_size=dev_batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")

    # This is important: register the selection labels in the vocabulary.
    vocab.add_token_to_namespace("true", namespace="selection_labels")
    vocab.add_token_to_namespace("false", namespace="selection_labels")
    vocab.add_token_to_namespace("hidden", namespace="selection_labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='selection_labels')

    # Label value
    vocab.get_index_to_token_vocabulary('selection_labels')

    print(vocab.get_token_to_index_vocabulary('selection_labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)
    dev_biterator.index_with(vocab)

    # exit(0)
    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=160, num_of_class=2)

    model.display()
    model.to(device)

    # Create Log File
    file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
    # Save the source code.
    script_name = os.path.basename(__file__)
    with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, \
            open(__file__, 'r') as it:
        out_f.write(it.read())
        out_f.flush()
    # Save source code end.

    best_dev = -1
    iteration = 0

    start_lr = 0.0002
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                           lr=start_lr)
    criterion = nn.CrossEntropyLoss()

    for i_epoch in range(num_epoch):
        print("Resampling...")
        # Resampling
        complete_upstream_train_data = disamb.sample_disamb_training_v0(
            common.load_jsonl(train_upstream_file),
            cursor, pn_ratio, contain_first_sentence)
        print("Sample Prob.:", pn_ratio)
        print("Sampled_length:", len(complete_upstream_train_data))
        sampled_train_instances = train_fever_data_reader.read(complete_upstream_train_data)

        train_iter = biterator(sampled_train_instances, shuffle=True, num_epochs=1,
                               cuda_device=device_num)
        for i, batch in tqdm(enumerate(train_iter)):
            model.train()
            out = model(batch)
            y = batch['selection_label']

            loss = criterion(out, y)

            # No decay
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            iteration += 1

            if i_epoch <= 5:
                mod = 1000
            else:
                mod = 500

            if iteration % mod == 0:
                eval_iter = dev_biterator(dev_instances, shuffle=False, num_epochs=1,
                                          cuda_device=device_num)
                complete_upstream_dev_data = hidden_eval(model, eval_iter,
                                                         complete_upstream_dev_data)

                disamb.enforce_disabuigation_into_retrieval_result_v0(
                    complete_upstream_dev_data, dev_data_list)
                oracle_score, pr, rec, f1 = c_scorer.fever_doc_only(dev_data_list, dev_data_list,
                                                                    max_evidence=5)

                print(f"Dev(raw_acc/pr/rec/f1):{oracle_score}/{pr}/{rec}/{f1}")
                print("Strict score:", oracle_score)
                print(f"Eval Tracking score:", f"{oracle_score}")

                need_save = False
                if oracle_score > best_dev:
                    best_dev = oracle_score
                    need_save = True

                if need_save:
                    save_path = os.path.join(
                        file_path_prefix,
                        f'i({iteration})_epoch({i_epoch})_'
                        f'(tra_score:{oracle_score}|pr:{pr}|rec:{rec}|f1:{f1})')
                    torch.save(model.state_dict(), save_path)

        # print("Epoch Evaluation...")
        eval_iter = dev_biterator(dev_instances, shuffle=False, num_epochs=1,
                                  cuda_device=device_num)
        complete_upstream_dev_data = hidden_eval(model, eval_iter,
                                                 complete_upstream_dev_data)

        disamb.enforce_disabuigation_into_retrieval_result_v0(
            complete_upstream_dev_data, dev_data_list)
        oracle_score, pr, rec, f1 = c_scorer.fever_doc_only(dev_data_list, dev_data_list,
                                                            max_evidence=5)

        print(f"Dev(raw_acc/pr/rec/f1):{oracle_score}/{pr}/{rec}/{f1}")
        print("Strict score:", oracle_score)
        print(f"Eval Tracking score:", f"{oracle_score}")

        need_save = False
        if oracle_score > best_dev:
            best_dev = oracle_score
            need_save = True

        if need_save:
            save_path = os.path.join(
                file_path_prefix,
                f'i({iteration})_epoch({i_epoch})_e'
                f'(tra_score:{oracle_score}|pr:{pr}|rec:{rec}|f1:{f1})')
            torch.save(model.state_dict(), save_path)
def get_full_list(tokenized_data_file, additional_data_file, pred=False, top_k=None):
    """
    This method selects all the sentences from the upstream doc retrieval
    and labels the correct evidence as true.
    :param tokenized_data_file: Remember this is tokenized data with the original
        format containing 'evidence'.
    :param additional_data_file: This is the data after document retrieval.
        This file needs to contain the *"predicted_docids"* field.
    :return:
    """
    cursor = fever_db.get_cursor()
    d_list = load_jsonl(tokenized_data_file)

    if not isinstance(additional_data_file, list):
        additional_d_list = load_jsonl(additional_data_file)
    else:
        additional_d_list = additional_data_file

    if top_k is not None:
        print("Upstream document number truncate to:", top_k)
        trucate_item(additional_d_list, top_k=top_k)

    additional_data_dict = dict()

    for add_item in additional_d_list:
        additional_data_dict[add_item['id']] = add_item

    full_data_list = []

    for item in tqdm(d_list):
        doc_ids = additional_data_dict[item['id']]["predicted_docids"]

        if not pred:
            if item['evidence'] is not None:
                e_list = utils.check_sentences.check_and_clean_evidence(item)
                all_evidence_set = set(
                    itertools.chain.from_iterable(
                        [evids.evidences_list for evids in e_list]))
            else:
                all_evidence_set = None
            # print(all_evidence_set)
            r_list = []
            id_list = []

            if all_evidence_set is not None:
                for doc_id, ln in all_evidence_set:
                    _, text, _ = fever_db.get_evidence(cursor, doc_id, ln)
                    r_list.append(text)
                    id_list.append(doc_id + '(-.-)' + str(ln))
        else:
            # If pred, then reset to not containing ground truth evidence.
            all_evidence_set = None
            r_list = []
            id_list = []

        for doc_id in doc_ids:
            cur_r_list, cur_id_list = fever_db.get_all_sent_by_doc_id(
                cursor, doc_id, with_h_links=False)
            # Merging to data list and removing duplicate
            for i in range(len(cur_r_list)):
                if cur_id_list[i] in id_list:
                    continue
                else:
                    r_list.append(cur_r_list[i])
                    id_list.append(cur_id_list[i])

        assert len(id_list) == len(set(id_list))  # check duplicate
        assert len(r_list) == len(id_list)

        zipped_s_id_list = list(zip(r_list, id_list))
        # Sort using id
        # sorted(evidences_set, key=lambda x: (x[0], x[1]))
        zipped_s_id_list = sorted(zipped_s_id_list,
                                  key=lambda x: (x[1][0], x[1][1]))

        all_sent_list = convert_to_formatted_sent(zipped_s_id_list,
                                                  all_evidence_set,
                                                  contain_head=True,
                                                  id_tokenized=True)
        cur_id = item['id']
        for i, sent_item in enumerate(all_sent_list):
            sent_item['selection_id'] = str(cur_id) + "<##>" + str(sent_item['sid'])
            sent_item['query'] = item['claim']

            if 'label' in item.keys():
                sent_item['claim_label'] = item['label']

            full_data_list.append(sent_item)

    return full_data_list
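# A minimal usage sketch for get_full_list (file names are placeholders): with
# pred=True the ground-truth evidence is ignored and every sentence of the top
# retrieved documents becomes a candidate, each tagged with a selection_id of the
# form '[item_id]<##>[doc_id]<SENT_LINE>[line_number]' and the claim as its 'query'.
#
# candidates = get_full_list("tokenized_dev.jsonl", "doc_retrieval_dev.jsonl",
#                            pred=True, top_k=10)
# print(candidates[0]['selection_id'], candidates[0]['query'])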