def __init__(self, base_path, train_file, dev_file, test_file, fasttext_path,
             num_negatives, h_max_length, s_max_length, random_seed,
             reserve_embed=False, db_filepath="data/fever/fever.db",
             joint_learning=False):
    self.random_seed = random_seed
    self.base_path = base_path
    self.train_file = train_file
    self.dev_file = dev_file
    self.test_file = test_file
    self.fasttext_path = fasttext_path
    self.num_negatives = num_negatives
    self.h_max_length = h_max_length
    self.s_max_length = s_max_length
    self.db_filepath = os.path.join(self.base_path, db_filepath)
    self.db = FeverDocDB(self.db_filepath)
    self.reserve_embed = reserve_embed
    self.data_pipeline()
    if joint_learning:
        self.data_for_joint()
def show_predictions(db_filename, predictions):
    """
    Display the claim and predicted sentences for predictions that do not
    fully cover at least one gold evidence set.
    :param db_filename: path to the wiki-pages database
    :param predictions: iterable of prediction dicts
    :return:
    """
    db = FeverDocDB(db_filename)
    for line in predictions:
        if line['label'].upper() != "NOT ENOUGH INFO":
            macro_rec = evidence_macro_recall(line)
            if macro_rec[0] == 1.0:
                continue
            pages = set([page for page, _ in line['predicted_evidence']])
            evidence_set = set([(page, line_num)
                                for page, line_num in line['predicted_evidence']])
            p_lines = []
            for page in pages:
                doc_lines = db.get_doc_lines(page)
                if not doc_lines:
                    # print(page)
                    continue
                doc_lines = [doc_line.split("\t")[1]
                             if len(doc_line.split("\t")[1]) > 1 else ""
                             for doc_line in doc_lines.split("\n")]
                p_lines.extend(zip(doc_lines, [page] * len(doc_lines),
                                   range(len(doc_lines))))
            print("claim: {}".format(line['claim']))
            print(evidence_set)
            count = 0
            for doc_line in p_lines:
                if (doc_line[1], doc_line[2]) in evidence_set:
                    print("evidence {}: {}".format(count, doc_line[0]))
                    count += 1
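# Hedged usage sketch for show_predictions. The predictions file is assumed to
# hold one json object per line with 'label', 'evidence', and
# 'predicted_evidence' fields; both paths are illustrative.
import json

with open("data/tmp/sen_preds.jsonl") as f:
    preds = [json.loads(line) for line in f]
# Prints every claim whose predicted sentences miss all gold evidence sets.
show_predictions("data/fever/fever.db", preds)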
def __init__(self, embedding_path, train_file, dev_file, test_file,
             fasttext_path, num_negatives, h_max_length, s_max_length,
             random_seed, reserve_embed=False,
             db_filepath="data/fever/fever.db"):
    self.random_seed = random_seed
    self.embedding_path = embedding_path
    self.train_file = train_file
    self.dev_file = dev_file
    self.test_file = test_file
    self.fasttext_path = fasttext_path
    self.num_negatives = num_negatives
    self.h_max_length = h_max_length
    self.s_max_length = s_max_length
    self.db_filepath = db_filepath
    self.db = FeverDocDB(self.db_filepath)
    self.reserve_embed = reserve_embed
    self.data_pipeline()
def __init__(self, database_path, add_claim=False, k_wiki_results=None):
    self.db = FeverDocDB(database_path)
    self.add_claim = add_claim
    self.k_wiki_results = k_wiki_results
    self.proter_stemm = nltk.PorterStemmer()
    self.tokenizer = nltk.word_tokenize
    self.predictor = Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo-constituency-parser-2018.03.14.tar.gz")
def in_doc_sampling(db_filename, datapath, num_sample=1):
    db = FeverDocDB(db_filename)
    jlr = JSONLineReader()
    X = []
    count = 0
    with open(datapath, "r") as f:
        lines = jlr.process(f)
        for line in tqdm(lines):
            count += 1
            if line['label'].upper() == "NOT ENOUGH INFO":
                continue
            neg_sents = []
            claim = line['claim']
            pos_set = set()
            for evidence_set in line['evidence']:
                pos_sent = get_whole_evidence(evidence_set, db)
                if pos_sent in pos_set:
                    continue
                pos_set.add(pos_sent)
            p_lines = []
            evidence_set = set([(evidence[2], evidence[3])
                                for evidences in line['evidence']
                                for evidence in evidences])
            page_set = set([evidence[0] for evidence in evidence_set])
            for page in page_set:
                doc_lines = db.get_doc_lines(page)
                p_lines.extend(get_valid_texts(doc_lines, page))
            for doc_line in p_lines:
                if (doc_line[1], doc_line[2]) not in evidence_set:
                    neg_sents.append(doc_line[0])
            num_sampling = num_sample
            if len(neg_sents) < num_sampling:
                num_sampling = len(neg_sents)
            if num_sampling == 0:
                continue
            for pos_sent in pos_set:
                samples = random.sample(neg_sents, num_sampling)
                for sample in samples:
                    if not sample:
                        continue
                    X.append((claim, pos_sent, sample))
                    if count % 1000 == 0:
                        print("claim:{} ,evidence :{} sample:{}".format(
                            claim, pos_sent, sample))
    return X
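# Hedged usage sketch for in_doc_sampling: builds (claim, positive, negative)
# triples, drawing negatives from the gold evidence pages themselves. The
# file paths are illustrative.
triples = in_doc_sampling("data/fever/fever.db",
                          "data/fever-data/train.jsonl",
                          num_sample=2)
claim, pos_sent, neg_sent = triples[0]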
def load_words(embedding_file, train_datapath, test_path, db_filename,
               num_sample, sampled_path):
    words = set()

    def _insert(iterable):
        for w in iterable:
            w = Dictionary.normalize(w)
            if valid_words and w not in valid_words:
                continue
            words.add(w)

    valid_words = index_embedding_words(embedding_file)
    X_claim, X_sents, y = load_generate_samples(db_filename, train_datapath,
                                                num_sample, sampled_path)
    X_claim = set(X_claim)
    for claim in X_claim:
        # tokenize into a separate variable; rebinding `words` here would
        # shadow the set that _insert closes over
        tokens = nltk.word_tokenize(claim)
        _insert(tokens)
    for sent in X_sents:
        tokens = simple_tokenizer(sent)
        _insert(tokens)
    with open(test_path, "r") as f:
        jlr = JSONLineReader()
        db = FeverDocDB(db_filename)
        lines = jlr.process(f)
        for line in lines:
            claim = line['claim']
            tokens = nltk.word_tokenize(claim)
            _insert(tokens)
            evidence_set = set([(evidence[2], evidence[3])
                                for evidences in line['evidence']
                                for evidence in evidences])
            pages = set()
            pages.update(evidence[0] for evidence in line['predicted_pages'])
            pages.update(evidence[0] for evidence in evidence_set)
            for page in pages:
                doc_lines = db.get_doc_lines(page)
                if not doc_lines:
                    continue
                doc_lines = [doc_line.split("\t")[1]
                             if len(doc_line.split("\t")[1]) > 1 else ""
                             for doc_line in doc_lines.split("\n")]
                doc_lines = [doc_line for doc_line in doc_lines if doc_line]
                for doc_line in doc_lines:
                    tokens = simple_tokenizer(doc_line)
                    _insert(tokens)
    return words
def test_data_4_siamese(db_filename, dataset_path):
    db = FeverDocDB(db_filename)
    jlr = JSONLineReader()
    X_claims = []
    X_sents = []
    all_sents_id = []
    with open(dataset_path, "r") as f:
        lines = jlr.process(f)
        # lines = lines[:1000]
        for line in tqdm(lines):
            claims = []
            sents = []
            sents_indexes = []
            p_lines = []
            claim = line['claim']
            pages = set()
            pages.update(evidence[0] for evidence in line['predicted_pages'])
            for page in pages:
                doc_lines = db.get_doc_lines(page)
                if not doc_lines:
                    continue
                doc_lines = [doc_line.split("\t")[1]
                             if len(doc_line.split("\t")[1]) > 1 else ""
                             for doc_line in doc_lines.split("\n")]
                p_lines.extend(zip(doc_lines, [page] * len(doc_lines),
                                   range(len(doc_lines))))
            for doc_line in p_lines:
                if not doc_line[0]:
                    continue
                claims.append(claim)
                sents.append(doc_line[0])
                sents_indexes.append((doc_line[1], doc_line[2]))
            X_claims.append(claims)
            X_sents.append(sents)
            all_sents_id.append(sents_indexes)
    # print(len(X_claims))
    # print(len(X_sents))
    # print(len(all_sents_id))
    # X_claims_indexes, X_sents_indexes = [], []
    # for idx, claims in enumerate(X_claims):
    #     claims_index, sents_index = data_transformer(claims, X_sents[idx], word_dict)
    #     X_claims_indexes.append(claims_index)
    #     X_sents_indexes.append(sents_index)
    return X_claims, X_sents, all_sents_id
def __init__(self, fasttext_path, h_max_length, s_max_length,
             reserve_embed=False, model_location=os.getcwd(),
             db_filepath="data/fever/fever.db"):
    self.model_location = model_location
    self.fasttext_path = fasttext_path
    self.h_max_length = h_max_length
    self.s_max_length = s_max_length
    self.db = FeverDocDB(db_filepath)
    self.reserve_embed = reserve_embed
def _create_token_set_of_db(db):
    logger.debug("start creating token set for DB...")
    if type(db) == str:
        db = FeverDocDB(db)
    _token_set = set()
    for doc_id in tqdm(db.get_non_empty_doc_ids()):
        doc_lines = db.get_doc_lines(doc_id)
        for line in doc_lines:
            tokens = tokenize(clean_text(line))
            for token in tokens:
                if token.lower() in _token_set:
                    continue
                _token_set.add(token.lower())
    return _token_set
def number_feature(data_set_path: str, db_path: str, max_sent_num: int):
    from common.dataset.reader import JSONLineReader
    db = FeverDocDB(db_path)
    jlr = JSONLineReader()
    lines = jlr.read(data_set_path)
    num_feat = np.zeros([len(lines), max_sent_num, 3], dtype=np.int32)
    for i, line in enumerate(lines):
        claim_text = line['claim']
        claim_tokens = tokenize(claim_text)
        all_nums = set()
        for token in claim_tokens:
            if is_token_numeric(token):
                all_nums.add(float(token))
        for j, evidence in enumerate(line['predicted_evidence']):
            if j >= max_sent_num:
                break
            page, line_num = evidence[-2], evidence[-1]
            all_evidence_nums = []
            evidence_text = evidence_num_to_text(db, page, line_num)
            evidence_tokens = tokenize(evidence_text)
            for token in evidence_tokens:
                if is_token_numeric(token):
                    all_evidence_nums.append(float(token))
            has_num = len(all_evidence_nums) > 0
            has_identical_num = any(n in all_nums for n in all_evidence_nums)
            has_different_num = any(n not in all_nums for n in all_evidence_nums)
            num_feat[i][j][0], num_feat[i][j][1], num_feat[i][j][2] = \
                _interprete_num_result(has_num, has_identical_num, has_different_num)
    return num_feat
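# Hedged usage sketch for number_feature. Each of the max_sent_num evidence
# slots gets three flags derived from (has_num, has_identical_num,
# has_different_num) via _interprete_num_result. The predictions file must
# carry 'predicted_evidence'; the paths are illustrative.
feats = number_feature("data/tmp/sen_preds.jsonl", "data/fever/fever.db",
                       max_sent_num=5)
print(feats.shape)  # (num_claims, 5, 3)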
def __init__(self, base_path, train_file, dev_file, test_file, num_negatives,
             h_max_length, s_max_length, random_seed=100,
             db_filepath="data/fever/fever.db"):
    self.random_seed = random_seed
    self.base_path = base_path
    self.train_file = train_file
    self.dev_file = dev_file
    self.test_file = test_file
    self.num_negatives = num_negatives
    self.h_max_length = h_max_length
    self.s_max_length = s_max_length
    self.db_filepath = os.path.join(self.base_path, db_filepath)
    self.db = FeverDocDB(self.db_filepath)
    self.data_pipeline()
def _create_db_vocab_idx(db, _global_dict):
    # logger = LogHelper.get_logger("_create_db_vocab_idx")
    logger.debug("start creating vocab indices for DB...")
    if type(db) == str:
        db = FeverDocDB(db)
    _vocab_idx = {}
    for doc_id in tqdm(db.get_non_empty_doc_ids()):
        doc_lines = db.get_doc_lines(doc_id)
        for line in doc_lines:
            tokens = tokenize(clean_text(line))
            for token in tokens:
                if token.lower() in _vocab_idx:
                    continue
                if token.lower() in _global_dict:
                    _vocab_idx[token.lower()] = _global_dict[token.lower()]
    _vocab_idx = sorted(list(_vocab_idx.values()))
    return _vocab_idx
def dev_processing(db_filename, lines):
    db = FeverDocDB(db_filename)
    claims = []
    list_sents = []
    labels = []
    for line in tqdm(lines):
        if line['label'].upper() == "NOT ENOUGH INFO":
            continue
        sents = []
        label = []
        evidence_set = set([(evidence[2], evidence[3])
                            for evidences in line['evidence']
                            for evidence in evidences])
        pages = [page[0] for page in line['predicted_pages']
                 if page[0] is not None]
        for page, num in evidence_set:
            pages.append(page)
        pages = set(pages)
        p_lines = []
        for page in pages:
            doc_lines = db.get_doc_lines(page)
            p_lines.extend(get_valid_texts(doc_lines, page))
        for doc_line in p_lines:
            if not doc_line[0]:
                continue
            if (doc_line[1], doc_line[2]) in evidence_set:
                sents.append(doc_line[0])
                label.append(1)
            else:
                sents.append(doc_line[0])
                label.append(0)
        # skip claims with no candidate sentences, and only append the claim
        # together with its sentences so the three lists stay aligned
        if len(sents) == 0 or len(label) == 0:
            continue
        claims.append(line['claim'])
        list_sents.append(sents)
        labels.append(label)
    return claims, list_sents, labels
def from_params(cls, params: Params) -> 'FEVERSentenceReader':
    claim_tokenizer = Tokenizer.from_params(params.pop('claim_tokenizer', {}))
    wiki_tokenizer = Tokenizer.from_params(params.pop('wiki_tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    db = FeverDocDB(params.pop("db_path", "data/fever/fever.db"))
    params.assert_empty(cls.__name__)
    return FEVERSentenceReader(db=db,
                               claim_tokenizer=claim_tokenizer,
                               wiki_tokenizer=wiki_tokenizer,
                               token_indexers=token_indexers)
def eval_da(dataset_to_work_on, args, operation, mithun_logger):
    LogHelper.setup()
    LogHelper.get_logger("allennlp.training.trainer")
    LogHelper.get_logger(__name__)
    params = Params.from_file(args.param_path, args.overrides)
    uofa_params = params.pop('uofa_params', {})
    path_to_saved_db = uofa_params.pop("path_to_saved_db")
    db = FeverDocDB(path_to_saved_db)

    # pop the shared configuration once, before logging, so the logged values
    # are defined (the original logged them before assignment and popped the
    # same key twice in the fnc branch)
    fever_dataset_details = uofa_params.pop('fever_dataset_details', {})
    dev_partition_details = fever_dataset_details.pop('dev_partition_details', {})
    name_of_trained_model_to_use = dev_partition_details.pop(
        'name_of_trained_model_to_use', {})
    path_to_pyproc_annotated_data_folder = dev_partition_details.pop(
        'path_to_pyproc_annotated_data_folder', {})
    debug_mode = uofa_params.pop('debug_mode', {})
    path_to_trained_models_folder = uofa_params.pop(
        'path_to_trained_models_folder', {})
    # assumes args carries a cuda_device (the original referenced an
    # undefined cuda_device variable)
    cuda_device = args.cuda_device

    mithun_logger.info("inside main function going to call eval on " + str(dataset_to_work_on))
    mithun_logger.info("path_to_pyproc_annotated_data_folder " + str(path_to_pyproc_annotated_data_folder))
    mithun_logger.info("value of name_of_trained_model_to_use: " + str(name_of_trained_model_to_use))
    mithun_logger.info("value of dataset_to_work_on: " + str(dataset_to_work_on))

    if dataset_to_work_on == "fnc":
        eval_model_fnc_data(db, args, mithun_logger,
                            name_of_trained_model_to_use,
                            path_to_trained_models_folder, cuda_device,
                            operation, path_to_pyproc_annotated_data_folder)
    elif dataset_to_work_on == "fever":
        eval_model(db, args, mithun_logger, path_to_trained_models_folder,
                   name_of_trained_model_to_use)
def test_processing(db_filename, lines):
    db = FeverDocDB(db_filename)
    claims = []
    list_sents = []
    sents_indexes = []
    for line in tqdm(lines):
        # if line['label'].upper() == "NOT ENOUGH INFO":
        #     continue
        claims.append(line['claim'])
        sents = []
        sents_index = []
        evidence_set = set([(evidence[2], evidence[3])
                            for evidences in line['evidence']
                            for evidence in evidences])
        pages = set([page[0] for page in line['predicted_pages']
                     if page[0] is not None])
        if len(pages) == 0:
            pages.add("Michael_Hutchence")
        p_lines = []
        for page in pages:
            doc_lines = db.get_doc_lines(page)
            p_lines.extend(get_valid_texts(doc_lines, page))
        for doc_line in p_lines:
            if not doc_line[0]:
                continue
            # the original if/else appended the sentence in both branches;
            # keep one append and record the (page, line) index for every
            # kept sentence so sents and sents_index stay aligned
            sents.append(doc_line[0])
            sents_index.append((doc_line[1], doc_line[2]))
        list_sents.append(sents)
        sents_indexes.append(sents_index)
    return claims, list_sents, sents_indexes
def dev_processing(db_filename, datapath):
    db = FeverDocDB(db_filename)
    jlr = JSONLineReader()
    devs = []
    all_indexes = []
    with open(datapath, "rb") as f:
        lines = jlr.process(f)
        for line in tqdm(lines):
            dev = []
            indexes = []
            pages = set()
            pages.update(page[0] for page in line['predicted_pages'])
            if len(pages) == 0:
                pages.add("Michael_Hutchence")
            claim = line['claim']
            p_lines = []
            for page in pages:
                doc_lines = db.get_doc_lines(page)
                if not doc_lines:
                    continue
                p_lines.extend(get_valid_texts(doc_lines, page))
            for doc_line in p_lines:
                if not doc_line[0]:
                    continue
                dev.append((claim, doc_line[0]))
                indexes.append((doc_line[1], doc_line[2]))
            # print(len(dev))
            if len(dev) == 0:
                dev.append((claim, 'no evidence for this claim'))
                indexes.append(('empty', 0))
            devs.append(dev)
            all_indexes.append(indexes)
    return devs, all_indexes
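# Hedged usage sketch: dev_processing pairs every claim with each candidate
# sentence from its predicted pages, plus the (page, line) index of each pair.
# The jsonl path is illustrative.
devs, all_indexes = dev_processing("data/fever/fever.db",
                                   "data/fever/dev.p5.jsonl")
first_claim_pairs = devs[0]             # [(claim, sentence), ...]
first_claim_locations = all_indexes[0]  # [(page, line_num), ...]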
def from_params(cls, params: Params) -> 'FEVERReader':
    claim_tokenizer = Tokenizer.from_params(params.pop('claim_tokenizer', {}))
    wiki_tokenizer = Tokenizer.from_params(params.pop('wiki_tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    sentence_level = params.pop("sentence_level", False)
    is_snopes = params.pop("is_snopes", False)
    if is_snopes:
        db = SnopesDocDB(params.pop("db_path", "dataset/snopes.pages.json"))
    else:
        db = FeverDocDB(params.pop("db_path", "data/fever.db"))
    params.assert_empty(cls.__name__)
    return FEVERReader(db=db,
                       sentence_level=sentence_level,
                       claim_tokenizer=claim_tokenizer,
                       wiki_tokenizer=wiki_tokenizer,
                       token_indexers=token_indexers)
LogHelper.get_logger("allennlp.training.trainer")
LogHelper.get_logger(__name__)

parser = argparse.ArgumentParser()
parser.add_argument('db', type=str, help='/path/to/saved/db.db')
parser.add_argument('param_path',
                    type=str,
                    help='path to parameter file describing the model to be trained')
parser.add_argument("logdir", type=str)
parser.add_argument("--filtering", type=str, default=None)
parser.add_argument("--cuda-device", type=int, default=None,
                    help='id of GPU to use (if any)')
parser.add_argument('-o', '--overrides',
                    type=str,
                    default="",
                    help='a HOCON structure used to override the experiment configuration')
args = parser.parse_args()

db = FeverDocDB(args.db)
params = Params.from_file(args.param_path, args.overrides)
train_model(db, params, args.cuda_device, args.logdir, args.filtering)
def read_data_set_from_jsonl(file_path: str, db: Union[str, FeverDocDB],
                             predicted: bool = True, num_sentences=None,
                             is_snopes=False):
    logger = LogHelper.get_logger("read_data_set_from_jsonl")
    if type(db) is str:
        # for Snopes the "db" argument is a path to a json page dump; for
        # FEVER it is a path to a FeverDocDB (the original nesting could pass
        # a FeverDocDB instance to open())
        if is_snopes:
            with open(db) as f:
                db = json.load(f)
        else:
            db = FeverDocDB(db)
    with open(file_path, 'r') as f:
        claims = []
        evidences = []
        paths = []
        labels = []
        ids = []
        for line in tqdm(f):
            json_obj = json.loads(line)
            if predicted:
                evidences_texts = []
                if 'predicted_evidence' in json_obj:
                    _evidences = json_obj['predicted_evidence']
                elif 'predicted_sentences' in json_obj:
                    _evidences = json_obj['predicted_sentences']
                else:
                    _evidences = []
                if len(_evidences) > 0:
                    for sent in _evidences:
                        page, line_num = sent[-2], sent[-1]
                        page_title = page.replace("_", " ")
                        # page_title + " # " + clean_text(evidence_num_to_text(db, page, line_num, is_snopes))
                        evidences_texts.append(
                            clean_text(evidence_num_to_text(db, page, line_num, is_snopes)))
            else:
                evidences_texts = set()
                _evidences = json_obj['evidence']
                for evidence in _evidences:
                    for sent in evidence:
                        page, line_num = sent[-2], sent[-1]
                        page_title = page.replace("_", " ")
                        # page_title + " # " + clean_text(evidence_num_to_text(db, page, line_num, is_snopes))
                        evidences_texts.add(
                            clean_text(evidence_num_to_text(db, page, line_num, is_snopes)))
                evidences_texts = list(evidences_texts)
            if len(evidences_texts) == 0:
                continue
            if num_sentences is not None:
                if len(evidences_texts) > num_sentences:
                    evidences_texts = evidences_texts[:num_sentences]
            claims.append(clean_text(json_obj['claim']))
            if 'label' in json_obj:
                labels.append(label_dict.index(json_obj['label']))
            evidences.append(evidences_texts)
            if 'paths' in json_obj:
                paths_from_sent_to_claim = [1.0 if p else 0.0
                                            for p in json_obj['paths']]
                if num_sentences is not None and num_sentences > len(paths_from_sent_to_claim):
                    paths_from_sent_to_claim += [0.0] * (
                        num_sentences - len(paths_from_sent_to_claim))
                paths.append(paths_from_sent_to_claim)
            ids.append(json_obj['id'])
    datas = {'h': claims, 'b': evidences, 'id': ids}
    if paths:
        datas['paths'] = paths
    return datas, labels
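# Hedged usage sketch: read a predicted-evidence jsonl into the dict format
# the downstream model expects ('h' claims, 'b' evidence lists, 'id' ids).
# label_dict ordering comes from the surrounding module; the file path is
# illustrative.
datas, labels = read_data_set_from_jsonl("data/tmp/sen_preds.jsonl",
                                         "data/fever/fever.db",
                                         predicted=True, num_sentences=5)
print(len(datas['h']), len(datas['b']), len(labels))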
from retrieval.fever_doc_db import FeverDocDB
from retrieval.sentence import FEVERSentenceRelatednessFormatter, FEVERSentenceFormatter, FEVERSentenceTextFormatter
from scripts.retrieval.sentence.mlp_train import RelatedLabelSchema

nlp = spacy.load("en", create_pipeline=wmd.WMD.create_spacy_pipeline)


def wmd_sim(claim, lines):
    cl = nlp(claim)
    scores = []
    for line in lines:
        scores.append(cl.similarity(nlp(line)))
    return scores


db = FeverDocDB("data/fever/fever.db")
idx = set(db.get_doc_ids())
jlr = JSONLineReader()
formatter = FEVERSentenceTextFormatter(idx, db, RelatedLabelSchema())
dev_ds = DataSet(file="data/fever-data/dev.jsonl", reader=jlr, formatter=formatter)
dev_ds.read()


def doc_lines(db, doc):
    lines = db.get_doc_lines(doc)
    return [line.split("\t")[1] if len(line.split("\t")) > 1 else ""
            for line in lines.split("\n")]
class Data(object):

    def __init__(self, embedding_path, train_file, dev_file, test_file,
                 fasttext_path, num_negatives, h_max_length, s_max_length,
                 random_seed, reserve_embed=False,
                 db_filepath="data/fever/fever.db"):
        self.random_seed = random_seed
        self.embedding_path = embedding_path
        self.train_file = train_file
        self.dev_file = dev_file
        self.test_file = test_file
        self.fasttext_path = fasttext_path
        self.num_negatives = num_negatives
        self.h_max_length = h_max_length
        self.s_max_length = s_max_length
        self.db_filepath = db_filepath
        self.db = FeverDocDB(self.db_filepath)
        self.reserve_embed = reserve_embed
        self.data_pipeline()

    def data_pipeline(self):
        np.random.seed(self.random_seed)
        random.seed(self.random_seed)
        # create directory to store sampling data and processed data
        # store_dir = "data.h{}.s{}.seed{}".format(self.h_max_length, self.s_max_length, self.random_seed)
        # self.absou_dir = os.path.join(base_dir, store_dir)
        os.makedirs(self.embedding_path, exist_ok=True)

        train_data_path = os.path.join(self.embedding_path, "train_sample.p")
        X_train = self.train_data_loader(train_data_path, self.train_file,
                                         num_samples=self.num_negatives)
        dev_datapath = os.path.join(self.embedding_path, "dev_data.p")
        devs, self.dev_labels = self.dev_data_loader(dev_datapath, self.dev_file)
        if self.test_file is None:
            self.test_file = self.dev_file
        test_datapath = os.path.join(self.embedding_path, "test_data.p")
        tests, self.test_location_indexes = self.predict_data_loader(
            test_datapath, self.test_file)

        words_dict_path = os.path.join(self.embedding_path, "words_dict.p")
        if os.path.exists(words_dict_path):
            with open(words_dict_path, "rb") as f:
                self.word_dict = pickle.load(f)
        else:
            self.word_dict = self.get_complete_words(words_dict_path, X_train,
                                                     devs, tests)
        self.iword_dict = self.inverse_word_dict(self.word_dict)

        train_indexes_path = os.path.join(self.embedding_path, "train_indexes.p")
        self.X_train_indexes = self.train_indexes_loader(train_indexes_path, X_train)
        dev_indexes_path = os.path.join(self.embedding_path, "dev_indexes.p")
        self.dev_indexes = self.predict_indexes_loader(dev_indexes_path, devs)
        test_indexes_path = os.path.join(self.embedding_path, "test_indexes.p")
        self.test_indexes = self.predict_indexes_loader(test_indexes_path, tests)

        embed_dict = self.load_fasttext(self.iword_dict)
        print("embed_dict size {}".format(len(embed_dict)))
        _PAD_ = len(self.word_dict)
        self.word_dict[_PAD_] = '[PAD]'
        self.iword_dict['[PAD]'] = _PAD_
        self.embed = self.embed_to_numpy(embed_dict)
        return self

    def get_whole_evidence(self, evidence_set, db):
        pos_sents = []
        for evidence in evidence_set:
            page = evidence[2]
            doc_lines = db.get_doc_lines(page)
            doc_lines = self.get_valid_texts(doc_lines, page)
            for doc_line in doc_lines:
                if doc_line[2] == evidence[3]:
                    pos_sents.append(doc_line[0])
        pos_sent = ' '.join(pos_sents)
        return pos_sent

    def get_valid_texts(self, lines, page):
        if not lines:
            return []
        doc_lines = [doc_line.split("\t")[1]
                     if len(doc_line.split("\t")[1]) > 1 else ""
                     for doc_line in lines.split("\n")]
        doc_lines = list(zip(doc_lines, [page] * len(doc_lines),
                             range(len(doc_lines))))
        return doc_lines

    def sampling(self, datapath, num_sample=1):
        jlr = JSONLineReader()
        X = []
        count = 0
        with open(datapath, "r") as f:
            lines = jlr.process(f)
            for line in tqdm(lines):
                count += 1
                if line['label'].upper() == "NOT ENOUGH INFO":
                    continue
                neg_sents = []
                claim = line['claim']
                pos_set = set()
                for evidence_set in line['evidence']:
                    pos_sent = self.get_whole_evidence(evidence_set, self.db)
                    if pos_sent in pos_set:
                        continue
                    pos_set.add(pos_sent)
                p_lines = []
                evidence_set = set([(evidence[2], evidence[3])
                                    for evidences in line['evidence']
                                    for evidence in evidences])
                pages = [page for page in line['predicted_pages']
                         if page is not None]
                for page in pages:
                    doc_lines = self.db.get_doc_lines(page)
                    p_lines.extend(self.get_valid_texts(doc_lines, page))
                for doc_line in p_lines:
                    if (doc_line[1], doc_line[2]) not in evidence_set:
                        neg_sents.append(doc_line[0])
                num_sampling = num_sample
                if len(neg_sents) < num_sampling:
                    num_sampling = len(neg_sents)
                if num_sampling == 0:
                    continue
                for pos_sent in pos_set:
                    samples = random.sample(neg_sents, num_sampling)
                    for sample in samples:
                        if not sample:
                            continue
                        X.append((claim, pos_sent, sample))
                        # if count % 1000 == 0:
                        #     print("claim:{} ,evidence :{} sample:{}".format(claim, pos_sent, sample))
        return X

    def predict_processing(self, datapath):
        jlr = JSONLineReader()
        devs = []
        all_indexes = []
        with open(datapath, "rb") as f:
            lines = jlr.process(f)
            for line in tqdm(lines):
                dev = []
                indexes = []
                pages = set()
                pages.update(page for page in line['predicted_pages'])
                # if len(pages) == 0:
                #     pages.add("Michael_Hutchence")
                claim = line['claim']
                p_lines = []
                for page in pages:
                    doc_lines = self.db.get_doc_lines(page)
                    if not doc_lines:
                        continue
                    p_lines.extend(self.get_valid_texts(doc_lines, page))
                for doc_line in p_lines:
                    if not doc_line[0]:
                        continue
                    dev.append((claim, doc_line[0]))
                    indexes.append((doc_line[1], doc_line[2]))
                # print(len(dev))
                if len(dev) == 0:
                    dev.append((claim, 'no evidence for this claim'))
                    indexes.append(('empty', 0))
                devs.append(dev)
                all_indexes.append(indexes)
        return devs, all_indexes

    def dev_processing(self, data_path):
        jlr = JSONLineReader()
        with open(data_path, "r") as f:
            lines = jlr.process(f)
            devs = []
            labels = []
            for line in tqdm(lines):
                dev = []
                label = []
                if line['label'].upper() == "NOT ENOUGH INFO":
                    continue
                evidence_set = set([(evidence[2], evidence[3])
                                    for evidences in line['evidence']
                                    for evidence in evidences])
                pages = [page for page in line['predicted_pages']
                         if page is not None]
                for page, num in evidence_set:
                    pages.append(page)
                pages = set(pages)
                p_lines = []
                for page in pages:
                    doc_lines = self.db.get_doc_lines(page)
                    p_lines.extend(self.get_valid_texts(doc_lines, page))
                for doc_line in p_lines:
                    if not doc_line[0]:
                        continue
                    dev.append((line['claim'], doc_line[0]))
                    if (doc_line[1], doc_line[2]) in evidence_set:
                        label.append(1)
                    else:
                        label.append(0)
                if len(dev) == 0 or len(label) == 0:
                    continue
                devs.append(dev)
                labels.append(label)
        return devs, labels

    def train_data_loader(self, train_sampled_path, data_path, num_samples=1):
        if os.path.exists(train_sampled_path):
            with open(train_sampled_path, 'rb') as f:
                X = pickle.load(f)
        else:
            X = self.sampling(data_path, num_samples)
            with open(train_sampled_path, 'wb') as f:
                pickle.dump(X, f)
        return X

    def dev_data_loader(self, dev_data_path, data_path):
        if os.path.exists(dev_data_path):
            with open(dev_data_path, "rb") as f:
                data = pickle.load(f)
            devs, labels = zip(*data)
        else:
            devs, labels = self.dev_processing(data_path)
            data = list(zip(devs, labels))
            with open(dev_data_path, 'wb') as f:
                pickle.dump(data, f)
        return devs, labels

    def predict_data_loader(self, predict_data_path, data_path):
        if os.path.exists(predict_data_path):
            print(predict_data_path)
            with open(predict_data_path, "rb") as f:
                data = pickle.load(f)
            devs, location_indexes = zip(*data)
        else:
            devs, location_indexes = self.predict_processing(data_path)
            data = list(zip(devs, location_indexes))
            with open(predict_data_path, 'wb') as f:
                pickle.dump(data, f)
        return devs, location_indexes

    def sent_processing(self, sent):
        sent = sent.replace('\n', '')
        sent = sent.replace('-', ' ')
        sent = sent.replace('/', ' ')
        return sent

    def nltk_tokenizer(self, sent):
        # sent = sent_processing(sent)
        return nltk.word_tokenize(sent)

    def get_words(self, claims, sents):
        words = set()
        for claim in claims:
            for idx, word in enumerate(self.nltk_tokenizer(claim)):
                if idx >= self.h_max_length:
                    break
                words.add(word.lower())
        for sent in sents:
            for idx, word in enumerate(self.nltk_tokenizer(sent)):
                if idx >= self.s_max_length:
                    break
                words.add(word.lower())
        return words

    def get_train_words(self, X):
        claims = set()
        sents = []
        for claim, pos, neg in X:
            claims.add(claim)
            sents.append(pos)
            sents.append(neg)
        train_words = self.get_words(claims, sents)
        print("training words processing done!")
        return train_words

    def get_predict_words(self, devs):
        dev_words = set()
        # nlp = StanfordCoreNLP(corenlp_path)
        for dev in tqdm(devs):
            claims = set()
            sents = []
            for pair in dev:
                claims.add(pair[0])
                sents.append(pair[1])
            dev_tokens = self.get_words(claims, sents)
            dev_words.update(dev_tokens)
        print("dev_words processing done!")
        return dev_words

    def word_2_dict(self, words):
        word_dict = {}
        for idx, word in enumerate(words):
            word = word.replace('\n', '')
            word = word.replace('\t', '')
            word_dict[idx] = word
        return word_dict

    def inverse_word_dict(self, word_dict):
        iword_dict = {}
        for key, word in word_dict.items():
            iword_dict[word] = key
        return iword_dict

    def load_fasttext(self, iword_dict):
        embed_dict = {}
        print(self.fasttext_path)
        model = FastText(self.fasttext_path)
        for word, key in iword_dict.items():
            embed_dict[key] = model[word]
            # print(embed_dict[key])
        print('Embedding size: %d' % (len(embed_dict)))
        return embed_dict

    def embed_to_numpy(self, embed_dict):
        feat_size = len(embed_dict[list(embed_dict.keys())[0]])
        if self.reserve_embed:
            embed = np.zeros((len(embed_dict) + 200000, feat_size), np.float32)
        else:
            embed = np.zeros((len(embed_dict), feat_size), np.float32)
        for k in embed_dict:
            embed[k] = np.asarray(embed_dict[k])
        print('Generate numpy embed:', embed.shape)
        return embed

    def sent_2_index(self, sent, word_dict, max_length):
        words = self.nltk_tokenizer(sent)
        word_indexes = []
        for idx, word in enumerate(words):
            if idx >= max_length:
                break
            word_indexes.append(word_dict[word.lower()])
        return word_indexes

    def train_data_indexes(self, X, word_dict):
        X_indexes = []
        print("start indexing words into integers")
        for claim, pos, neg in X:
            claim_indexes = self.sent_2_index(claim, word_dict, self.h_max_length)
            pos_indexes = self.sent_2_index(pos, word_dict, self.s_max_length)
            neg_indexes = self.sent_2_index(neg, word_dict, self.s_max_length)
            X_indexes.append((claim_indexes, pos_indexes, neg_indexes))
        print('Training data size:', len(X_indexes))
        return X_indexes

    def predict_data_indexes(self, data, word_dict):
        devs_indexes = []
        for dev in data:
            sent_indexes = []
            claim = dev[0][0]
            claim_index = self.sent_2_index(claim, word_dict, self.h_max_length)
            claim_indexes = [claim_index] * len(dev)
            for claim, sent in dev:
                sent_index = self.sent_2_index(sent, word_dict, self.s_max_length)
                sent_indexes.append(sent_index)
            assert len(sent_indexes) == len(claim_indexes)
            dev_indexes = list(zip(claim_indexes, sent_indexes))
            devs_indexes.append(dev_indexes)
        return devs_indexes

    def get_complete_words(self, words_dict_path, train_data, dev_data, test_data):
        all_words = set()
        train_words = self.get_train_words(train_data)
        all_words.update(train_words)
        dev_words = self.get_predict_words(dev_data)
        all_words.update(dev_words)
        test_words = self.get_predict_words(test_data)
        all_words.update(test_words)
        word_dict = self.word_2_dict(all_words)
        with open(words_dict_path, "wb") as f:
            pickle.dump(word_dict, f)
        return word_dict

    def train_indexes_loader(self, train_indexes_path, train_data):
        if os.path.exists(train_indexes_path):
            with open(train_indexes_path, "rb") as f:
                X_indexes = pickle.load(f)
        else:
            X_indexes = self.train_data_indexes(train_data, self.iword_dict)
            with open(train_indexes_path, "wb") as f:
                pickle.dump(X_indexes, f)
        return X_indexes

    def predict_indexes_loader(self, predict_indexes_path, predict_data):
        if os.path.exists(predict_indexes_path):
            with open(predict_indexes_path, "rb") as f:
                predicts_indexes = pickle.load(f)
        else:
            predicts_indexes = self.predict_data_indexes(predict_data,
                                                         self.iword_dict)
            with open(predict_indexes_path, "wb") as f:
                pickle.dump(predicts_indexes, f)
        return predicts_indexes

    def update_word_dict(self, test_path):
        self.new_test_datapath = os.path.join(self.embedding_path, "new_test_data.p")
        new_tests, self.test_location_indexes = self.predict_data_loader(
            self.new_test_datapath, test_path)
        new_test_words = self.get_predict_words(new_tests)
        print(len(self.iword_dict))
        print(len(self.word_dict))
        self.test_words_dict = {}
        for word in new_test_words:
            if word not in self.iword_dict:
                idx = len(self.word_dict)
                self.word_dict[idx] = word
                self.test_words_dict[idx] = word
        self.iword_dict = self.inverse_word_dict(self.word_dict)
        self.test_iword_dict = self.inverse_word_dict(self.test_words_dict)
        print("updated iword dict size: ", len(self.iword_dict))
        print("test iword dict size: ", len(self.test_iword_dict))

    def update_embeddings(self):
        test_embed_dict = self.load_fasttext(self.test_iword_dict)
        for k in test_embed_dict:
            self.embed[k] = np.asarray(test_embed_dict[k])
        print("updated embed size: ", self.embed.shape)

    def get_new_test_indexes(self, test_path):
        new_tests, self.new_test_location_indexes = self.predict_data_loader(
            self.new_test_datapath, test_path)
        new_tests_indexes_path = os.path.join(self.embedding_path,
                                              "new_test_indexes.p")
        self.new_tests_indexes = self.predict_indexes_loader(
            new_tests_indexes_path, new_tests)
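# Hedged usage sketch for the Data pipeline. All constructor arguments are
# illustrative (the fastText path in particular is an assumption); the
# pipeline caches sampled pairs, word dicts, and index files under
# embedding_path and exposes X_train_indexes / dev_indexes / embed.
data = Data(embedding_path="data/train_data",
            train_file="data/fever/train.p5.jsonl",
            dev_file="data/fever/dev.p5.jsonl",
            test_file=None,
            fasttext_path="data/fasttext/wiki.en.bin",
            num_negatives=2, h_max_length=20, s_max_length=40,
            random_seed=100)
print(data.embed.shape, len(data.X_train_indexes))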
parser.add_argument('--docs',
                    type=str,
                    default='data/tmp/doc_preds.jsonl',
                    help='file containing the predicted documents')
parser.add_argument('--sen_output',
                    type=str,
                    default='data/tmp/sen_preds.jsonl',
                    help='file to write predicted sentences to')
parser.add_argument('--train_ds',
                    type=str,
                    default='data/fever-data/train.jsonl',
                    help='training dataset')
parser.add_argument('--dev_ds',
                    type=str,
                    default='data/fever-data/dev.jsonl',
                    help='development dataset')
args = parser.parse_args()

out_dir = os.path.dirname(os.path.realpath(args.output))
os.makedirs(out_dir, exist_ok=True)

##### DOCUMENT RETRIEVAL IMPLEMENTATION #####
doc_retriever = DrqaDocRetriever(args.model)
#############################################

logger.info("Load DB")
db = FeverDocDB(args.db)
jlr = JSONLineReader()
formatter = FEVERGoldFormatter(set(), FEVERLabelSchema())

logger.info("Read datasets")
train_ds = DataSet(file=args.train_ds, reader=jlr, formatter=formatter)
dev_ds = DataSet(file=args.dev_ds, reader=jlr, formatter=formatter)
train_ds.read()
dev_ds.read()

logger.info("Generate vocab for TF-IDF")
tf = XTermFrequencyFeatureFunction(db)
tf.inform(train_ds.data, dev_ds.data)

##### SENTENCE RETRIEVAL IMPLEMENTATION #####
            evidence_pages.add(sample)
    if len(evidence_pages) < k:
        samples = random.sample(doc_ids, k - len(evidence_pages))
        for sample in samples:
            evidence_pages.add(sample)
    elif len(evidence_pages) >= k:
        # materialise the set first; random.sample no longer accepts sets
        samples = random.sample(list(evidence_pages), k)
        evidence_pages = set(samples)
    return evidence_pages


path = os.getcwd()
path = re.sub("/src.*", "", path)
db = FeverDocDB(os.path.join(path, "data/fever/fever.db"))
doc_ids = db.get_doc_ids()
doc_ids = doc_ids[1:]
jlr = JSONLineReader()

# with open(os.path.join(path, "data/fever-data/train.jsonl"), "r") as f:
#     with open(os.path.join(path, 'data/fever/train.p5.jsonl'), "w") as f2:
#         lines = f.readlines()
#         for line in lines:
#             js = json.loads(line)
#             pages = sample_doc(js, doc_ids, k=5)
#             js['predicted_pages'] = list(pages)
#             f2.write(json.dumps(js) + "\n")

with open(os.path.join(path, "data/fever-data/dev.jsonl"), "r") as f:
    with open(os.path.join(path, "data/fever/dev.p5.jsonl"), "w") as f2:
        lines = f.readlines()
def train_sample(db_filename, lines, list_size=15):
    db = FeverDocDB(db_filename)
    claims = []
    list_sents = []
    labels = []
    count = 0
    for idx, line in tqdm(enumerate(lines)):
        if line['label'].upper() == "NOT ENOUGH INFO":
            continue
        claim = line['claim']
        sents = []
        label = []
        pos_set = set()
        neg_sents = []
        for evidence_group in line['evidence']:
            pos_sent = get_whole_evidence(evidence_group, db)
            if pos_sent in pos_set:
                continue
            pos_set.add(pos_sent)
        p_lines = []
        evidence_set = set([(evidence[2], evidence[3])
                            for evidences in line['evidence']
                            for evidence in evidences])
        pages = [page[0] for page in line['predicted_pages']
                 if page[0] is not None]
        for page, num in evidence_set:
            pages.append(page)
        pages = set(pages)
        for page in pages:
            doc_lines = db.get_doc_lines(page)
            p_lines.extend(get_valid_texts(doc_lines, page))
        for doc_line in p_lines:
            if not doc_line[0]:
                continue
            if (doc_line[1], doc_line[2]) not in evidence_set:
                neg_sents.append(doc_line[0])
        pos_set = list(pos_set)
        if len(pos_set) > 5:
            pos_set = random.sample(pos_set, 5)
        if len(neg_sents) < (list_size - len(pos_set)):
            count += 1
            continue
        samples = random.sample(neg_sents, list_size - len(pos_set))
        pos_indexes_sample = random.sample(range(list_size), len(pos_set))
        neg_index = 0
        pos_index = 0
        for i in range(list_size):
            if i in pos_indexes_sample:
                sents.append(pos_set[pos_index])
                label.append(1 / len(pos_set))
                pos_index += 1
            else:
                sents.append(samples[neg_index])
                label.append(0.0)
                neg_index += 1
        if idx % 1000 == 0:
            print(claim)
            print(sents)
            print(label)
        # append the claim only once it survives sampling, so the three
        # lists stay aligned (the original appended it before the skip above)
        claims.append(claim)
        list_sents.append(sents)
        labels.append(label)
    print(count)
    return claims, list_sents, labels
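# Hedged usage sketch for train_sample: each claim gets a fixed-size list of
# list_size candidate sentences with a probability-style label vector
# (1/len(pos) on gold positions, 0.0 elsewhere). Paths are illustrative.
jlr = JSONLineReader()
with open("data/fever/train.p5.jsonl") as f:
    lines = jlr.process(f)
claims, list_sents, labels = train_sample("data/fever/fever.db", lines,
                                          list_size=15)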
class ELMO_Data(object):

    def __init__(self, base_path, train_file, dev_file, test_file,
                 num_negatives, h_max_length, s_max_length, random_seed=100,
                 db_filepath="data/fever/fever.db"):
        self.random_seed = random_seed
        self.base_path = base_path
        self.train_file = train_file
        self.dev_file = dev_file
        self.test_file = test_file
        self.num_negatives = num_negatives
        self.h_max_length = h_max_length
        self.s_max_length = s_max_length
        self.db_filepath = db_filepath
        self.db = FeverDocDB(self.db_filepath)
        self.data_pipeline()

    def data_pipeline(self):
        np.random.seed(self.random_seed)
        random.seed(self.random_seed)
        # create directory to store sampling data and processed data
        base_dir = os.path.join(self.base_path, "data/train_data")
        store_dir = "data.h{}.s{}.seed{}".format(self.h_max_length,
                                                 self.s_max_length,
                                                 self.random_seed)
        absou_dir = os.path.join(base_dir, store_dir)
        if not os.path.exists(absou_dir):
            os.makedirs(absou_dir)

        train_data_path = os.path.join(absou_dir, "train_sample.p")
        X_train = self.train_data_loader(train_data_path, self.train_file,
                                         num_samples=self.num_negatives)
        dev_datapath = os.path.join(absou_dir, "dev_data.p")
        devs, self.dev_labels = self.dev_data_loader(dev_datapath, self.dev_file)
        test_datapath = os.path.join(absou_dir, "test_data.p")
        tests, self.test_location_indexes = self.predict_data_loader(
            test_datapath, self.test_file)

        self.X_train = self.train_data_tokenizer(X_train)
        self.devs = self.predict_data_tokenizer(devs)
        self.tests = self.predict_data_tokenizer(tests)
        return self

    def get_whole_evidence(self, evidence_set, db):
        pos_sents = []
        for evidence in evidence_set:
            page = evidence[2]
            doc_lines = db.get_doc_lines(page)
            doc_lines = self.get_valid_texts(doc_lines, page)
            for doc_line in doc_lines:
                if doc_line[2] == evidence[3]:
                    pos_sents.append(doc_line[0])
        pos_sent = ' '.join(pos_sents)
        return pos_sent

    def get_valid_texts(self, lines, page):
        if not lines:
            return []
        doc_lines = [doc_line.split("\t")[1]
                     if len(doc_line.split("\t")[1]) > 1 else ""
                     for doc_line in lines.split("\n")]
        doc_lines = list(zip(doc_lines, [page] * len(doc_lines),
                             range(len(doc_lines))))
        return doc_lines

    def sampling(self, datapath, num_sample=1):
        jlr = JSONLineReader()
        X = []
        count = 0
        with open(datapath, "r") as f:
            lines = jlr.process(f)
            for line in tqdm(lines):
                count += 1
                if line['label'].upper() == "NOT ENOUGH INFO":
                    continue
                neg_sents = []
                claim = line['claim']
                pos_set = set()
                for evidence_set in line['evidence']:
                    pos_sent = self.get_whole_evidence(evidence_set, self.db)
                    if pos_sent in pos_set:
                        continue
                    pos_set.add(pos_sent)
                p_lines = []
                evidence_set = set([(evidence[2], evidence[3])
                                    for evidences in line['evidence']
                                    for evidence in evidences])
                pages = [page for page in line['predicted_pages']
                         if page is not None]
                for page in pages:
                    doc_lines = self.db.get_doc_lines(page)
                    p_lines.extend(self.get_valid_texts(doc_lines, page))
                for doc_line in p_lines:
                    if (doc_line[1], doc_line[2]) not in evidence_set:
                        neg_sents.append(doc_line[0])
                num_sampling = num_sample
                if len(neg_sents) < num_sampling:
                    num_sampling = len(neg_sents)
                if num_sampling == 0:
                    continue
                for pos_sent in pos_set:
                    samples = random.sample(neg_sents, num_sampling)
                    for sample in samples:
                        if not sample:
                            continue
                        X.append((claim, pos_sent, sample))
                        if count % 1000 == 0:
                            print("claim:{} ,evidence :{} sample:{}".format(
                                claim, pos_sent, sample))
        return X

    def predict_processing(self, datapath):
        jlr = JSONLineReader()
        devs = []
        all_indexes = []
        with open(datapath, "rb") as f:
            lines = jlr.process(f)
            for line in tqdm(lines):
                dev = []
                indexes = []
                pages = set()
                pages.update(page for page in line['predicted_pages'])
                # if len(pages) == 0:
                #     pages.add("Michael_Hutchence")
                claim = line['claim']
                p_lines = []
                for page in pages:
                    doc_lines = self.db.get_doc_lines(page)
                    if not doc_lines:
                        continue
                    p_lines.extend(self.get_valid_texts(doc_lines, page))
                for doc_line in p_lines:
                    if not doc_line[0]:
                        continue
                    dev.append((claim, doc_line[0]))
                    indexes.append((doc_line[1], doc_line[2]))
                # print(len(dev))
                if len(dev) == 0:
                    dev.append((claim, 'no evidence for this claim'))
                    indexes.append(('empty', 0))
                devs.append(dev)
                all_indexes.append(indexes)
        return devs, all_indexes

    def dev_processing(self, data_path):
        jlr = JSONLineReader()
        with open(data_path, "r") as f:
            lines = jlr.process(f)
            devs = []
            labels = []
            for line in tqdm(lines):
                dev = []
                label = []
                if line['label'].upper() == "NOT ENOUGH INFO":
                    continue
                evidence_set = set([(evidence[2], evidence[3])
                                    for evidences in line['evidence']
                                    for evidence in evidences])
                pages = [page for page in line['predicted_pages']
                         if page is not None]
                for page, num in evidence_set:
                    pages.append(page)
                pages = set(pages)
                p_lines = []
                for page in pages:
                    doc_lines = self.db.get_doc_lines(page)
                    p_lines.extend(self.get_valid_texts(doc_lines, page))
                for doc_line in p_lines:
                    if not doc_line[0]:
                        continue
                    dev.append((line['claim'], doc_line[0]))
                    if (doc_line[1], doc_line[2]) in evidence_set:
                        label.append(1)
                    else:
                        label.append(0)
                if len(dev) == 0 or len(label) == 0:
                    continue
                devs.append(dev)
                labels.append(label)
        return devs, labels

    def train_data_loader(self, train_sampled_path, data_path, num_samples=1):
        if os.path.exists(train_sampled_path):
            with open(train_sampled_path, 'rb') as f:
                X = pickle.load(f)
        else:
            X = self.sampling(data_path, num_samples)
            with open(train_sampled_path, 'wb') as f:
                pickle.dump(X, f)
        return X

    def dev_data_loader(self, dev_data_path, data_path):
        if os.path.exists(dev_data_path):
            with open(dev_data_path, "rb") as f:
                data = pickle.load(f)
            devs, labels = zip(*data)
        else:
            devs, labels = self.dev_processing(data_path)
            # materialise the pairs; a bare zip object cannot be pickled
            data = list(zip(devs, labels))
            with open(dev_data_path, 'wb') as f:
                pickle.dump(data, f)
        return devs, labels

    def predict_data_loader(self, predict_data_path, data_path):
        if os.path.exists(predict_data_path):
            print(predict_data_path)
            with open(predict_data_path, "rb") as f:
                data = pickle.load(f)
            devs, location_indexes = zip(*data)
        else:
            devs, location_indexes = self.predict_processing(data_path)
            data = list(zip(devs, location_indexes))
            with open(predict_data_path, 'wb') as f:
                pickle.dump(data, f)
        return devs, location_indexes

    def sent_processing(self, sent):
        sent = sent.replace('\n', '')
        sent = sent.replace('-', ' ')
        sent = sent.replace('/', ' ')
        return sent

    def nltk_tokenizer(self, sent):
        # sent = sent_processing(sent)
        return nltk.word_tokenize(sent)

    def proess_sents(self, sents, max_length):
        tokenized_sents = []
        sents_lengths = []
        for sent in sents:
            words = [word.lower() for word in nltk.word_tokenize(sent)]
            # pad/truncate against the max_length argument (the original used
            # self.h_max_length here even for sentence-side calls)
            if len(words) < max_length:
                sents_lengths.append(len(words))
                words.extend([""] * (max_length - len(words)))
                tokenized_sents.append(words)
            else:
                sents_lengths.append(max_length)
                words = words[:max_length]
                tokenized_sents.append(words)
        return tokenized_sents, sents_lengths

    def train_data_tokenizer(self, X_train):
        claims = [claim for claim, _, _ in X_train]
        pos_sents = [pos_sent for _, pos_sent, _ in X_train]
        neg_sents = [neg_sent for _, _, neg_sent in X_train]
        tokenized_claims, claims_lengths = self.proess_sents(
            claims, self.h_max_length)
        tokenized_pos_sents, pos_sents_lengths = self.proess_sents(
            pos_sents, self.s_max_length)
        tokenized_neg_sents, neg_sents_lengths = self.proess_sents(
            neg_sents, self.s_max_length)
        new_claims = list(zip(tokenized_claims, claims_lengths))
        new_pos_sents = list(zip(tokenized_pos_sents, pos_sents_lengths))
        new_neg_sents = list(zip(tokenized_neg_sents, neg_sents_lengths))
        return list(zip(new_claims, new_pos_sents, new_neg_sents))

    def predict_data_tokenizer(self, dataset):
        predict_data = []
        for data in dataset:
            claims = [claim for claim, _ in data]
            sents = [sent for _, sent in data]
            tokenized_claims, claims_lengths = self.proess_sents(
                claims, self.h_max_length)
            tokenized_sents, sents_lengths = self.proess_sents(
                sents, self.s_max_length)
            new_claims = list(zip(tokenized_claims, claims_lengths))
            new_sents = list(zip(tokenized_sents, sents_lengths))
            tokenized_data = list(zip(new_claims, new_sents))
            predict_data.append(tokenized_data)
        return predict_data
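# Hedged usage sketch for ELMO_Data: unlike Data, it keeps padded token lists
# (plus true lengths) instead of vocabulary indices, ready for an ELMo
# encoder. All paths and hyperparameters below are illustrative.
elmo_data = ELMO_Data(base_path=".",
                      train_file="data/fever/train.p5.jsonl",
                      dev_file="data/fever/dev.p5.jsonl",
                      test_file="data/fever/dev.p5.jsonl",
                      num_negatives=2, h_max_length=20, s_max_length=40)
(claim_tokens, claim_len), (pos_tokens, pos_len), (neg_tokens, neg_len) = \
    elmo_data.X_train[0]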
def sample_ranking_train(db_filename, datapath, k=5, num_sample=2):
    """
    :param db_filename: path to the wiki-pages database
    :param datapath: path to the FEVER train set with predicted pages
    :param k: number of top-ranked sentences from which to select negative examples
    :param num_sample: number of negative examples to sample
    :return: X_claim, X_sents: claim and sentence pairs
             y: whether the sentence is in the evidence set
    """
    db = FeverDocDB(db_filename)
    jlr = JSONLineReader()
    X_claim = []
    X_sents = []
    y = []
    count = 0
    with open(datapath, "r") as f:
        lines = jlr.process(f)
        # lines = lines[:1000]
        for line in tqdm(lines):
            num_sampling = num_sample
            if line['label'].upper() == "NOT ENOUGH INFO":
                continue
            p_lines = []
            neg_sents = []
            claim = line['claim']
            evidence_set = set([(evidence[2], evidence[3])
                                for evidences in line['evidence']
                                for evidence in evidences])
            sampled_sents_idx = [(id, number)
                                 for id, number in line['predicted_sentences']]
            sampled_sents_idx = sampled_sents_idx[0:k + 5]
            sampled_sents_idx = [index for index in sampled_sents_idx
                                 if index not in evidence_set]
            pages = set()
            pages.update(evidence[0] for evidence in line['predicted_pages'])
            pages.update(evidence[0] for evidence in evidence_set)
            for page in pages:
                doc_lines = db.get_doc_lines(page)
                if not doc_lines:
                    continue
                doc_lines = [doc_line.split("\t")[1]
                             if len(doc_line.split("\t")[1]) > 1 else ""
                             for doc_line in doc_lines.split("\n")]
                p_lines.extend(zip(doc_lines, [page] * len(doc_lines),
                                   range(len(doc_lines))))
            for doc_line in p_lines:
                if not doc_line[0]:
                    continue
                elif (doc_line[1], doc_line[2]) in sampled_sents_idx:
                    neg_sents.append(doc_line[0])
                elif (doc_line[1], doc_line[2]) in evidence_set:
                    X_claim.append(claim)
                    X_sents.append(doc_line[0])
                    y.append(1)
            if len(sampled_sents_idx) < num_sample:
                count += 1
                num_sampling = len(sampled_sents_idx)
            samples = random.sample(neg_sents, num_sampling)
            for neg_example in samples:
                X_claim.append(claim)
                X_sents.append(neg_example)
                y.append(0)
    print(count)
    return X_claim, X_sents, y
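# Hedged usage sketch for sample_ranking_train: positives are gold evidence
# sentences, negatives are drawn from the top-ranked non-evidence candidates.
# The input must carry 'predicted_sentences'; the path is illustrative.
X_claim, X_sents, y = sample_ranking_train("data/fever/fever.db",
                                           "data/fever/train.p5.s5.jsonl",
                                           k=5, num_sample=2)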
class Doc_Retrieval:

    def __init__(self, database_path, add_claim=False, k_wiki_results=None):
        self.db = FeverDocDB(database_path)
        self.add_claim = add_claim
        self.k_wiki_results = k_wiki_results
        self.proter_stemm = nltk.PorterStemmer()
        self.tokenizer = nltk.word_tokenize
        # print("Going to download")
        # get_spacy_model('en_core_web_lg', True, False, False)
        self.predictor = Predictor.from_path(
            "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo-constituency-parser-2018.03.14.tar.gz")
        # print("Downloaded")

    def get_NP(self, tree, nps):
        if isinstance(tree, dict):
            if "children" not in tree:
                if tree['nodeType'] == "NP":
                    # print(tree['word'])
                    # print(tree)
                    nps.append(tree['word'])
            elif "children" in tree:
                if tree['nodeType'] == "NP":
                    # print(tree['word'])
                    nps.append(tree['word'])
                    self.get_NP(tree['children'], nps)
                else:
                    self.get_NP(tree['children'], nps)
        elif isinstance(tree, list):
            for sub_tree in tree:
                self.get_NP(sub_tree, nps)
        return nps

    def get_subjects(self, tree):
        subject_words = []
        subjects = []
        for subtree in tree['children']:
            if subtree['nodeType'] == "VP" or subtree['nodeType'] == 'S' or subtree['nodeType'] == 'VBZ':
                subjects.append(' '.join(subject_words))
                subject_words.append(subtree['word'])
            else:
                subject_words.append(subtree['word'])
        return subjects

    def get_noun_phrases(self, line):
        claim = line['claim']
        tokens = self.predictor.predict(claim)
        nps = []
        tree = tokens['hierplane_tree']['root']
        noun_phrases = self.get_NP(tree, nps)
        subjects = self.get_subjects(tree)
        for subject in subjects:
            if len(subject) > 0:
                noun_phrases.append(subject)
        if self.add_claim:
            noun_phrases.append(claim)
        return list(set(noun_phrases))

    def get_doc_for_claim(self, noun_phrases):
        predicted_pages = []
        for np in noun_phrases:
            if len(np) > 300:
                continue
            i = 1
            while i < 12:
                try:
                    docs = wikipedia.search(np)
                    if self.k_wiki_results is not None:
                        predicted_pages.extend(docs[:self.k_wiki_results])
                    else:
                        predicted_pages.extend(docs)
                except (ConnectionResetError, ConnectionError,
                        ConnectionAbortedError, ConnectionRefusedError):
                    print("Connection reset error received! Trial #" + str(i))
                    time.sleep(600 * i)
                    i += 1
                else:
                    break
            # sleep_num = random.uniform(0.1, 0.7)
            # time.sleep(sleep_num)
        predicted_pages = set(predicted_pages)
        processed_pages = []
        for page in predicted_pages:
            page = page.replace(" ", "_")
            page = page.replace("(", "-LRB-")
            page = page.replace(")", "-RRB-")
            page = page.replace(":", "-COLON-")
            processed_pages.append(page)
        return processed_pages

    def np_conc(self, noun_phrases):
        noun_phrases = set(noun_phrases)
        predicted_pages = []
        for np in noun_phrases:
            page = np.replace('( ', '-LRB-')
            page = page.replace(' )', '-RRB-')
            page = page.replace(' - ', '-')
            page = page.replace(' :', '-COLON-')
            page = page.replace(' ,', ',')
            page = page.replace(" 's", "'s")
            page = page.replace(' ', '_')
            if len(page) < 1:
                continue
            doc_lines = self.db.get_doc_lines(page)
            if doc_lines is not None:
                predicted_pages.append(page)
        return predicted_pages

    def exact_match(self, line):
        noun_phrases = self.get_noun_phrases(line)
        wiki_results = self.get_doc_for_claim(noun_phrases)
        wiki_results = list(set(wiki_results))
        claim = normalize(line['claim'])
        claim = claim.replace(".", "")
        claim = claim.replace("-", " ")
        words = [self.proter_stemm.stem(word.lower())
                 for word in self.tokenizer(claim)]
        words = set(words)
        predicted_pages = self.np_conc(noun_phrases)
        for page in wiki_results:
            page = normalize(page)
            processed_page = re.sub("-LRB-.*?-RRB-", "", page)
            processed_page = re.sub("_", " ", processed_page)
            processed_page = re.sub("-COLON-", ":", processed_page)
            processed_page = processed_page.replace("-", " ")
            processed_page = processed_page.replace("–", " ")
            processed_page = processed_page.replace(".", "")
            page_words = [self.proter_stemm.stem(word.lower())
                          for word in self.tokenizer(processed_page)
                          if len(word) > 0]
            if all([item in words for item in page_words]):
                if ':' in page:
                    page = page.replace(":", "-COLON-")
                predicted_pages.append(page)
        predicted_pages = list(set(predicted_pages))
        # print("claim: ", claim)
        # print("nps: ", noun_phrases)
        # print("wiki_results: ", wiki_results)
        # print("predicted_pages: ", predicted_pages)
        # print("evidence:", line['evidence'])
        return noun_phrases, wiki_results, predicted_pages
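# Hedged usage sketch for Doc_Retrieval: constituency-parse the claim for noun
# phrases, query Wikipedia for candidate pages, and keep titles whose stemmed
# words all appear in the claim. The db path and claim are illustrative.
retriever = Doc_Retrieval("data/fever/fever.db", add_claim=True,
                          k_wiki_results=7)
nps, wiki_results, pages = retriever.exact_match(
    {'claim': "Nikolaj Coster-Waldau worked with the Fox Broadcasting Company."})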
                    type=str,
                    default='simple',
                    help=("String option specifying tokenizer type to use "
                          "(e.g. 'corenlp')"))
parser.add_argument('--num-workers',
                    type=int,
                    default=None,
                    help='Number of CPU processes (for tokenizing, etc)')
args = parser.parse_args()

doc_freqs = None
if args.use_precomputed:
    _, metadata = utils.load_sparse_csr(args.model)
    doc_freqs = metadata['doc_freqs'].squeeze()

db = FeverDocDB("data/fever/fever.db")
jlr = JSONLineReader()
formatter = FEVERGoldFormatter(set(), FEVERLabelSchema())

with open(args.in_file, "r") as f, open(
        "data/fever/{0}.sentences.{3}.p{1}.s{2}.jsonl".format(
            args.split, args.max_page, args.max_sent,
            "precomputed" if args.use_precomputed else "not_precomputed"),
        "w+") as out_file:
    lines = jlr.process(f)
    # lines = tf_idf_claims_batch(lines)
    for line in tqdm(lines):
        line = tf_idf_claim(line)
def eval_model(db: FeverDocDB, args) -> Model:
    archive = load_archive(args.archive_file, cuda_device=args.cuda_device,
                           overrides=args.overrides)
    config = archive.config
    ds_params = config["dataset_reader"]
    model = archive.model
    model.eval()

    reader = FEVERReader(db,
                         sentence_level=ds_params.pop("sentence_level", False),
                         wiki_tokenizer=Tokenizer.from_params(
                             ds_params.pop('wiki_tokenizer', {})),
                         claim_tokenizer=Tokenizer.from_params(
                             ds_params.pop('claim_tokenizer', {})),
                         token_indexers=TokenIndexer.dict_from_params(
                             ds_params.pop('token_indexers', {})))

    while True:
        claim = input("enter claim (or q to quit) >>")
        if claim.lower() == "q":
            break

        ranker = retriever.get_class('tfidf')(tfidf_path=args.model)

        p_lines = []
        pages, _ = ranker.closest_docs(claim, 5)
        for page in pages:
            lines = db.get_doc_lines(page)
            lines = [line.split("\t")[1]
                     if len(line.split("\t")[1]) > 1 else ""
                     for line in lines.split("\n")]
            p_lines.extend(zip(lines, [page] * len(lines), range(len(lines))))

        scores = tf_idf_sim(claim, [pl[0] for pl in p_lines])
        scores = list(zip(scores,
                          [pl[1] for pl in p_lines],
                          [pl[2] for pl in p_lines],
                          [pl[0] for pl in p_lines]))
        scores = list(filter(lambda score: len(score[3].strip()), scores))
        sentences_l = list(sorted(scores, reverse=True, key=lambda elem: elem[0]))

        sentences = [s[3] for s in sentences_l[:5]]
        evidence = " ".join(sentences)

        print("Best pages: {0}".format(repr(pages)))
        print("Evidence:")
        for idx, sentence in enumerate(sentences_l[:5]):
            print("{0}\t{1}\t\t{2}\t{3}".format(idx + 1, sentence[0],
                                                sentence[1], sentence[3]))

        item = reader.text_to_instance(evidence, claim)
        prediction = model.forward_on_instance(item, args.cuda_device)
        cls = model.vocab._index_to_token["labels"][np.argmax(prediction["label_probs"])]
        print("PREDICTED: {0}".format(cls))
        print()