def prediction_processing(dataset_path, predictions):
    """
    Process the predicted (doc_id, sent_id) pairs into the score-system desired format.

    :param dataset_path: path to a JSONL dataset; each line dict is annotated in place
    :param predictions: per-line list of (doc_id, sent_id) pairs, aligned by line index
    :return: list of annotated line dicts
    """
    jsr = JSONLineReader()
    with open(dataset_path, "r") as f:
        lines = jsr.process(f)

    final_predictions = []
    for idx, line in enumerate(lines):
        if line['predicted_pages']:
            line['predicted_evidence'] = [
                [pred[0], int(pred[1])] for pred in predictions[idx]
            ]
        else:
            line['predicted_evidence'] = []
        # Fixed placeholder label; the scorer only needs the evidence here.
        line['predicted_label'] = "REFUTES"
        final_predictions.append(line)
    return final_predictions
def number_feature(data_set_path: str, db_path: str, max_sent_num: int):
    """
    Build a [num_claims, max_sent_num, 3] int32 feature tensor comparing numbers
    found in each claim against numbers found in each predicted evidence sentence.

    The three feature slots per (claim, sentence) pair come from
    _interprete_num_result(has_num, has_identical_num, has_different_num).

    :param data_set_path: JSONL dataset with 'claim' and 'predicted_evidence' fields
    :param db_path: path to the FEVER document database
    :param max_sent_num: evidence sentences beyond this index are ignored
    :return: numpy int32 array of shape [len(lines), max_sent_num, 3]
    """
    from common.dataset.reader import JSONLineReader
    db = FeverDocDB(db_path)
    jlr = JSONLineReader()
    lines = jlr.read(data_set_path)
    num_feat = np.zeros([len(lines), max_sent_num, 3], dtype=np.int32)
    for i, line in enumerate(lines):
        # All numeric tokens appearing in the claim.
        claim_nums = {
            float(tok) for tok in tokenize(line['claim']) if is_token_numeric(tok)
        }
        for j, evidence in enumerate(line['predicted_evidence']):
            if j >= max_sent_num:
                break
            page, line_num = evidence[-2], evidence[-1]
            evidence_text = evidence_num_to_text(db, page, line_num)
            evidence_nums = [
                float(tok) for tok in tokenize(evidence_text) if is_token_numeric(tok)
            ]
            has_num = len(evidence_nums) > 0
            has_identical = any(n in claim_nums for n in evidence_nums)
            has_different = any(n not in claim_nums for n in evidence_nums)
            (num_feat[i][j][0], num_feat[i][j][1],
             num_feat[i][j][2]) = _interprete_num_result(has_num, has_identical,
                                                         has_different)
    return num_feat
def sampling(self, datapath, num_sample=1):
    """
    Build (claim, positive_sentence, negative_sentence) training triples.

    Positives are whole evidence sets; negatives are sampled from sentences of
    the predicted pages that are not part of any gold evidence. Lines labeled
    NOT ENOUGH INFO are skipped.

    :param datapath: JSONL file with 'label', 'claim', 'evidence', 'predicted_pages'
    :param num_sample: negatives to draw per positive (capped by availability)
    :return: list of (claim, pos_sent, neg_sent) tuples
    """
    jlr = JSONLineReader()
    triples = []
    count = 0
    with open(datapath, "r") as f:
        lines = jlr.process(f)
    for line in tqdm(lines):
        count += 1
        if line['label'].upper() == "NOT ENOUGH INFO":
            continue
        claim = line['claim']
        # Deduplicated positive sentences, one per gold evidence set.
        pos_set = set()
        for evidence_set in line['evidence']:
            pos_sent = self.get_whole_evidence(evidence_set, self.db)
            if pos_sent not in pos_set:
                pos_set.add(pos_sent)
        # (doc_id, sent_id) pairs of all gold evidence sentences.
        gold_ids = set((evidence[2], evidence[3])
                       for evidences in line['evidence']
                       for evidence in evidences)
        pages = [page for page in line['predicted_pages'] if page is not None]
        p_lines = []
        for page in pages:
            doc_lines = self.db.get_doc_lines(page)
            p_lines.extend(self.get_valid_texts(doc_lines, page))
        neg_sents = [dl[0] for dl in p_lines if (dl[1], dl[2]) not in gold_ids]
        num_sampling = min(num_sample, len(neg_sents))
        if num_sampling == 0:
            continue
        for pos_sent in pos_set:
            for sample in random.sample(neg_sents, num_sampling):
                if not sample:
                    continue
                triples.append((claim, pos_sent, sample))
                if count % 1000 == 0:
                    print(
                        "claim:{} ,evidence :{} sample:{}".format(
                            claim, pos_sent, sample))
    return triples
def generate_submission(_predictions, _ids, test_set_path, submission_path):
    """
    Generate submission file for shared task: http://fever.ai/task.html

    Fix: the original scanned the whole (id, prediction) list for every test
    line (O(n*m)); predictions are now indexed in a dict, keeping the original
    first-match semantics for duplicate ids.

    :param _predictions: predicted label indices
    :param _ids: claim ids aligned with _predictions
    :param test_set_path: JSONL test set with 'id' and 'predicted_evidence'
    :param submission_path: output JSONL path (parent dirs created if missing)
    :return: None (writes the submission file)
    """
    from common.dataset.reader import JSONLineReader
    from tqdm import tqdm
    import json
    # setdefault keeps the FIRST prediction for an id, like the original scan.
    prediction_by_id = {}
    for _pid, _plabel in zip(_ids, _predictions):
        prediction_by_id.setdefault(_pid, _plabel)
    jlr = JSONLineReader()
    json_lines = jlr.read(test_set_path)
    os.makedirs(os.path.dirname(os.path.abspath(submission_path)), exist_ok=True)
    with open(submission_path, 'w') as f:
        for line in tqdm(json_lines):
            for i, evidence in enumerate(line['predicted_evidence']):
                line['predicted_evidence'][i][0] = normalize(evidence[0])
            _id = line['id']
            if _id in prediction_by_id:
                _pred_label = prediction_2_label(prediction_by_id[_id])
            else:
                # Fallback label index 2 when no prediction exists for this id.
                _pred_label = prediction_2_label(2)
            obj = {"id": _id,
                   "predicted_label": _pred_label,
                   "predicted_evidence": line['predicted_evidence']}
            f.write(json.dumps(obj))
            f.write('\n')
def main(db_file, k_wiki, in_file, out_file, add_claim=True, parallel=True):
    """
    Run document retrieval over every line of in_file and write the results to
    out_file, resuming from (and always re-saving) a pickled .progress file.

    :param db_file: path to the document database
    :param k_wiki: number of wiki results to keep per claim
    :param in_file: input JSONL, relative to the current working directory
    :param out_file: output JSONL, relative to the current working directory
    :param add_claim: forwarded to Doc_Retrieval
    :param parallel: when True, map lines through a 4-worker thread pool
    """
    method = Doc_Retrieval(database_path=db_file, add_claim=add_claim,
                           k_wiki_results=k_wiki)
    path = os.getcwd()
    jlr = JSONLineReader()
    lines = jlr.read(os.path.join(path, in_file))
    progress_path = os.path.join(path, in_file + ".progress")
    if os.path.isfile(progress_path):
        import pickle
        with open(progress_path, 'rb') as fp:
            progress = pickle.load(fp)
        print(progress_path + " exists. Load it as progress file.")
    else:
        progress = dict()
    processed = dict()
    try:
        with ThreadPool(processes=4 if parallel else None) as p:
            mapper = get_map_function(parallel, p)
            worker = lambda l: process_line_with_progress(method, l, progress)
            for line in tqdm(mapper(worker, lines), total=len(lines)):
                processed[line['id']] = line
                progress[line['id']] = line
        # Write results in the original input order.
        with open(os.path.join(path, out_file), "w+") as out:
            for line in lines:
                out.write(json.dumps(processed[line['id']]) + "\n")
    finally:
        # Persist progress even on interruption so a rerun can resume.
        import pickle
        with open(progress_path, 'wb') as fp:
            pickle.dump(progress, fp, pickle.HIGHEST_PROTOCOL)
def load_feature_by_data_set(data_set_path: str, feature_path: str,
                             max_sent_num: int):
    """
    Load precomputed claim/evidence feature vectors for a dataset.

    Reads 'feature.p' (the feature matrix) and 'data_idx_map.p' (key -> row
    index) from feature_path, then gathers one claim vector per line and up to
    max_sent_num evidence vectors (zero-padded when fewer are present).

    :param data_set_path: JSONL dataset with 'id' and 'predicted_evidence'
    :param feature_path: directory holding feature.p and data_idx_map.p
    :param max_sent_num: truncate / pad evidence vectors to this length
    :return: (claim_features [n, d], evidence_features [n, max_sent_num, d])
    """
    from common.dataset.reader import JSONLineReader
    import pickle
    import os
    with open(os.path.join(feature_path, 'feature.p'), 'rb') as f:
        features = pickle.load(f)
    with open(os.path.join(feature_path, 'data_idx_map.p'), 'rb') as f:
        data_idx_map = pickle.load(f)
    lines = JSONLineReader().read(data_set_path)
    feature_dim = features.shape[1]
    padding = np.zeros([feature_dim], np.float32)
    claim_features = []
    evidence_features = []
    for line in lines:
        claim_key = _concat_sent(CLAIM, line['id'])
        claim_features.append(features[data_idx_map[claim_key]])
        per_claim = []
        for sent in line['predicted_evidence']:
            page, line_num = sent[-2], sent[-1]
            per_claim.append(features[data_idx_map[_concat_sent(page, line_num)]])
        if len(per_claim) > max_sent_num:
            evidence_features.append(per_claim[:max_sent_num])
        else:
            per_claim.extend([padding] * (max_sent_num - len(per_claim)))
            evidence_features.append(per_claim)
    return np.asarray(claim_features, np.float32), np.asarray(
        evidence_features, np.float32)
def train_dev_split(train_datapath, split_rate):
    """
    Shuffle a JSONL dataset and split it into train/dev partitions.

    :param train_datapath: path to the JSONL file to split
    :param split_rate: fraction of (shuffled) lines assigned to the dev set
    :return: (train_lines, dev_lines)
    """
    with open(train_datapath, "r") as f:
        lines = JSONLineReader().process(f)
    random.shuffle(lines)
    cut = int(len(lines) * split_rate)
    return lines[cut:], lines[:cut]
def load_words(embedding_file, train_datapath, test_path, db_filename,
               num_sample, sampled_path):
    """
    Collect the normalized vocabulary appearing in training samples, test
    claims, and the sentences of their predicted / gold evidence pages,
    restricted to words present in the embedding file.

    Bug fix: the original rebound the accumulator set with
    `words = nltk.word_tokenize(...)`; since `_insert` closes over that same
    name, it then called `.add` on a list and crashed. Token sequences now
    use separate names.

    :param embedding_file: embeddings whose index defines the valid word set
    :param train_datapath: training JSONL (forwarded to load_generate_samples)
    :param test_path: test JSONL with claims, evidence, and predicted pages
    :param db_filename: FEVER document database path
    :param num_sample: forwarded to load_generate_samples
    :param sampled_path: forwarded to load_generate_samples
    :return: set of normalized in-vocabulary words
    """
    words = set()
    valid_words = index_embedding_words(embedding_file)

    def _insert(iterable):
        # Normalize each token and keep it only if the embedding knows it.
        for w in iterable:
            w = Dictionary.normalize(w)
            if valid_words and w not in valid_words:
                continue
            words.add(w)

    X_claim, X_sents, y = load_generate_samples(db_filename, train_datapath,
                                                num_sample, sampled_path)
    for claim in set(X_claim):
        _insert(nltk.word_tokenize(claim))
    for sent in X_sents:
        _insert(simple_tokenizer(sent))
    with open(test_path, "r") as f:
        jlr = JSONLineReader()
        db = FeverDocDB(db_filename)
        lines = jlr.process(f)
        for line in lines:
            _insert(nltk.word_tokenize(line['claim']))
            evidence_set = set([(evidence[2], evidence[3])
                                for evidences in line['evidence']
                                for evidence in evidences])
            pages = set()
            pages.update(evidence[0] for evidence in line['predicted_pages'])
            pages.update(evidence[0] for evidence in evidence_set)
            for page in pages:
                doc_lines = db.get_doc_lines(page)
                if not doc_lines:
                    continue
                # Second tab-separated field of each DB line is the sentence
                # text; single-char fields are treated as empty.
                texts = [
                    doc_line.split("\t")[1]
                    if len(doc_line.split("\t")[1]) > 1 else ""
                    for doc_line in doc_lines.split("\n")
                ]
                for text in texts:
                    if text:
                        _insert(simple_tokenizer(text))
    return words
def test_data_4_siamese(db_filename, dataset_path):
    """
    Build per-claim parallel lists of (claim, sentence) inputs for a siamese
    ranker, drawing candidate sentences from each line's predicted pages.

    :param db_filename: FEVER document database path
    :param dataset_path: JSONL file with 'claim' and 'predicted_pages'
    :return: (X_claims, X_sents, all_sents_id) — claim copies, sentence texts,
             and (page, sentence_index) pairs, one sub-list per input line
    """
    db = FeverDocDB(db_filename)
    jlr = JSONLineReader()
    X_claims, X_sents, all_sents_id = [], [], []
    with open(dataset_path, "r") as f:
        lines = jlr.process(f)
    for line in tqdm(lines):
        claim = line['claim']
        pages = {evidence[0] for evidence in line['predicted_pages']}
        p_lines = []
        for page in pages:
            doc_lines = db.get_doc_lines(page)
            if not doc_lines:
                continue
            # Second tab field is the sentence; single-char fields become "".
            texts = [
                dl.split("\t")[1] if len(dl.split("\t")[1]) > 1 else ""
                for dl in doc_lines.split("\n")
            ]
            p_lines.extend(zip(texts, [page] * len(texts), range(len(texts))))
        claims, sents, sent_ids = [], [], []
        for text, page, idx in p_lines:
            if not text:
                continue
            claims.append(claim)
            sents.append(text)
            sent_ids.append((page, idx))
        X_claims.append(claims)
        X_sents.append(sents)
        all_sents_id.append(sent_ids)
    return X_claims, X_sents, all_sents_id
def test_data(db_path, dataset_path, type="ranking"):
    """
    generate dev examples to feed into the classifier

    Bug fix: the original unconditionally rebuilt `inputs` from
    zip(X_claim, X_sents) at the end; in "ranking" mode those lists are empty,
    so the collected ranking examples were silently discarded. The zip is now
    applied only in "cos" mode.

    :param db_path: JSON document database file
    :param dataset_path: JSONL dataset with 'claim' and 'predicted_evidence'
    :param type: "cos" -> inputs are (claims, sentences) pairs per line;
                 "ranking" -> inputs are lists of (claim, sentence) tuples
    :return: (inputs, indexes) where indexes holds the evidence locations
    """
    with open(db_path) as f:
        db = json.load(f)
    jsr = JSONLineReader()
    inputs = []
    X_claim = []
    X_sents = []
    indexes = []
    with open(dataset_path, "r") as f:
        lines = jsr.process(f)
    for line in tqdm(lines):
        valid_lines = []
        claims = []
        sent_indexes = []
        claim = line['claim']
        for doc_line in line['predicted_evidence']:
            if not doc_line:
                continue
            if type == "cos":
                sent_indexes.append(doc_line)
                valid_lines.append(get_whole_evidence([doc_line], db))
                claims.append(claim)
            elif type == "ranking":
                sent_indexes.append((doc_line[0], doc_line[1]))
                valid_lines.append((claim, get_whole_evidence([doc_line], db)))
        if type == "cos":
            X_sents.append(valid_lines)
            X_claim.append(claims)
        elif type == "ranking":
            inputs.append(valid_lines)
        indexes.append(sent_indexes)
    if type == "cos":
        inputs = list(zip(X_claim, X_sents))
    return inputs, indexes
def test_data_loader(save_path, db_filename=None, data_path=None):
    """
    Load cached (claims, sentences, indexes) triples, or build them from the
    raw dataset and cache the result.

    Bug fix: the original cached `zip(claims, ...)` directly; a zip object
    cannot be pickled (TypeError in Python 3), so the cache write always
    failed. The triples are now materialized with list() first.

    :param save_path: pickle cache path; used when it exists
    :param db_filename: document DB path, needed only on a cache miss
    :param data_path: raw JSONL path, needed only on a cache miss
    :return: (claims, list_sents, sents_indexes)
    """
    if os.path.exists(save_path):
        with open(save_path, 'rb') as f:
            X = pickle.load(f)
        claims, list_sents, sents_indexes = zip(*X)
    else:
        with open(data_path, "rb") as f:
            jlr = JSONLineReader()
            lines = jlr.process(f)
        claims, list_sents, sents_indexes = test_processing(db_filename, lines)
        X = list(zip(claims, list_sents, sents_indexes))
        with open(save_path, 'wb') as f:
            pickle.dump(X, f)
    return claims, list_sents, sents_indexes
def sampling(self, datapath, num_sample=1):
    """
    Build (claim, positive_sentence, negative_sentence) triples where negatives
    come from the line's predicted_evidence entries that are not gold evidence.
    Lines labeled NOT ENOUGH INFO are skipped. Debug prints are intentional
    and preserved.

    :param datapath: JSONL file with 'label', 'claim', 'evidence',
                     'predicted_evidence'
    :param num_sample: negatives drawn per positive (capped by availability)
    :return: list of (claim, pos_sent, neg_sent) tuples
    """
    jlr = JSONLineReader()
    triples = []
    count = 0
    with open(datapath, "r") as f:
        lines = jlr.process(f)
    for line in tqdm(lines):
        print('line: ', line)
        count += 1
        if line['label'].upper() == "NOT ENOUGH INFO":
            continue
        claim = line['claim']
        print('claim: ', claim)
        pos_set = set()
        pos_set_ref = []
        for evidence_set in line['evidence']:
            for evidence_sentence in evidence_set:
                pos_set.add(
                    self.get_whole_evidence([evidence_sentence], self.db))
                pos_set_ref.append(evidence_sentence)
        print('pos_set: ', pos_set)
        # Predicted evidence not matching any gold reference is a negative.
        neg_sents = [
            self.get_whole_evidence([neg_evidence], self.db)
            for neg_evidence in line['predicted_evidence']
            if neg_evidence not in pos_set_ref
        ]
        num_sampling = min(num_sample, len(neg_sents))
        if num_sampling == 0:
            continue
        for pos_sent in pos_set:
            for sample in random.sample(neg_sents, num_sampling):
                print('sample: ', sample)
                if not sample:
                    continue
                triples.append((claim, pos_sent, sample))
                if count % 1000 == 0:
                    print("claim:{} ,evidence :{} sample:{}".format(
                        claim, pos_sent, sample))
    return triples
def prediction_processing(dataset_path, predictions):
    """
    process the predicted (doc_id,sent_id) pairs to the score system desired format

    Bug fix: the original called prediction_processing_no_reload but discarded
    its return value and returned the never-populated local list, so this
    function always returned []. The helper's result is now returned.
    (Assumes prediction_processing_no_reload returns the processed lines —
    TODO confirm against its definition.)

    :param dataset_path: path to the JSONL dataset
    :param predictions: per-line predicted (doc_id, sent_id) pairs
    :return: processed prediction dicts
    """
    jsr = JSONLineReader()
    with open(dataset_path, "r") as f:
        lines = jsr.process(f)
    final_predictions = prediction_processing_no_reload(lines, predictions)
    return final_predictions
def prediction_processing(dataset_path, predictions, db_filename):
    """
    process the predicted (doc_id,sent_id) pairs to the score system desired format

    Also builds a human-readable error-analysis log that lists each claim with
    its gold evidence and the same number of top predicted sentences.

    :param dataset_path: JSONL dataset with 'claim' and 'evidence'
    :param predictions: per-claim (doc_id, sent_id) pairs, consumed in order
    :param db_filename: JSON document database file
    :return: (final_predictions, out_error_ana)
    """
    jsr = JSONLineReader()
    with open(db_filename) as f:
        db = json.load(f)
    final_predictions = []
    out_error_ana = []
    with open(dataset_path, "r") as f:
        lines = jsr.process(f)
    cnt = 0
    for line in lines:
        gold = line['evidence']
        # Skip claims without any gold evidence (e.g. NOT ENOUGH INFO).
        if len(gold) == 0 or not gold[0]:
            continue
        line['predicted_evidence'] = [[pred[0], int(pred[1])]
                                      for pred in predictions[cnt]]
        out_error_ana.append("Claim: " + str(cnt))
        out_error_ana.append(line['claim'])
        out_error_ana.append("Gold evidence:")
        cnt_gold = 0
        for evidence_set in gold:
            for evidence_sentence in evidence_set:
                out_error_ana.append(
                    get_whole_evidence([evidence_sentence], db))
                cnt_gold += 1
        out_error_ana.append("Predicted evidence:")
        # Show only as many predictions as there are gold sentences.
        for pred in line['predicted_evidence'][:cnt_gold]:
            out_error_ana.append(get_whole_evidence([pred], db))
        out_error_ana.append("")
        line['predicted_label'] = "refutes"
        final_predictions.append(line)
        cnt += 1
        if cnt == len(predictions):
            break
    return final_predictions, out_error_ana
def sampling(self, datapath, num_sample=1):
    """
    Sample training data from a JSONL file by running self.handle on every
    line across a 48-worker thread pool; None results are dropped.

    :param datapath: JSONL input path
    :param num_sample: forwarded to self.handle per line
    :return: flat list of all samples produced by the workers
    """
    jlr = JSONLineReader()
    print("sampling for " + datapath)
    with open(datapath, "r") as f:
        lines = jlr.process(f)
    print(len(lines))
    collected = []
    with ThreadPool(processes=48) as pool:
        worker = lambda x: self.handle(x, num_sample)
        for result in tqdm(pool.imap(worker, lines), total=len(lines)):
            if result is not None:
                collected.extend(result)
    print("Done")
    return collected
def dev_processing(self, data_path):
    """
    Build dev examples and binary labels from predicted and gold pages.

    For each non-NEI line, every valid sentence of the union of predicted
    pages and gold-evidence pages becomes a (claim, sentence) pair, labeled 1
    iff its (page, sent_id) is in the gold evidence set. Lines yielding no
    pairs are dropped.

    :param data_path: JSONL file with 'label', 'claim', 'evidence',
                      'predicted_pages'
    :return: (devs, labels) parallel lists of per-claim examples / labels
    """
    jlr = JSONLineReader()
    with open(data_path, "r") as f:
        lines = jlr.process(f)
    devs, labels = [], []
    for line in tqdm(lines):
        if line['label'].upper() == "NOT ENOUGH INFO":
            continue
        evidence_set = set([(evidence[2], evidence[3])
                            for evidences in line['evidence']
                            for evidence in evidences])
        pages = [page for page in line['predicted_pages'] if page is not None]
        pages.extend(page for page, _ in evidence_set)
        p_lines = []
        for page in set(pages):
            doc_lines = self.db.get_doc_lines(page)
            p_lines.extend(self.get_valid_texts(doc_lines, page))
        dev, label = [], []
        for doc_line in p_lines:
            if not doc_line[0]:
                continue
            dev.append((line['claim'], doc_line[0]))
            label.append(1 if (doc_line[1], doc_line[2]) in evidence_set else 0)
        if dev and label:
            devs.append(dev)
            labels.append(label)
    return devs, labels
def cos_train(db_filepath, dataset_path):
    """
    Use the cosine similarity score to rank (claim,sentence) pair in the dev set
    don't need training data

    Builds (claim, evidence_sentence) pairs from each line's predicted
    evidence, labeled 1 iff the sentence text matches a gold evidence
    sentence. NEI lines and lines without gold evidence are skipped.

    :param db_filepath: JSON document database file
    :param dataset_path: JSONL dataset path
    :return: (X, y) pairs and binary labels
    """
    with open(db_filepath) as f:
        db = json.load(f)
    jlr = JSONLineReader()
    X, y = [], []
    with open(dataset_path, "r") as f:
        lines = jlr.process(f)
    for line in tqdm(lines):
        if line['label'] == "NOT ENOUGH INFO":
            continue
        gold = line['evidence']
        if len(gold) == 0 or not gold[0]:
            continue
        pos_set = set()
        for evidence_set in gold:
            for evidence_sentence in evidence_set:
                pos_set.add(get_whole_evidence([evidence_sentence], db))
        for ref in line['predicted_evidence']:
            sentence = get_whole_evidence([ref], db)
            X.append((line['claim'], sentence))
            y.append(1 if sentence in pos_set else 0)
    return X, y
def data_processing_for_joint(self, data_path):
    """
    Build joint-model inputs: per-claim sentence pairs, per-sentence binary
    labels, and a per-line claim label index.

    NOTE(review): the claim label is appended before the empty-evidence skip,
    so claim_labels can be longer than datas/sent_labels — behavior preserved
    from the original; confirm downstream alignment.

    :param data_path: JSONL file with 'label', 'claim', 'evidence',
                      'predicted_pages'
    :return: (datas, sent_labels, claim_labels)
    """
    from athene.rte.utils.data_reader import label_dict
    jlr = JSONLineReader()
    with open(data_path, "r") as f:
        lines = jlr.process(f)
    datas, sent_labels, claim_labels = [], [], []
    for line in tqdm(lines):
        claim_labels.append(label_dict.index(line['label']))
        evidence_set = set([(evidence[2], evidence[3])
                            for evidences in line['evidence']
                            for evidence in evidences])
        pages = [page for page in line['predicted_pages'] if page is not None]
        pages.extend(page for page, _ in evidence_set)
        p_lines = []
        for page in set(pages):
            doc_lines = self.db.get_doc_lines(page)
            p_lines.extend(self.get_valid_texts(doc_lines, page))
        data, sent_label = [], []
        for doc_line in p_lines:
            if not doc_line[0]:
                continue
            data.append((line['claim'], doc_line[0]))
            sent_label.append(
                1 if (doc_line[1], doc_line[2]) in evidence_set else 0)
        if not data or not sent_label:
            continue
        datas.append(data)
        sent_labels.append(sent_label)
    return datas, sent_labels, claim_labels
def predict_processing(db_path, dataset_path):
    """
    Build (claim, evidence_sentence) prediction inputs from each line's
    predicted evidence, plus the matching evidence references. Lines without
    gold evidence are skipped; lines yielding nothing get a dummy entry.

    NOTE(review): pos_set is computed but never read afterwards — preserved
    from the original; confirm whether it can be removed.

    :param db_path: JSON document database file
    :param dataset_path: JSONL dataset path
    :return: (devs, all_indexes) per-line pair lists and evidence references
    """
    with open(db_path) as f:
        db = json.load(f)
    jlr = JSONLineReader()
    devs, all_indexes = [], []
    with open(dataset_path, "rb") as f:
        lines = jlr.process(f)
    for line in tqdm(lines):
        claim = line['claim']
        gold = line['evidence']
        if len(gold) == 0 or not gold[0]:
            continue
        pos_set = set()
        for evidence_set in gold:
            for evidence_sentence in evidence_set:
                pos_set.add(get_whole_evidence([evidence_sentence], db))
        dev, indexes = [], []
        for ref in line['predicted_evidence']:
            dev.append((line['claim'], get_whole_evidence([ref], db)))
            indexes.append(ref)
        if not dev:
            dev.append((claim, 'no evidence for this claim'))
            indexes.append(('empty', 0))
        devs.append(dev)
        all_indexes.append(indexes)
    return devs, all_indexes
def generate_submission(_predictions, test_set_path, submission_path):
    """
    Generate submission file for shared task: http://fever.ai/task.html

    Predictions are paired positionally with the test-set lines (zip stops at
    the shorter sequence).

    :param _predictions: predicted label indices, aligned with the test lines
    :param test_set_path: JSONL test set with 'id' and 'predicted_evidence'
    :param submission_path: output JSONL path
    :return: None (writes the submission file)
    """
    jlr = JSONLineReader()
    json_lines = jlr.read(test_set_path)
    with open(submission_path, 'w') as out:
        for _prediction, line in tqdm(zip(_predictions, json_lines)):
            evidence = line['predicted_evidence']
            for i, ev in enumerate(evidence):
                evidence[i][0] = normalize(ev[0])
            record = {
                "id": line['id'],
                "predicted_evidence": evidence,
                "predicted_label": prediction_2_label(_prediction)
            }
            out.write(json.dumps(record))
            out.write('\n')
def dev_processing(self, data_path):
    """
    Build dev examples and labels from predicted evidence: each predicted
    evidence sentence becomes a (claim, sentence) pair labeled 1 iff its text
    matches a gold evidence sentence. Lines without gold evidence, or that
    yield no pairs, are skipped.

    :param data_path: JSONL file with 'claim', 'evidence', 'predicted_evidence'
    :return: (devs, labels) parallel per-claim lists
    """
    jlr = JSONLineReader()
    with open(data_path, "r") as f:
        lines = jlr.process(f)
    devs, labels = [], []
    for line in tqdm(lines):
        gold = line['evidence']
        if len(gold) == 0 or not gold[0]:
            continue
        pos_set = set()
        for evidence_set in gold:
            for evidence_sentence in evidence_set:
                pos_set.add(
                    self.get_whole_evidence([evidence_sentence], self.db))
        dev, label = [], []
        for ref in line['predicted_evidence']:
            sentence = self.get_whole_evidence([ref], self.db)
            dev.append((line['claim'], sentence))
            label.append(1 if sentence in pos_set else 0)
        if dev and label:
            devs.append(dev)
            labels.append(label)
    return devs, labels
def generate_prediction_files(predictions, p_sents_indexes, data_path,
                              final_prediction_path):
    """
    transform the generated predictions from classifier to lists of dicts form to feed into the score system

    :param predictions: per-line binary flags over candidate sentences
    :param p_sents_indexes: per-line (page, sent_id) pairs, aligned with flags
    :param data_path: JSONL dataset whose lines are annotated
    :param final_prediction_path: output JSONL path
    :return: the list of annotated line dicts
    """
    jlr = JSONLineReader()
    with open(data_path, "r") as f:
        lines = jlr.process(f)
    print(len(predictions))
    print(len(p_sents_indexes))
    print(len(lines))
    assert len(predictions) == len(p_sents_indexes) == len(lines)
    final_predictions = []
    for idx, line in enumerate(lines):
        flags = predictions[idx]
        pairs = p_sents_indexes[idx]
        # Keep only the sentences flagged 1 by the classifier.
        line['predicted_evidence'] = [[pairs[i][0], pairs[i][1]]
                                      for i in range(len(pairs))
                                      if flags[i] == 1]
        line['predicted_label'] = 'refutes'
        final_predictions.append(line)
    with open(final_prediction_path, "w") as f:
        for prediction in final_predictions:
            f.write(json.dumps(prediction) + '\n')
    return final_predictions
def __init__(self,
             db: FeverDocDB,
             wiki_tokenizer: Tokenizer = None,
             claim_tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None) -> None:
    """
    Dataset reader over a FEVER document DB.

    Missing tokenizers default to WordTokenizer; missing indexers default to
    a single-id 'tokens' indexer. A FEVERSentenceFormatter is built over the
    DB's full document-id set.
    """
    self.db = db
    self._wiki_tokenizer = wiki_tokenizer or WordTokenizer()
    self._claim_tokenizer = claim_tokenizer or WordTokenizer()
    self._token_indexers = token_indexers or {
        'tokens': SingleIdTokenIndexer()
    }
    self.reader = JSONLineReader()
    self.formatter = FEVERSentenceFormatter(set(self.db.get_doc_ids()),
                                            FEVERLabelSchema())
def predict_processing(self, datapath):
    """
    Build prediction inputs from each line's predicted pages: every valid
    sentence of every predicted page becomes a (claim, sentence) pair, with
    its (page, sent_id) recorded. Lines yielding nothing get a dummy entry so
    output stays aligned with the input lines.

    :param datapath: JSONL file with 'claim' and 'predicted_pages'
    :return: (devs, all_indexes) per-line pair lists and location pairs
    """
    jlr = JSONLineReader()
    devs, all_indexes = [], []
    with open(datapath, "rb") as f:
        lines = jlr.process(f)
    for line in tqdm(lines):
        claim = line['claim']
        pages = set(line['predicted_pages'])
        p_lines = []
        for page in pages:
            doc_lines = self.db.get_doc_lines(page)
            if not doc_lines:
                continue
            p_lines.extend(self.get_valid_texts(doc_lines, page))
        dev, indexes = [], []
        for doc_line in p_lines:
            if not doc_line[0]:
                continue
            dev.append((claim, doc_line[0]))
            indexes.append((doc_line[1], doc_line[2]))
        if not dev:
            dev.append((claim, 'no evidence for this claim'))
            indexes.append(('empty', 0))
        devs.append(dev)
        all_indexes.append(indexes)
    return devs, all_indexes
def tfidf_test_processing(base_path, dbfilename, test_data_path,
                          test_store_path, pro_extract_sents_path,
                          h_max_length, s_max_length, iword_dict):
    """
    Build index tensors for the (claim, sentence) pairs selected by the tf-idf
    sentence extractor, filtering the dev candidates down to the extractor's
    predicted sentences.

    Bug fix: when the cached index file existed, the original returned
    `new_location_indexes` without ever defining it (NameError). The filtered
    locations are now computed unconditionally; only the expensive index
    tensors use the cache.

    :param base_path: project root used to locate the cached index file
    :param dbfilename: document DB path (forwarded to dev_data_loader)
    :param test_data_path: raw test JSONL (forwarded to dev_data_loader)
    :param test_store_path: dev-candidate cache (forwarded to dev_data_loader)
    :param pro_extract_sents_path: JSONL with 'claim' and 'predicted_sentences'
    :param h_max_length: claim max token length (part of the cache filename)
    :param s_max_length: sentence max token length (part of the cache filename)
    :param iword_dict: word->index dict for test_data_indexes
    :return: (devs_indexes, new_location_indexes)
    """
    dev_index_path = os.path.join(
        base_path,
        "data/train_data/dev.h_{}.s_{}.tfidf.indexes.p".format(
            h_max_length, s_max_length))
    devs, location_indexes = dev_data_loader(test_store_path, dbfilename,
                                             test_data_path)

    with open(pro_extract_sents_path, "r") as f:
        jlr = JSONLineReader()
        lines = jlr.process(f)
    inputs = []
    new_location_indexes = []
    for i, line in enumerate(lines):
        claim = line['claim']
        predict_sents_set = set([
            (doc_id, sent_num) for doc_id, sent_num in line['predicted_sentences']
        ])
        pro_extract_sents = []
        sent_index = []
        # Keep only the dev candidates the extractor actually predicted.
        for j, index in enumerate(location_indexes[i]):
            if (index[0], index[1]) in predict_sents_set:
                pro_extract_sents.append((claim, devs[i][j][1]))
                sent_index.append((index[0], index[1]))
        inputs.append(pro_extract_sents)
        new_location_indexes.append(sent_index)

    if os.path.exists(dev_index_path):
        with open(dev_index_path, "rb") as f:
            devs_indexes = pickle.load(f)
    else:
        devs_indexes = test_data_indexes(inputs, iword_dict, h_max_length,
                                         s_max_length)
    return devs_indexes, new_location_indexes
def dev_processing(db_filename, datapath):
    """
    Build dev prediction inputs from each line's predicted pages: every valid
    sentence of every page becomes a (claim, sentence) pair with its
    (page, sent_id) recorded. A hard-coded fallback page is used when a line
    has no predicted pages, and a dummy entry keeps output aligned when no
    sentences are found.

    :param db_filename: FEVER document database path
    :param datapath: JSONL file with 'claim' and 'predicted_pages'
    :return: (devs, all_indexes) per-line pair lists and location pairs
    """
    db = FeverDocDB(db_filename)
    jlr = JSONLineReader()
    devs, all_indexes = [], []
    with open(datapath, "rb") as f:
        lines = jlr.process(f)
    for line in tqdm(lines):
        claim = line['claim']
        pages = {page[0] for page in line['predicted_pages']}
        if not pages:
            # Fallback so every claim has at least one candidate document.
            pages.add("Michael_Hutchence")
        p_lines = []
        for page in pages:
            doc_lines = db.get_doc_lines(page)
            if not doc_lines:
                continue
            p_lines.extend(get_valid_texts(doc_lines, page))
        dev, indexes = [], []
        for doc_line in p_lines:
            if not doc_line[0]:
                continue
            dev.append((claim, doc_line[0]))
            indexes.append((doc_line[1], doc_line[2]))
        if not dev:
            dev.append((claim, 'no evidence for this claim'))
            indexes.append(('empty', 0))
        devs.append(dev)
        all_indexes.append(indexes)
    return devs, all_indexes
def __init__(self,
             db: Union[FeverDocDB, SnopesDocDB],
             sentence_level=False,
             wiki_tokenizer: Tokenizer = None,
             claim_tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             filtering: str = None) -> None:
    """
    Dataset reader over a FEVER or Snopes document DB.

    Missing tokenizers default to WordTokenizer; missing indexers default to
    a single-id 'tokens' indexer. A FEVERGoldFormatter is built over the DB's
    full document-id set with the given filtering mode.
    """
    self.db = db
    self._sentence_level = sentence_level
    self._wiki_tokenizer = wiki_tokenizer or WordTokenizer()
    self._claim_tokenizer = claim_tokenizer or WordTokenizer()
    self._token_indexers = token_indexers or {
        'tokens': SingleIdTokenIndexer()
    }
    self.reader = JSONLineReader()
    self.formatter = FEVERGoldFormatter(set(self.db.get_doc_ids()),
                                        FEVERLabelSchema(),
                                        filtering=filtering)
default='simple', help=("String option specifying tokenizer type to use " "(e.g. 'corenlp')")) parser.add_argument('--num-workers', type=int, default=None, help='Number of CPU processes (for tokenizing, etc)') args = parser.parse_args() doc_freqs = None if args.use_precomputed: _, metadata = utils.load_sparse_csr(args.model) doc_freqs = metadata['doc_freqs'].squeeze() db = FeverDocDB("data/fever/fever.db") jlr = JSONLineReader() formatter = FEVERGoldFormatter(set(), FEVERLabelSchema()) jlr = JSONLineReader() with open(args.in_file, "r") as f, open( "data/fever/{0}.sentences.{3}.p{1}.s{2}.jsonl".format( args.split, args.max_page, args.max_sent, "precomputed" if args.use_precomputed else "not_precomputed"), "w+") as out_file: lines = jlr.process(f) #lines = tf_idf_claims_batch(lines) for line in tqdm(lines): line = tf_idf_claim(line) out_file.write(json.dumps(line) + "\n")
max_evidence] _line['scores'] = _line['scores'][:args.max_evidence] return _line if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('input', help='/path/to/input/file') parser.add_argument('output', help='/path/to/output/file') parser.add_argument('--max_evidence', help='max num of evidences', type=int, default=5) args = parser.parse_args() LogHelper.setup() logger = LogHelper.get_logger("replace_noise_dataset") random.seed(55) jlr = JSONLineReader() lines = jlr.read(args.input) counter = 0 with open(args.output, 'w') as f: for i, line in tqdm(enumerate(lines)): if not line[ 'label'] == 'NOT ENOUGH INFO' and not is_gold_evidence_predicted( line): counter += 1 logger.info("line " + str(i + 1) + " should be filled") line = random_fill_gold_evidence(line) f.write(json.dumps(line) + '\n') logger.info(str(counter) + " samples filled with gold evidence")
for sample in samples: evidence_pages.add(sample) elif len(evidence_pages) >= k: samples = random.sample(evidence_pages, k) evidence_pages = set(samples) return evidence_pages path = os.getcwd() path = re.sub("/src.*", "", path) db = FeverDocDB(os.path.join(path, "data/fever/fever.db")) doc_ids = db.get_doc_ids() doc_ids = doc_ids[1:] jlr = JSONLineReader() # with open(os.path.join(path, "data/fever-data/train.jsonl"), "r") as f: # with open(os.path.join(path, 'data/fever/train.p5.jsonl'), "w") as f2: # lines = f.readlines() # for line in lines: # js = json.loads(line) # pages = sample_doc(js,doc_ids,k=5) # js['predicted_pages'] = list(pages) # f2.write(json.dumps(js)+"\n") with open(os.path.join(path, "data/fever-data/dev.jsonl"), "r") as f: with open(os.path.join(path, "data/fever/dev.p5.jsonl"), "w") as f2: lines = f.readlines() for line in lines: js = json.loads(line) pages = sample_doc(js, doc_ids, k=5)