def convert_run(args):
    doc_sentences = {}
    rankings = {}

    # read in input run file and save rankings to dict
    with open(args.input_run_file, 'r', encoding='utf-8') as f:
        print('Reading run file...')
        for line in f:
            query_id, doc_id, rank = line.strip().split('\t')
            if doc_id not in doc_sentences:
                doc_sentences[doc_id] = []
            if query_id not in rankings:
                rankings[query_id] = []
            rankings[query_id].append(doc_id)

    # read through all wiki dump files and save sentence IDs for involved docs
    print('Reading wiki pages...')
    for file in os.listdir(args.collection_folder):
        with open(os.path.join(args.collection_folder, file), 'r',
                  encoding='utf-8') as f:
            for line in f:
                line_json = json.loads(line.strip())
                if line_json['id'] in doc_sentences:
                    sent_ids = [
                        id for sent, id in extract_sentences(line_json['lines'])
                        if sent
                    ]
                    doc_sentences[line_json['id']].extend(sent_ids)

    # write expanded sentence IDs to output run file
    with open(args.output_run_file, 'w', encoding='utf-8') as f:
        print('Writing sentences to run file...')
        for query_id, doc_ids in rankings.items():
            query_index = 1
            for doc_id in doc_ids[:args.k]:
                for sent_num in doc_sentences[doc_id]:
                    sent_id = make_sentence_id(doc_id, sent_num)
                    f.write(f'{query_id}\t{sent_id}\t{query_index}\n')
                    query_index += 1
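
# The scripts in this excerpt rely on a few small helpers (make_sentence_id,
# split_sentence_id, extract_sentences) that are defined elsewhere. Below is a
# minimal sketch of what they could look like; the '<doc_id>_<sent_num>' ID
# convention and the parsing of the FEVER wiki 'lines' field
# ('<sent_num>\t<sentence text>[\t<linked entities>...]' per line) are
# assumptions, not necessarily the original implementation.

def make_sentence_id(doc_id, sent_num):
    # join a wiki page ID and a sentence number into one sentence ID
    return f'{doc_id}_{sent_num}'


def split_sentence_id(sent_id):
    # split from the right so page IDs that contain '_' stay intact
    doc_id, sent_num = sent_id.rsplit('_', 1)
    return doc_id, int(sent_num)


def extract_sentences(lines):
    # return (sentence_text, sentence_number) pairs indexed by sentence number,
    # so both iteration and extract_sentences(doc_text)[sent_num] work
    sentences = []
    for line in lines.split('\n'):
        fields = line.split('\t')
        if len(fields) < 2 or not fields[0].isdigit():
            continue
        sent_num = int(fields[0])
        while len(sentences) <= sent_num:
            sentences.append(('', len(sentences)))  # pad gaps with empty text
        sentences[sent_num] = (fields[1], sent_num)
    return sentences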
def convert_run(args):
    queries = {}
    labels = {}
    evidences = {}
    docs = {}
    num_truncated = 0

    # read in dataset file and save queries to dicts
    with open(args.dataset_file, 'r', encoding='utf-8') as f:
        print('Reading FEVER dataset file...')
        for line in f:
            line_json = json.loads(line.strip())
            query_id = line_json['id']
            query = line_json['claim']
            queries[query_id] = query

            if args.has_labels:
                label = line_json['label']
                if label == 'SUPPORTS':
                    labels[query_id] = 'true'
                elif label == 'REFUTES':
                    labels[query_id] = 'false'
                else:  # label == 'NOT ENOUGH INFO'
                    labels[query_id] = 'weak'

    def generate_samples(query_id, pred_sent_ids):
        evidence_sets = []
        if args.format == 'concat':
            evidence_sets = [[sent_id for sent_id in pred_sent_ids]]
        elif args.format == 'agg':
            evidence_sets = [[sent_id] for sent_id in pred_sent_ids]
        else:  # args.format == 'seq'
            curr_preds = []
            for sent_id in pred_sent_ids:
                curr_preds.append(sent_id)
                evidence_sets.append([pred for pred in curr_preds])
        return evidence_sets

    # read in run file and take top run file ranking predictions
    with open(args.run_file, 'r', encoding='utf-8') as f:
        print('Reading run file...')
        curr_query = None
        pred_sent_ids = []
        for line in f:
            query_id, sent_id, rank = line.strip().split('\t')
            query_id = int(query_id)
            # if we reach a new query in the run file, generate samples for previous query
            if query_id != curr_query:
                if curr_query is not None:
                    evidences[curr_query] = generate_samples(
                        curr_query, pred_sent_ids)
                curr_query = query_id
                pred_sent_ids.clear()
            if int(rank) <= args.max_evidences:
                doc_id, _ = split_sentence_id(sent_id)
                docs[doc_id] = 'N/A'  # placeholder
                pred_sent_ids.append(sent_id)
        # handle the final query
        evidences[curr_query] = generate_samples(curr_query, pred_sent_ids)

    # read through all wiki dump files and save doc text for involved docs
    print('Reading wiki pages...')
    for file in os.listdir(args.collection_folder):
        with open(os.path.join(args.collection_folder, file), 'r',
                  encoding='utf-8') as f:
            for line in f:
                line_json = json.loads(line.strip())
                if line_json['id'] in docs:
                    docs[line_json['id']] = line_json['lines']

    # write query-doc text pairs to files
    with open(args.output_id_file, 'w', encoding='utf-8') as f_id, \
            open(args.output_text_file, 'w', encoding='utf-8') as f_text:
        print('Writing query-doc pairs to files...')
        for query_id, query_text in queries.items():
            if args.has_labels:
                label = labels[query_id]
            for evidence_ids in evidences[query_id]:
                evidence_texts = []
                for evidence in evidence_ids:
                    # get specific sentence from within doc_text
                    doc_id, sent_num = split_sentence_id(evidence)
                    entity = doc_id.replace('_', ' ')  # prepend entity name to document text
                    doc_text = docs[doc_id]
                    sent_text, _ = extract_sentences(doc_text)[sent_num]
                    evidence_texts.append(
                        f'{normalize_text(entity)} . {normalize_text(sent_text)}')

                # format evidence ids and texts in proper format
                evidence_ids_str = ' '.join(evidence_ids)
                prefixed_evidence_texts = []
                for i, evidence_text in enumerate(evidence_texts):
                    if args.format == 'agg':
                        prefixed_evidence_texts.append(f'premise: {evidence_text}')
                    else:
                        truncated_text, num_truncated = truncate(
                            query_text, evidence_text, args.max_evidences,
                            args.max_seq_len, num_truncated)
                        prefixed_evidence_texts.append(
                            f'sentence{i + 1}: {truncated_text}')
                evidence_texts_str = ' '.join(prefixed_evidence_texts)

                if args.has_labels:
                    f_id.write(f'{query_id}\t{evidence_ids_str}\t{label}\n')
                else:
                    f_id.write(f'{query_id}\t{evidence_ids_str}\n')
                f_text.write(f'hypothesis: {query_text} {evidence_texts_str}\n')

    print(f'Number of sentences truncated: {num_truncated}')
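
# normalize_text and truncate are likewise defined outside this excerpt. The
# sketch below captures one plausible reading of what they do: normalize_text
# maps the PTB-style bracket tokens in the FEVER wiki dump back to plain
# punctuation, and truncate caps each evidence sentence at an even share of
# the token budget left after the claim. Both behaviours are assumptions.

def normalize_text(text):
    # undo PTB-style escapes and collapse runs of whitespace
    for token, char in [('-LRB-', '('), ('-RRB-', ')'), ('-LSB-', '['),
                        ('-RSB-', ']'), ('-COLON-', ':')]:
        text = text.replace(token, char)
    return ' '.join(text.split())


def truncate(query_text, evidence_text, max_evidences, max_seq_len,
             num_truncated):
    # whitespace-token budget per evidence sentence after reserving the claim
    budget = max(1, (max_seq_len - len(query_text.split())) // max_evidences)
    tokens = evidence_text.split()
    if len(tokens) <= budget:
        return evidence_text, num_truncated
    return ' '.join(tokens[:budget]), num_truncated + 1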
def generate_data(args):
    queries = {}
    evidences = {}
    pred_evidences = {}
    docs = {}

    # read in dataset file and save queries and evidences to dicts
    with open(args.dataset_file, 'r', encoding='utf-8') as f:
        print('Reading FEVER dataset file...')
        for line in f:
            line_json = json.loads(line.strip())
            query_id = line_json['id']
            query = line_json['claim']
            queries[query_id] = query

            # only save evidences for non-test sets and non-NEI queries
            deduped_evidence_set = set()
            if line_json['label'] != 'NOT ENOUGH INFO':
                for annotator in line_json['evidence']:
                    for evidence in annotator:
                        evidence[2] = ftfy.fix_text(evidence[2])
                        docs[evidence[2]] = 'N/A'  # placeholder
                        deduped_evidence_set.add(
                            make_sentence_id(evidence[2], evidence[3]))
            evidences[query_id] = deduped_evidence_set

    def generate_samples(query_id, pred_sent_ids):
        curr_pred_evidences = []
        # include all ground truth relevant evidences as positive samples
        for sent_id in evidences[query_id]:
            curr_pred_evidences.append(sent_id)
        # sample negative evidences from pred_sent_ids
        neg_pred_sent_ids = [
            pred for pred in pred_sent_ids if pred not in evidences[query_id]
        ]
        neg_sent_ids = random.sample(
            neg_pred_sent_ids,
            min(len(evidences[query_id]), len(neg_pred_sent_ids)))
        for sent_id in neg_sent_ids:
            doc_id, _ = split_sentence_id(sent_id)
            docs[doc_id] = 'N/A'  # placeholder
            curr_pred_evidences.append(sent_id)
        return curr_pred_evidences

    # read in run file and negative sample using run file ranking predictions
    with open(args.run_file, 'r', encoding='utf-8') as f:
        print('Reading run file...')
        curr_query = None
        pred_sent_ids = []
        for line in f:
            query_id, sent_id, rank = line.strip().split('\t')
            query_id = int(query_id)
            # if we reach a new query in the run file, perform sampling for the previous query
            if query_id != curr_query:
                if curr_query is not None:
                    pred_evidences[curr_query] = generate_samples(
                        curr_query, pred_sent_ids)
                curr_query = query_id
                pred_sent_ids.clear()
            if args.min_rank <= int(rank) <= args.max_rank:
                pred_sent_ids.append(sent_id)
        # perform sampling for the final query
        pred_evidences[curr_query] = generate_samples(curr_query, pred_sent_ids)

    # read through all wiki dump files and save doc text for involved docs
    print('Reading wiki pages...')
    for file in os.listdir(args.collection_folder):
        with open(os.path.join(args.collection_folder, file), 'r',
                  encoding='utf-8') as f:
            for line in f:
                line_json = json.loads(line.strip())
                if line_json['id'] in docs:
                    docs[line_json['id']] = line_json['lines']

    # write query-doc text pairs to files
    with open(args.output_id_file, 'w', encoding='utf-8') as f_id, \
            open(args.output_text_file, 'w', encoding='utf-8') as f_text:
        print('Writing query-doc pairs to files...')
        for query_id, sent_ids in pred_evidences.items():
            query_text = queries[query_id]
            for rank, sent_id in enumerate(sent_ids):
                relevance = 'true' if sent_id in evidences[query_id] else 'false'
                # get specific sentence from within doc_text
                doc_id, sent_num = split_sentence_id(sent_id)
                entity = doc_id.replace('_', ' ')  # prepend entity name to document text
                doc_text = docs[doc_id]
                sent_text, _ = extract_sentences(doc_text)[sent_num]
                f_id.write(f'{query_id}\t{sent_id}\t{rank + 1}\n')
                f_text.write(
                    f'Query: {query_text} Document: {entity} . '
                    f'{normalize_text(sent_text)} Relevant:\t{relevance}\n')
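
# A possible command-line entry point for the re-ranking data generator above.
# The option names mirror the attributes read from `args`; the exact flag
# spellings and defaults are assumptions rather than the original script's.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Generate sentence re-ranking training data from a FEVER run file.')
    parser.add_argument('--dataset_file', required=True)
    parser.add_argument('--run_file', required=True)
    parser.add_argument('--collection_folder', required=True)
    parser.add_argument('--output_id_file', required=True)
    parser.add_argument('--output_text_file', required=True)
    parser.add_argument('--min_rank', type=int, default=1)
    parser.add_argument('--max_rank', type=int, default=100)
    generate_data(parser.parse_args())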
def convert_run(args):
    queries = {}
    evidences = {}
    pred_evidences = {}
    docs = {}

    # read in dataset file and save queries and evidences to dicts
    with open(args.dataset_file, 'r', encoding='utf-8') as f:
        print('Reading FEVER dataset file...')
        for line in f:
            line_json = json.loads(line.strip())
            query_id = line_json['id']
            query = line_json['claim']
            queries[query_id] = query

            # only save evidences for non-test sets and non-NEI queries
            deduped_evidence_set = set()
            if args.has_labels and line_json['label'] != 'NOT ENOUGH INFO':
                for annotator in line_json['evidence']:
                    for evidence in annotator:
                        evidence[2] = ftfy.fix_text(evidence[2])
                        docs[evidence[2]] = 'N/A'  # placeholder
                        deduped_evidence_set.add(
                            make_sentence_id(evidence[2], evidence[3]))
            evidences[query_id] = deduped_evidence_set

    # read in run file and save rankings to dict
    with open(args.run_file, 'r', encoding='utf-8') as f:
        print('Reading run file...')
        for line in f:
            query_id, sent_id, rank = line.strip().split('\t')
            query_id = int(query_id)
            doc_id, _ = split_sentence_id(sent_id)
            docs[doc_id] = 'N/A'  # placeholder
            if query_id not in pred_evidences:
                pred_evidences[query_id] = []
            if args.k is None or int(rank) <= args.k:
                pred_evidences[query_id].append(sent_id)

    # read through all wiki dump files and save doc text for involved docs
    print('Reading wiki pages...')
    for file in os.listdir(args.collection_folder):
        with open(os.path.join(args.collection_folder, file), 'r',
                  encoding='utf-8') as f:
            for line in f:
                line_json = json.loads(line.strip())
                if line_json['id'] in docs:
                    docs[line_json['id']] = line_json['lines']

    # write query-doc pairs to files
    with open(args.output_id_file, 'w', encoding='utf-8') as f_id, \
            open(args.output_text_file, 'w', encoding='utf-8') as f_text:
        print('Writing query-doc pairs to files...')
        for query_id, sent_ids in pred_evidences.items():
            query_text = queries[query_id]
            if args.type == 'mono':
                if args.ner:
                    ner_entities = extract_entities(query_text)
                for rank, sent_id in enumerate(sent_ids):
                    if args.has_labels:
                        relevance = 'true' if sent_id in evidences[query_id] else 'false'
                    # get specific sentence from within doc_text
                    doc_id, sent_num = split_sentence_id(sent_id)
                    entity = doc_id.replace('_', ' ')  # prepend entity name to document text
                    doc_text = docs[doc_id]
                    sent_text, _ = extract_sentences(doc_text)[sent_num]

                    # write query-doc pair ids and texts
                    if args.has_labels:
                        f_id.write(
                            f'{query_id}\t{sent_id}\t{rank + 1}\t{relevance}\n')
                    else:
                        f_id.write(f'{query_id}\t{sent_id}\t{rank + 1}\n')
                    if args.ner:
                        numbered_entities = [
                            f'Entity{i + 1}: {entity}'
                            for i, entity in enumerate(ner_entities)
                        ]
                        entities_str = ' '.join(numbered_entities)
                        f_text.write(
                            f'Query: {query_text} Document: {entity} . '
                            f'{normalize_text(sent_text)} {entities_str} Relevant:\n')
                    else:
                        f_text.write(
                            f'Query: {query_text} Document: {entity} . '
                            f'{normalize_text(sent_text)} Relevant:\n')
            else:  # args.type == 'duo'
                ranked_sent_ids = [(sent_id, i)
                                   for i, sent_id in enumerate(sent_ids)]
                for (sent_id_1, rank_1), (sent_id_2, rank_2) in \
                        itertools.permutations(ranked_sent_ids, 2):
                    if args.has_labels:
                        relevance = 'true' if sent_id_1 in evidences[query_id] else 'false'
                    # get specific sentence from within doc_text
                    doc_id_1, sent_1_num = split_sentence_id(sent_id_1)
                    entity_1 = doc_id_1.replace('_', ' ')  # prepend entity name to document text
                    doc_text_1 = docs[doc_id_1]
                    sent_1_text, _ = extract_sentences(doc_text_1)[sent_1_num]
                    doc_id_2, sent_2_num = split_sentence_id(sent_id_2)
                    entity_2 = doc_id_2.replace('_', ' ')  # prepend entity name to document text
                    doc_text_2 = docs[doc_id_2]
                    sent_2_text, _ = extract_sentences(doc_text_2)[sent_2_num]

                    # write query-doc pair ids and texts
                    if args.has_labels:
                        f_id.write(
                            f'{query_id}\t{sent_id_1}\t{rank_1 + 1}\t{sent_id_2}\t{rank_2 + 1}\t{relevance}\n')
                    else:
                        f_id.write(
                            f'{query_id}\t{sent_id_1}\t{rank_1 + 1}\t{sent_id_2}\t{rank_2 + 1}\n')
                    f_text.write(
                        f'Query: {query_text} Document1: {entity_1} . '
                        f'{normalize_text(sent_1_text)} Document2: {entity_2} . '
                        f'{normalize_text(sent_2_text)} Relevant:\n')
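
# extract_entities (used above when args.ner is set) is also outside this
# excerpt. A minimal sketch, assuming a spaCy English pipeline is installed;
# the original code may use a different NER backend entirely.

import spacy

_nlp = spacy.load('en_core_web_sm')


def extract_entities(text):
    # return the surface forms of named entities detected in the claim
    return [ent.text for ent in _nlp(text).ents]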
def generate_data(args):
    queries = {}
    labels = {}
    evidences = {}
    docs = {}
    num_truncated = 0

    # read in dataset file and save queries and evidences to dicts
    with open(args.dataset_file, 'r', encoding='utf-8') as f:
        print('Reading FEVER dataset file...')
        for line in f:
            line_json = json.loads(line.strip())
            query_id = line_json['id']
            query = line_json['claim']
            queries[query_id] = query

            label = line_json['label']
            if label == 'SUPPORTS':
                labels[query_id] = 'true'
            elif label == 'REFUTES':
                labels[query_id] = 'false'
            else:  # label == 'NOT ENOUGH INFO'
                labels[query_id] = 'weak'

            annotators = []
            if label != 'NOT ENOUGH INFO':
                # no evidence set for NEI queries, will sample from run files later
                for annotator in line_json['evidence']:
                    evidence_set = []
                    for evidence in annotator:
                        evidence[2] = ftfy.fix_text(evidence[2])
                        docs[evidence[2]] = 'N/A'  # placeholder
                        evidence_set.append(
                            make_sentence_id(evidence[2], evidence[3]))
                    annotators.append(evidence_set)
            evidences[query_id] = annotators

    # samples evidence from pred_sent_ids
    def negative_sample(query_id, pred_sent_ids):
        neg_sent_ids = random.sample(pred_sent_ids,
                                     random.randint(1, args.max_evidences))
        for sent_id in neg_sent_ids:
            doc_id, _ = split_sentence_id(sent_id)
            docs[doc_id] = 'N/A'  # placeholder
        return [neg_sent_ids]

    # read in run file and sample run file ranking predictions for queries
    with open(args.run_file, 'r', encoding='utf-8') as f:
        print('Reading run file...')
        curr_query = None
        pred_sent_ids = []
        for line in f:
            query_id, sent_id, rank = line.strip().split('\t')
            query_id = int(query_id)
            # if we reach a new query in the run file, perform sampling for previous query if needed
            if query_id != curr_query:
                if curr_query is not None and len(evidences[curr_query]) == 0:
                    evidences[curr_query] = negative_sample(
                        curr_query, pred_sent_ids)
                curr_query = query_id
                pred_sent_ids.clear()
            if args.min_rank <= int(rank) <= args.max_rank:
                pred_sent_ids.append(sent_id)
        # handle the final query
        if len(evidences[curr_query]) == 0:
            evidences[curr_query] = negative_sample(curr_query, pred_sent_ids)

    # read through all wiki dump files and save doc text for involved docs
    print('Reading wiki pages...')
    for file in os.listdir(args.collection_folder):
        with open(os.path.join(args.collection_folder, file), 'r',
                  encoding='utf-8') as f:
            for line in f:
                line_json = json.loads(line.strip())
                if line_json['id'] in docs:
                    docs[line_json['id']] = line_json['lines']

    # write query-doc text pairs to files
    with open(args.output_id_file, 'w', encoding='utf-8') as f_id, \
            open(args.output_text_file, 'w', encoding='utf-8') as f_text:
        print('Writing query-doc pairs to files...')
        for query_id, query_text in queries.items():
            label = labels[query_id]
            for evidence_ids in evidences[query_id]:
                evidence_texts = []
                for evidence in evidence_ids:
                    # get specific sentence from within doc_text
                    doc_id, sent_num = split_sentence_id(evidence)
                    entity = doc_id.replace('_', ' ')  # prepend entity name to document text
                    doc_text = docs[doc_id]
                    sent_text, _ = extract_sentences(doc_text)[sent_num]
                    evidence_texts.append(
                        f'{normalize_text(entity)} . {normalize_text(sent_text)}')

                if args.format == 'concat':
                    evidence_ids_str = ' '.join(evidence_ids)
                    prefixed_evidence_texts = []
                    for i, evidence_text in enumerate(evidence_texts):
                        truncated_text, num_truncated = truncate(
                            query_text, evidence_text, args.max_evidences,
                            args.max_seq_len, num_truncated)
                        prefixed_evidence_texts.append(
                            f'sentence{i + 1}: {truncated_text}')
                    evidence_texts_str = ' '.join(prefixed_evidence_texts)
                    f_id.write(f'{query_id}\t{evidence_ids_str}\n')
                    f_text.write(
                        f'hypothesis: {query_text} {evidence_texts_str}\t{label}\n')
                else:  # args.format == 'agg'
                    for evidence_id, evidence_text in zip(evidence_ids,
                                                          evidence_texts):
                        f_id.write(f'{query_id}\t{evidence_id}\n')
                        f_text.write(
                            f'hypothesis: {query_text} premise: {evidence_text}\t{label}\n')

    print(f'Number of sentences truncated: {num_truncated}')
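
# For reference, the two output formats written above look roughly like this
# (claim and evidence text are illustrative placeholders):
#
#   concat -- one line per evidence set, sentences numbered and truncated:
#     hypothesis: <claim> sentence1: <entity . sentence> sentence2: <entity . sentence>\t<label>
#
#   agg -- one line per individual evidence sentence:
#     hypothesis: <claim> premise: <entity . sentence>\t<label>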
def generate_data(args):
    queries = {}
    labels = {}
    evidences = {}
    evidence_relevances = {}
    docs = {}
    num_truncated = 0

    # read in dataset file and save queries and evidences to dicts
    with open(args.dataset_file, 'r', encoding='utf-8') as f:
        print('Reading FEVER dataset file...')
        for line in f:
            line_json = json.loads(line.strip())
            query_id = line_json['id']
            query = line_json['claim']
            queries[query_id] = query

            label = line_json['label']
            if label == 'SUPPORTS':
                labels[query_id] = 'true'
            elif label == 'REFUTES':
                labels[query_id] = 'false'
            else:  # label == 'NOT ENOUGH INFO'
                labels[query_id] = 'weak'

            annotators = []
            if label != 'NOT ENOUGH INFO':
                # no evidence set for NEI queries, will sample from run files later
                for annotator in line_json['evidence']:
                    evidence_set = []
                    for evidence in annotator:
                        evidence[2] = ftfy.fix_text(evidence[2])
                        evidence_set.append(
                            make_sentence_id(evidence[2], evidence[3]))
                    annotators.append(evidence_set)
            else:
                annotators.append([])
            evidences[query_id] = annotators

    # for each evidence set, check if all gold evidences are in pred_sent_ids
    # and randomly insert if not present
    def generate_samples(query_id, pred_sent_ids):
        all_sent_ids = []
        all_relevances = []
        for true_evidence_set in evidences[query_id]:
            sent_ids = [evidence for evidence in pred_sent_ids]
            relevances = [
                int(evidence in true_evidence_set) for evidence in pred_sent_ids
            ]
            # randomly insert relevant evidences if query is not NEI
            # and not all true evidences are in sent_ids
            if len(true_evidence_set) != 0 and len(true_evidence_set) != sum(relevances):
                for evidence in true_evidence_set:
                    # stop inserting if all evidences are relevant
                    if sum(relevances) == len(relevances):
                        break
                    if evidence not in sent_ids:
                        doc_id, _ = split_sentence_id(evidence)
                        docs[doc_id] = 'N/A'  # placeholder
                        overwrite_index = random.choice([
                            i for i in range(len(relevances))
                            if relevances[i] == 0
                        ])
                        sent_ids[overwrite_index] = evidence
                        relevances[overwrite_index] = 1
            all_sent_ids.append(sent_ids)
            all_relevances.append(relevances)
        return all_sent_ids, all_relevances

    # read in run file and sample run file ranking predictions for queries
    with open(args.run_file, 'r', encoding='utf-8') as f:
        print('Reading run file...')
        curr_query = None
        pred_sent_ids = []
        for line in f:
            query_id, sent_id, rank = line.strip().split('\t')
            query_id = int(query_id)
            # if we reach a new query in the run file, perform sampling for previous query if needed
            if query_id != curr_query:
                if curr_query is not None:
                    all_sent_ids, all_relevances = generate_samples(
                        curr_query, pred_sent_ids)
                    evidences[curr_query] = all_sent_ids
                    evidence_relevances[curr_query] = all_relevances
                curr_query = query_id
                pred_sent_ids.clear()
            if int(rank) <= args.max_evidences:
                doc_id, _ = split_sentence_id(sent_id)
                docs[doc_id] = 'N/A'  # placeholder
                pred_sent_ids.append(sent_id)
        # handle the final query
        all_sent_ids, all_relevances = generate_samples(curr_query, pred_sent_ids)
        evidences[curr_query] = all_sent_ids
        evidence_relevances[curr_query] = all_relevances

    # read through all wiki dump files and save doc text for involved docs
    print('Reading wiki pages...')
    for file in os.listdir(args.collection_folder):
        with open(os.path.join(args.collection_folder, file), 'r',
                  encoding='utf-8') as f:
            for line in f:
                line_json = json.loads(line.strip())
                if line_json['id'] in docs:
                    docs[line_json['id']] = line_json['lines']

    # write query-doc text pairs to files
    with open(args.output_id_file, 'w', encoding='utf-8') as f_id, \
            open(args.output_text_file, 'w', encoding='utf-8') as f_text:
        print('Writing query-doc pairs to files...')
        for query_id, query_text in queries.items():
            label = labels[query_id]
            for evidence_ids, relevances in zip(evidences[query_id],
                                                evidence_relevances[query_id]):
                evidence_texts = []
                for evidence in evidence_ids:
                    # get specific sentence from within doc_text
                    doc_id, sent_num = split_sentence_id(evidence)
                    entity = doc_id.replace('_', ' ')  # prepend entity name to document text
                    doc_text = docs[doc_id]
                    sent_text, _ = extract_sentences(doc_text)[sent_num]
                    evidence_texts.append(
                        f'{normalize_text(entity)} . {normalize_text(sent_text)}')

                # format evidence ids and texts in proper format
                evidence_ids_str = ' '.join(evidence_ids)
                relevances_str = ','.join(
                    [str(relevance) for relevance in relevances])
                prefixed_evidence_texts = []
                for i, evidence_text in enumerate(evidence_texts):
                    truncated_text, num_truncated = truncate(
                        query_text, evidence_text, args.max_evidences,
                        args.max_seq_len, num_truncated)
                    prefixed_evidence_texts.append(
                        f'sentence{i + 1}: {truncated_text}')
                evidence_texts_str = ' '.join(prefixed_evidence_texts)

                f_id.write(f'{query_id}\t{evidence_ids_str}\t{relevances_str}\n')
                f_text.write(
                    f'hypothesis: {query_text} {evidence_texts_str}\t{label}\n')

    print(f'Number of sentences truncated: {num_truncated}')
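
# The id file written above stores a comma-separated 0/1 relevance vector per
# training example. A small helper like the following (hypothetical, not part
# of the original scripts) could read it back, e.g. to inspect how many gold
# sentences ended up in each example:

def read_relevance_vectors(id_file):
    # map query_id -> list of relevance vectors, one per written example
    vectors = {}
    with open(id_file, 'r', encoding='utf-8') as f:
        for line in f:
            query_id, _evidence_ids, relevances_str = line.rstrip('\n').split('\t')
            vectors.setdefault(int(query_id), []).append(
                [int(r) for r in relevances_str.split(',')])
    return vectors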