def read(self, input_file, sentence_id_file: str = None) -> List[QAFullExample]:
    """Load QA examples from *input_file*; evidence sentence ids may be
    overridden by the mapping in *sentence_id_file* (keyed by instance id)."""
    logger.info(f'Reading data from {input_file}')
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    def is_whitespace(ch):
        # 0x202F is the narrow no-break space.
        return ch in (" ", "\t", "\r", "\n") or ord(ch) == 0x202F

    if sentence_id_file is not None:
        with open(sentence_id_file, 'r') as f:
            sentence_ids = json.load(f)
    else:
        sentence_ids = None

    examples = []
    for instance in data:
        article = instance['article']
        data_id = instance['id']
        question = instance['question']
        answer = instance['answer']

        # Whitespace-tokenize the article while recording, for every
        # character, the index of the token it belongs to.
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for ch in article:
            if is_whitespace(ch):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(ch)
                else:
                    doc_tokens[-1] += ch
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        # Character-level sentence boundaries mapped to token spans.
        starts, ends = utils.split_sentence(article, self.sentence_tokenizer)
        sentence_span_list = [
            (char_to_word_offset[s], char_to_word_offset[e])
            for s, e in zip(starts, ends)
        ]

        # NOTE(review): when no sentence-id file is supplied, the raw instance
        # id is stored as the sentence id — confirm downstream consumers
        # expect this placeholder.
        sentence_id = instance['id'] if sentence_ids is None else sentence_ids[data_id]

        examples.append(QAFullExample(
            qas_id=data_id,
            question_text=question,
            doc_tokens=doc_tokens,
            sentence_span_list=sentence_span_list,
            sentence_id=sentence_id,
            is_impossible=answer
        ))
    return examples
def read(self, input_file):
    """Read a SQuAD json file into a list of SquadExample."""
    logger.info('Reading data set from {}...'.format(input_file))
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)

    def is_whitespace(ch):
        # 0x202F is the narrow no-break space.
        return ch in (" ", "\t", "\r", "\n") or ord(ch) == 0x202F

    letter_to_index = {'A': 0, 'B': 1, 'C': 2, 'D': 3}

    examples = []
    for instance in tqdm(input_data):
        passage = instance['article']
        article_id = instance['id']

        # Whitespace tokenization plus a char -> token index map.
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for ch in passage:
            if is_whitespace(ch):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(ch)
                else:
                    doc_tokens[-1] += ch
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        # Split context into sentences and express them as token spans.
        starts, ends = utils.split_sentence(passage, self.sentence_tokenizer)
        sentence_span_list = [
            (char_to_word_offset[s], char_to_word_offset[e])
            for s, e in zip(starts, ends)
        ]

        questions = instance['questions']
        answers = [letter_to_index[x] for x in instance['answers']]
        options = instance['options']

        # One example per question attached to this article.
        for q_id, (question, answer, option_list) in enumerate(zip(questions, answers, options)):
            examples.append(MultiChoiceFullExample(
                qas_id=f"{article_id}--{q_id}",
                question_text=question,
                options=option_list,
                doc_tokens=doc_tokens,
                sentence_span_list=sentence_span_list,
                answer=answer
            ))

    logger.info('Finish reading {} examples from {}'.format(len(examples), input_file))
    return examples
def read(self, input_file):
    """
    :param input_file: input file to load data. The format is in CoQA style
    """
    logger.info('Reading data set from {}...'.format(input_file))
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)['data']

    def is_whitespace(ch):
        # 0x202F is the narrow no-break space.
        return ch in (" ", "\t", "\r", "\n") or ord(ch) == 0x202F

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]

            # Whitespace tokenization with a char -> token index map.
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for ch in paragraph_text:
                if is_whitespace(ch):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(ch)
                    else:
                        doc_tokens[-1] += ch
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            # Split context into sentences, expressed as token spans.
            starts, ends = utils.split_sentence(paragraph_text, self.sentence_tokenizer)
            sentence_span_list = [
                (char_to_word_offset[s], char_to_word_offset[e])
                for s, e in zip(starts, ends)
            ]

            for qa in paragraph["qas"]:
                answer = qa["answers"][0]
                answer_offset = answer["answer_start"]
                answer_length = len(answer["text"])

                # Gold answer span in token coordinates.
                start_position = char_to_word_offset[answer_offset]
                end_position = char_to_word_offset[answer_offset + answer_length - 1]

                # The sentence containing the answer span is the evidence label.
                sentence_id = utils.find_evidence_sentence(
                    sentence_span_list, start_position, end_position)

                examples.append(SQuADFullExample(
                    qas_id=qa["id"],
                    question_text=qa["question"],
                    doc_tokens=doc_tokens,
                    sentence_span_list=sentence_span_list,
                    orig_answer_text="",
                    start_position=None,
                    end_position=None,
                    sentence_id=sentence_id,
                    is_impossible=-1,
                    ral_start_position=start_position,
                    ral_end_position=end_position))
    return examples
def read(self, input_file, dialog_turns: int = 2) -> List[QAFullExample]:
    """
    :param input_file: input file to load data. The format is in CoQA style
    :param dialog_turns: how many previous dialog turns are prepended to the
        current question as context.
    """
    logger.info('Reading data set from {}...'.format(input_file))
    logger.info('Read parameters:')
    logger.info('Dialog turns: {}'.format(dialog_turns))

    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)['data']

    def is_whitespace(ch):
        # 0x202F is the narrow no-break space.
        return ch in (" ", "\t", "\r", "\n") or ord(ch) == 0x202F

    examples = []
    for paragraph in input_data:
        paragraph_text = paragraph["story"]
        story_id = paragraph['id']

        # Whitespace tokenization with a char -> token index map.
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for ch in paragraph_text:
            if is_whitespace(ch):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(ch)
                else:
                    doc_tokens[-1] += ch
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        # Split context into sentences, expressed as token spans.
        starts, ends = utils.split_sentence(paragraph_text, self.sentence_tokenizer)
        sentence_span_list = [
            (char_to_word_offset[s], char_to_word_offset[e])
            for s, e in zip(starts, ends)
        ]

        # Per-sentence token lists. NOTE(review): the examples below store
        # these nested lists as doc_tokens — presumably downstream expects
        # sentence-grouped tokens here; verify against the consumer.
        doc_sentence_tokens = [doc_tokens[s:e + 1] for s, e in sentence_span_list]

        questions = paragraph['questions']
        answers = paragraph['answers']
        for i, (question, answer) in enumerate(zip(questions, answers)):
            question_text = question['input_text']

            # We are only concerned about questions with Yes/No as answers.
            answer_type = utils.normalize_answer(answer['input_text'])
            if answer_type not in ['yes', 'no']:
                continue
            answer_choice = 0 if answer_type == 'yes' else 1

            # Prepend up to `dialog_turns` previous QA pairs as dialog context.
            for j in range(dialog_turns):
                pre_idx = i - (j + 1)
                if pre_idx >= 0:
                    question_text = questions[pre_idx]['input_text'] + '<Q>' + \
                        answers[pre_idx]['input_text'] + '<A>' + question_text

            qas_id = story_id + '--' + str(i + 1)

            # Add rationale start and end as extra supervised label.
            rationale_start_position = char_to_word_offset[answer['span_start']]
            rationale_end_position = char_to_word_offset[answer['span_end'] - 1]
            sentence_id = utils.find_evidence_sentence(
                sentence_span_list, rationale_start_position, rationale_end_position)

            examples.append(QAFullExample(
                qas_id=qas_id,
                question_text=question_text,
                doc_tokens=doc_sentence_tokens,
                sentence_span_list=sentence_span_list,
                orig_answer_text="",
                start_position=None,
                end_position=None,
                sentence_id=sentence_id,
                is_impossible=answer_choice,
                ral_start_position=rationale_start_position,
                ral_end_position=rationale_end_position))
    return examples
def read(self, input_file):
    """Read a SQuAD json file into a list of SquadExample.

    Each item pairs a multi-document passage with a yes/no question; the
    yes/no answer is encoded into ``is_impossible`` (yes -> 0, no -> 1).

    :param input_file: json file with parallel lists under the keys
        'passages', 'ids', 'questions' and 'yes_no'.
    :raises RuntimeError: if a 'yes_no' value is neither 'yes' nor 'no'.
    """
    logger.info('Reading data set from {}...'.format(input_file))
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)

    def is_whitespace(ch):
        # 0x202F is the narrow no-break space.
        if ch == " " or ch == "\t" or ch == "\r" or ch == "\n" or ord(ch) == 0x202F:
            return True
        return False

    examples = []
    for articles, qas_id, question, yes_no in tqdm(
            zip(input_data['passages'], input_data['ids'],
                input_data['questions'], input_data['yes_no'])):
        # Read all passages. Fix: join once instead of repeated string
        # concatenation, which is quadratic in the number of documents.
        passage = ''.join(doc['text'] for doc in articles)

        # Whitespace tokenization with a char -> token index map.
        doc_tokens = []
        prev_is_whitespace = True
        char_to_word_offset = []
        for c in passage:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        # Split context into sentences, expressed as token spans.
        sentence_start_list, sentence_end_list = utils.split_sentence(passage, self.sentence_tokenizer)
        sentence_span_list = []
        for c_start, c_end in zip(sentence_start_list, sentence_end_list):
            t_start = char_to_word_offset[c_start]
            t_end = char_to_word_offset[c_end]
            sentence_span_list.append((t_start, t_end))

        # Encode the yes/no answer as a choice index.
        if yes_no == 'yes':
            is_impossible = 0
        elif yes_no == 'no':
            is_impossible = 1
        else:
            raise RuntimeError(f'Wrong yes_no type : {yes_no}')

        example = SQuADFullExample(
            qas_id=qas_id,
            question_text=question,
            doc_tokens=doc_tokens,
            sentence_span_list=sentence_span_list,
            orig_answer_text="",
            start_position=None,
            end_position=None,
            sentence_id=None,
            is_impossible=is_impossible,
            ral_start_position=None,
            ral_end_position=None)
        examples.append(example)

    logger.info('Finish reading {} examples from {}'.format(len(examples), input_file))
    return examples
def read(self, input_file):
    """Load claim-verification examples: each instance pairs a claim with a
    passage, an evidence sentence id, and a yes/no label."""
    logger.info(f'Reading data set from {input_file}...')
    with open(input_file, 'r') as f:
        data = json.load(f)

    def is_whitespace(ch):
        # 0x202F is the narrow no-break space.
        return ch in (" ", "\t", "\r", "\n") or ord(ch) == 0x202F

    examples = []
    for instance_id in tqdm(data, desc=f'Reading examples from {input_file}...', total=len(data)):
        instance = data[instance_id]
        claim = instance['claim']
        sentence_id = instance['evidence']
        label = instance['label'].lower()
        passage = instance['passage']

        # Whitespace tokenization with a char -> token index map.
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for ch in passage:
            if is_whitespace(ch):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(ch)
                else:
                    doc_tokens[-1] += ch
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        # Split context into sentences, expressed as token spans.
        starts, ends = utils.split_sentence(passage, self.sentence_tokenizer)
        sentence_span_list = [
            (char_to_word_offset[s], char_to_word_offset[e])
            for s, e in zip(starts, ends)
        ]

        # yes -> 0, no -> 1; anything else is a data error.
        if label == 'yes':
            answer_choice = 0
        elif label == 'no':
            answer_choice = 1
        else:
            raise RuntimeError(f'Wrong label for {label}')

        examples.append(QAFullExample(
            qas_id=instance_id,
            question_text=claim,
            doc_tokens=doc_tokens,
            sentence_span_list=sentence_span_list,
            orig_answer_text="",
            start_position=None,
            end_position=None,
            sentence_id=sentence_id,
            is_impossible=answer_choice,
            ral_start_position=None,
            ral_end_position=None
        ))
    return examples
def read(self, input_file, dialog_turns: int = 2, remove_evidence=False, remove_question=False,
         remove_passage=False, remove_dict=None):
    """
    :param input_file: input file to load data. The format is in CoQA style
    :param dialog_turns: Decide how many turns' questions and answers will be
        appended before current question.
    :param remove_evidence: if True, drop the gold evidence sentence from the
        passage (applied on non-training files only).
    :param remove_question: unused here; kept for interface compatibility.
    :param remove_passage: unused here; kept for interface compatibility.
    :param remove_dict: optional path to a json mapping story_id -> list of
        question ids (as strings) that should be skipped.
    """
    logger.info('Reading data set from {}...'.format(input_file))
    logger.info('Read parameters:')
    logger.info('Dialog turns: {}'.format(dialog_turns))
    logger.info('Remove evidence during test: {}'.format(remove_evidence))
    logger.info('Remove dict: {}'.format(remove_dict))

    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)['data']

    def is_whitespace(ch):
        # 0x202F is the narrow no-break space.
        if ch == " " or ch == "\t" or ch == "\r" or ch == "\n" or ord(ch) == 0x202F:
            return True
        return False

    if remove_dict is None:
        remove_dict = {}
    else:
        # Fix: use a context manager so the file handle is always closed
        # (the previous json.load(open(...)) leaked the handle).
        with open(remove_dict, 'r') as remove_f:
            remove_dict = json.load(remove_f)
        logger.info(len(remove_dict))

    # Tracks agreement between rule-generated sentence ids and rationale-derived ones.
    rule_labels_acc = utils.AverageMeter()

    examples = []
    for paragraph in input_data:
        paragraph_text = paragraph["story"]
        story_id = paragraph['id']

        # Whitespace tokenization with a char -> token index map.
        doc_tokens = []
        prev_is_whitespace = True
        char_to_word_offset = []
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        # Split context into sentences, expressed as token spans.
        sentence_start_list, sentence_end_list = utils.split_sentence(
            paragraph_text, self.sentence_tokenizer)
        sentence_span_list = []
        for c_start, c_end in zip(sentence_start_list, sentence_end_list):
            t_start = char_to_word_offset[c_start]
            t_end = char_to_word_offset[c_end]
            sentence_span_list.append((t_start, t_end))

        questions = paragraph['questions']
        answers = paragraph['answers']
        for i, (question, answer) in enumerate(zip(questions, answers)):
            question_text = question['input_text']

            # We are only concerned about questions with Yes/No as answers.
            answer_type = utils.normalize_answer(answer['input_text'])
            if answer_type not in ['yes', 'no']:
                continue

            # Skip questions explicitly blacklisted for this story.
            if story_id in remove_dict and str(i + 1) in remove_dict[story_id]:
                continue

            if answer_type == 'yes':
                answer_choice = 0
            else:
                answer_choice = 1

            # Prepend up to `dialog_turns` previous QA pairs as dialog context.
            for j in range(dialog_turns):
                pre_idx = i - (j + 1)
                if pre_idx >= 0:
                    question_text = questions[pre_idx][
                        'input_text'] + '<Q>' + answers[pre_idx][
                        'input_text'] + '<A>' + question_text

            qas_id = story_id + '--' + str(i + 1)

            # Add rationale start and end as extra supervised label.
            rationale_start_position = char_to_word_offset[answer['span_start']]
            rationale_end_position = char_to_word_offset[answer['span_end'] - 1]

            sentence_id = utils.find_evidence_sentence(
                sentence_span_list, rationale_start_position, rationale_end_position)

            # Remove evidence sentence for experiments while evaluation only.
            if remove_evidence and sentence_id != -1 and 'train' not in input_file:
                evi_token_s, evi_token_e = sentence_span_list[sentence_id]
                new_doc_tokens = doc_tokens[:evi_token_s] + doc_tokens[(evi_token_e + 1):]
                rationale_start_position = rationale_end_position = -1
                # Shift the spans that followed the removed sentence.
                reduce_offset = evi_token_e - evi_token_s + 1
                new_sentence_span_list = sentence_span_list[:sentence_id] + [
                    (s - reduce_offset, e - reduce_offset)
                    for s, e in sentence_span_list[(sentence_id + 1):]
                ]
                sentence_id = -1
            else:
                new_doc_tokens = doc_tokens
                new_sentence_span_list = sentence_span_list

            # Prefer rule-generated pseudo labels when available, tracking
            # how often they agree with the rationale-derived sentence id.
            if 'sentence_id' in question:
                pseudo_sentence_id = question['sentence_id']
                if pseudo_sentence_id == sentence_id:
                    rule_labels_acc.update(1)
                else:
                    rule_labels_acc.update(0)
                sentence_id = pseudo_sentence_id

            example = SQuADFullExample(
                qas_id=qas_id,
                question_text=question_text,
                doc_tokens=new_doc_tokens,
                sentence_span_list=new_sentence_span_list,
                orig_answer_text="",
                start_position=None,
                end_position=None,
                sentence_id=sentence_id,
                is_impossible=answer_choice,
                ral_start_position=rationale_start_position,
                ral_end_position=rationale_end_position)
            examples.append(example)

    if rule_labels_acc.count > 0:
        logger.info('Read labels generated by rules.')
        logger.info(f'Accuracy of labels: {rule_labels_acc.avg}')

    return examples
def read(self, input_file, read_state, sample_ratio: float = 0.5,
         dialog_turns: int = 2, extra_sen_file: str = None) -> List[QAFullExample]:
    """
    :param input_file: input file to load data. The format is in CoQA style
    :param read_state: If read extra sentences from CoQA dataset.
    :param sample_ratio: the ratio of negative sampling.
    :param dialog_turns: Decide how many turns' questions and answers will be
        appended before current question.
    :param extra_sen_file: If read_extra_self is False, then this parameter
        must be specified as the way path for extra sentence file.
    """
    logger.info('Reading data set from {}...'.format(input_file))
    logger.info('Read parameters:')
    logger.info('Dialog turns: {}'.format(dialog_turns))
    logger.info('Read state: {}'.format(read_state))
    logger.info('Sample ratio: {}'.format(sample_ratio))
    logger.info('Extra sentence file: {}'.format(extra_sen_file))
    assert read_state in ReadState

    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)['data']

    def is_whitespace(ch):
        # 0x202F is the narrow no-break space.
        return ch in (" ", "\t", "\r", "\n") or ord(ch) == 0x202F

    # Pool of candidate sentences used as negative samples.
    all_sentences = []
    if read_state == ReadState.SampleFromSelf:
        for paragraph in input_data:
            for sentence in self.sentence_tokenizer.tokenize(paragraph['story']):
                sentence_tokens = whitespace_tokenize(sentence)
                if sentence_tokens:
                    all_sentences.append(sentence_tokens)
                else:
                    logger.warning('Empty sentence!')
    elif read_state == ReadState.SampleFromExternal:
        # External sampling is not implemented here.
        pass
    logger.info('Read extra sentences: {}'.format(len(all_sentences)))

    examples = []
    for paragraph in input_data:
        paragraph_text = paragraph["story"]
        story_id = paragraph['id']

        # Whitespace tokenization with a char -> token index map.
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for ch in paragraph_text:
            if is_whitespace(ch):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(ch)
                else:
                    doc_tokens[-1] += ch
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        # Split context into sentences, expressed as token spans.
        starts, ends = utils.split_sentence(paragraph_text, self.sentence_tokenizer)
        sentence_span_list = [
            (char_to_word_offset[s], char_to_word_offset[e])
            for s, e in zip(starts, ends)
        ]

        # Per-sentence token lists, fed to the negative-sampling helper.
        doc_sentence_tokens = [doc_tokens[span[0]: (span[1] + 1)] for span in sentence_span_list]

        questions = paragraph['questions']
        answers = paragraph['answers']
        for i, (question, answer) in enumerate(zip(questions, answers)):
            question_text = question['input_text']

            # We are only concerned about questions with Yes/No as answers.
            answer_type = utils.normalize_answer(answer['input_text'])
            if answer_type not in ['yes', 'no']:
                continue
            answer_choice = 0 if answer_type == 'yes' else 1

            # Prepend up to `dialog_turns` previous QA pairs as dialog context.
            for j in range(dialog_turns):
                pre_idx = i - (j + 1)
                if pre_idx >= 0:
                    question_text = questions[pre_idx]['input_text'] + '<Q>' + answers[pre_idx][
                        'input_text'] + '<A>' + question_text

            qas_id = story_id + '--' + str(i + 1)

            # Add rationale start and end as extra supervised label.
            rationale_start_position = char_to_word_offset[answer['span_start']]
            rationale_end_position = char_to_word_offset[answer['span_end'] - 1]

            sentence_id = utils.find_evidence_sentence(
                sentence_span_list, rationale_start_position, rationale_end_position)

            # Add negative samples. NOTE(review): the helper rebinds
            # sentence_span_list, so later questions in this paragraph see the
            # updated spans — kept exactly as in the original.
            if read_state != ReadState.NoNegative:
                new_doc_tokens, sentence_label, new_sentence_id, sentence_span_list, orig_token_map = \
                    utils.generate_seq_with_negative_sample(
                        doc_sentence_tokens, all_sentences, sample_ratio, target_index=sentence_id)
                rationale_start_position = orig_token_map[rationale_start_position]
                rationale_end_position = orig_token_map[rationale_end_position]
            else:
                new_doc_tokens = doc_tokens
                sentence_label = [0] * len(sentence_span_list)
                new_sentence_id = sentence_id

            examples.append(QAFullExample(
                qas_id=qas_id,
                question_text=question_text,
                doc_tokens=new_doc_tokens,
                sentence_span_list=sentence_span_list,
                orig_answer_text="",
                start_position=None,
                end_position=None,
                sentence_id=new_sentence_id,
                is_impossible=answer_choice,
                ral_start_position=rationale_start_position,
                ral_end_position=rationale_end_position,
                meta_data={'sentence_label': sentence_label}))
    return examples
def read(self, input_file):
    """
    :param input_file: input file to load data. The format is in BoolQ style
        (one json object per line).
    """
    logger.info('Reading data set from {}...'.format(input_file))

    # BoolQ is jsonlines; assign sequential string ids while loading.
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = []
        for line in reader:
            item = json.loads(line)
            item['id'] = str(len(input_data))
            input_data.append(item)

    def is_whitespace(ch):
        # 0x202F is the narrow no-break space.
        return ch in (" ", "\t", "\r", "\n") or ord(ch) == 0x202F

    examples = []
    for paragraph in input_data:
        paragraph_text = paragraph["passage"]
        story_id = paragraph['id']

        # Whitespace tokenization with a char -> token index map.
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for ch in paragraph_text:
            if is_whitespace(ch):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(ch)
                else:
                    doc_tokens[-1] += ch
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        # Split context into sentences, expressed as token spans.
        starts, ends = utils.split_sentence(paragraph_text, self.sentence_tokenizer)
        sentence_span_list = [
            (char_to_word_offset[s], char_to_word_offset[e])
            for s, e in zip(starts, ends)
        ]

        question_text = paragraph['question']
        answer_text = ('yes' if paragraph['answer'] else 'no')

        # We are only concerned about questions with Yes/No as answers.
        answer_type = answer_text
        if answer_type not in ['yes', 'no']:
            continue
        answer_choice = 0 if answer_type == 'yes' else 1

        # Evidence sentence id is optional in BoolQ-style data.
        sentence_id = paragraph['sentence_id'] if 'sentence_id' in paragraph else -1

        examples.append(BoolQFullExample(
            qas_id=story_id,
            question_text=question_text,
            doc_tokens=doc_tokens,
            sentence_span_list=sentence_span_list,
            orig_answer_text="",
            start_position=None,
            end_position=None,
            sentence_id=sentence_id,
            is_impossible=answer_choice))
    return examples