def _generate_examples(self, data_path: str, mapping_path: str):
    """Yields the squad-like examples whose key is listed in the mapping file.

    Args:
      data_path: Path handed to `qa_utils.generate_squadlike_examples`.
      mapping_path: Path to a text file read line by line; each line is
        treated as one example key to keep.

    Yields:
      `(key, example)` pairs, in the order produced by
      `qa_utils.generate_squadlike_examples`, restricted to keys present in
      the mapping file.
    """
    with tf.io.gfile.GFile(mapping_path, "r") as mapping_file:
        # A set gives constant-time membership checks in the loop below.
        wanted_keys = set(mapping_file.read().splitlines())

    for key, example in qa_utils.generate_squadlike_examples(data_path):
        if key not in wanted_keys:
            continue
        yield key, example
def test_generate_squadlike_examples(self):
    """Checks the examples parsed from the fake xquad `translate-test.json`."""
    fixture_path = os.path.join(
        testing.test_utils.fake_examples_dir(), 'xquad',
        'translate-test.json')
    actual = list(qa_utils.generate_squadlike_examples(fixture_path))

    zurich_capital_context = (
        'Zurich is the largest city in Switzerland with over 400000 '
        'inhabitants. In spite of this, it is not the capital of '
        'Switzerland, which is located in Bern aka Bernie.')
    # NOTE(review): the 'Euriope' spelling presumably mirrors the fixture
    # file -- confirm against the fixture before "correcting" it here.
    zurich_cantons_context = (
        'Switzerland is the country in Euriope with 26 cantons. Zurich '
        'canton has the largest population of 1.5 million.')
    paris_context = (
        'Paris is the largest city in France with over 2 million '
        'inhabitants. It is the capital of France.')

    expected = [
        ('1', {
            'id': '1',
            'title': 'Zurich_Switzerland',
            'context': zurich_capital_context,
            'question': 'What is the capital of Switzerland?',
            'answers': {
                'answer_start': [1, 20, 29],
                'text': ['Zurich', 'Bern', 'Bernie'],
            },
        }),
        ('2', {
            'id': '2',
            'title': 'Zurich_Switzerland',
            'context': zurich_cantons_context,
            'question': 'How many cantons does Switzerland have?',
            'answers': {
                'answer_start': [8],
                'text': ['26'],
            },
        }),
        ('3', {
            'id': '3',
            'title': 'Paris_France',
            'context': paris_context,
            'question': 'What is the capital of France?',
            'answers': {
                'answer_start': [1, 7],
                'text': ['Paris', 'France'],
            },
        }),
    ]
    self.assertEqual(actual, expected)
def _generate_examples(self, squad_data_path, mapping_path=None,
                       qgen_data_path=None):
    r"""Yields question generation examples for the active builder config.

    For the "split_du" config only `squad_data_path` is read. For the
    "split_zhou" config all three paths are read. Any other config name
    yields nothing.

    Args:
      squad_data_path: Path to SQuAD json file.
      mapping_path: File with SQuAD map id for the example in Zhou et al.
        splits, one id per line.
      qgen_data_path: File with one example per line for the Zhou et al.
        splits, as ten tab-separated fields:
        "TokenizedInputSentence\tAnswerStartAndEndPosition\t
        ParsingTreeOfInputSentence\tPoSTagOfInputSentence\t
        NERTagsOfInputSentence\tTokenizedQuestion\t
        UntokenizedInputSentence\tAnswerStartCharIndex\tAnswer\t
        UntokenizedQuestion".

    Yields:
      key and example dict.
    """
    config_name = self.builder_config.name

    if config_name == "split_du":
        # The file format slightly differs from the original SQuAD JSON
        # format: the top-level JSON value is iterated directly as articles.
        with tf.io.gfile.GFile(squad_data_path) as squad_file:
            articles = json.load(squad_file)
        for article in articles:
            for paragraph in article["paragraphs"]:
                passage = paragraph["context"]
                for qa in paragraph["qas"]:
                    example_id = qa["id"]
                    # Only the first listed answer is used.
                    example = {
                        _CONTEXT_PASSAGE: passage,
                        _ANSWER: qa["answers"][0]["text"],
                        _QUESTION: qa["question"],
                    }
                    yield example_id, example

    elif config_name == "split_zhou":
        # Index every SQuAD example by id so each mapping line can be
        # resolved to its full context passage.
        squad_by_id = {
            key: example
            for key, example in qa_utils.generate_squadlike_examples(
                squad_data_path)
        }
        with tf.io.gfile.GFile(mapping_path, "r") as mapping_file, \
                tf.io.gfile.GFile(qgen_data_path, "r") as qgen_file:
            squad_ids = mapping_file.read().splitlines()
            qgen_lines = qgen_file.read().splitlines()
            # The two files are paired line by line; zip stops at the
            # shorter one.
            for index, (squad_id, qgen_line) in enumerate(
                    zip(squad_ids, qgen_lines)):
                # Exactly ten fields are required. Only the untokenized
                # sentence (7th), answer (9th) and untokenized question
                # (10th) are kept.
                (_, _, _, _, _, _, context_sentence, _, answer,
                 question) = qgen_line.split("\t")
                context_passage = squad_by_id[squad_id]["context"]
                example_key = str(index).zfill(7)
                yield example_key, {
                    _CONTEXT_SENTENCE: context_sentence,
                    _CONTEXT_PASSAGE: context_passage,
                    _ANSWER: answer,
                    _QUESTION: question,
                }
def _generate_examples(self, filepath):
    """Returns the examples produced by the shared squad-like reader.

    Args:
      filepath: Path forwarded unchanged to
        `qa_utils.generate_squadlike_examples`.

    Returns:
      Whatever `qa_utils.generate_squadlike_examples(filepath)` returns.
    """
    # Plain delegation: the helper is called eagerly and its result is
    # handed back untouched.
    examples = qa_utils.generate_squadlike_examples(filepath)
    return examples
def _generate_examples(self, filepath):
    """Returns examples from the reader matching the active builder config.

    Args:
      filepath: Path forwarded unchanged to the selected reader.

    Returns:
      The result of `qa_utils.generate_squadlike_examples(filepath)` when the
      builder config is named "v1.1", otherwise the result of
      `_generate_v2_examples(filepath)`.
    """
    # Only the selected reader name is evaluated, so the other one is never
    # looked up.
    read_examples = (
        qa_utils.generate_squadlike_examples
        if self.builder_config.name == "v1.1"
        else _generate_v2_examples
    )
    return read_examples(filepath)