def test_create_entry_from_json_min_answer(self):
  """Checks the entry built from the `_JSON_MIN_ANSWER` fixture.

  The fixture is expected to produce a "minimal" answer whose span text is
  "throat and upper breast", located in the passage at index 3.
  """
  example_json = _JSON_MIN_ANSWER

  # The expected context carries the passage markers generated by
  # TyDiTokenizer.get_passage_marker; this verifies that
  # preproc.create_entry_from_json inserts them.
  context_with_markers = (
      "\ue006 The zebra finch is the most common estrildid finch. "
      "The bird has been introduced to Puerto Rico. "
      "\ue007 The body temperature (as measured from the cloaca) "
      "of the zebra finch may vary from 38 to 44 °C. "
      "\ue008 The zebra finch was first collected in 1801 during "
      "Nicolas Baudin's expedition to Australia. It was described in "
      "1817 by Louis Jean Pierre Vieillot in his Nouveau Dictionnaire "
      "d'Histoire Naturelle. "
      "\ue009 Morphological differences between the subspecies. "
      "Males do not have the fine barring found on the throat and upper "
      "breast. "
      "\ue00a Symmetry of both plumage, like chest bands, and "
      "artificial features, like leg bands, are preferred by the female. "
      "\ue00b Nest predators of the zebra finch include the tiger "
      "snake.")

  entry = preproc.create_entry_from_json(
      example_json,
      tokenizer=make_tokenizer(),
      max_passages=45,
      max_position=45,
      fail_on_invalid=True)

  self.assertCreateEntryFromJsonResult(
      json_dict=example_json,
      result=entry,
      expected_context=context_with_markers,
      expected_answer_type="minimal",
      expected_passage_answer_index=3,
      expected_min_span_start=507,
      expected_min_span_end=530,
      expected_min_span_text="throat and upper breast")
def read_entries(input_jsonl_pattern, tokenizer, max_passages, max_position,
                 fail_on_invalid):
  """Reads TyDi QA examples from JSONL files.

  This is a generator: nothing is read (and no error is raised) until the
  caller starts iterating.

  Args:
    input_jsonl_pattern: Glob of the gzipped JSONL files to read.
    tokenizer: Used to create special marker symbols to insert into the text.
    max_passages: see FLAGS.max_passages.
    max_position: see FLAGS.max_position.
    fail_on_invalid: Immediately stop if an error is found?

  Yields:
    tuple:
      input_file: str
      line_no: int (1-based line number within `input_file`)
      tydi_entry: "TyDiEntry"s, dicts as returned by
        `create_entry_from_json`, one per line of the input JSONL files.
        When `fail_on_invalid` is false, a falsy entry is still yielded for
        an invalid example, so callers must check it.
      debug_info: Dict containing debugging data.

  Raises:
    ValueError: If no file matches `input_jsonl_pattern`, or if an invalid
      example is encountered while `fail_on_invalid` is true.
  """
  matches = tf.gfile.Glob(input_jsonl_pattern)
  if not matches:
    raise ValueError(f"No files matched: {input_jsonl_pattern}")
  for input_path in matches:
    # `GzipFile.close()` does not close a caller-supplied `fileobj`, so the
    # underlying file handle is managed by its own `with` to avoid leaking it.
    with tf.gfile.Open(input_path, "rb") as raw_file:
      with gzip.GzipFile(fileobj=raw_file) as input_file:  # pytype: disable=wrong-arg-types
        for line_no, line in enumerate(input_file, 1):
          # OrderedDict keeps the key order of the original JSON object.
          json_elem = json.loads(
              line, object_pairs_hook=collections.OrderedDict)
          entry = preproc.create_entry_from_json(
              json_elem,
              tokenizer,
              max_passages=max_passages,
              max_position=max_position,
              fail_on_invalid=fail_on_invalid)

          if not entry:
            tf.logging.info("Invalid Example %d", json_elem["example_id"])
            if fail_on_invalid:
              raise ValueError(
                  f"Invalid example at {input_path}:{line_no}")

          # Return a `debug_info` dict that methods throughout the codebase
          # append to with debugging information.
          debug_info = {"json": json_elem}
          yield input_path, line_no, entry, debug_info
def test_create_entry_from_json_no_answer(self):
  """Checks the entry built from the `_JSON_NO_ANSWER` fixture.

  With no answer present, the passage index and minimal span offsets are
  expected to be -1 and the minimal span text empty.
  """
  example_json = _JSON_NO_ANSWER

  # The expected context carries the passage markers generated by
  # TyDiTokenizer.get_passage_marker; this verifies that
  # preproc.create_entry_from_json inserts them.
  context_with_markers = (
      "\ue006 The zebra finch is the most common estrildid finch. "
      "\ue007 The body temperature may vary from 38 to 44 °C.")

  entry = preproc.create_entry_from_json(
      example_json,
      tokenizer=make_tokenizer(),
      max_passages=45,
      max_position=45,
      fail_on_invalid=True)

  self.assertCreateEntryFromJsonResult(
      json_dict=example_json,
      result=entry,
      expected_context=context_with_markers,
      expected_answer_type="passage",
      expected_passage_answer_index=-1,
      expected_min_span_start=-1,
      expected_min_span_end=-1,
      expected_min_span_text="")