Example #1
    def process(self, entry, errors, debug_info=None):
        """Converts TyDi entries into model-ready feature dicts.

        Args:
          entry: "TyDi entries", dicts as returned by `create_entry_from_json`.
          errors: A list that this function appends to if errors are created. A
            non-empty list indicates problems.
          debug_info: A dict of information that may be useful during debugging.
            These elements should be used for logging and debugging only. For
            example, we log how the text was tokenized into WordPieces.

        Returns:
          A list of `collections.OrderedDict` feature dicts with the features
          needed for training or inference (depending on how `is_training` was
          set in the constructor).
        """
        if not debug_info:
            debug_info = {}
        # convert raw data into TyDiExample
        tydi_example = data.to_tydi_example(entry, self.is_training)
        debug_info["tydi_example"] = tydi_example

        # Convert the `TyDiExample` into `InputFeatures` (one per document span).
        input_features = preprocess.convert_single_example(
            tydi_example,
            tokenizer=self.tokenizer,
            is_training=self.is_training,
            max_question_length=self.max_question_length,
            max_seq_length=self.max_seq_length,
            doc_stride=self.doc_stride,
            include_unknowns=self.include_unknowns,
            errors=errors,
            debug_info=debug_info)
        # Flatten each `InputFeatures` object into an ordered dict of feature lists.
        for input_feature in input_features:
            input_feature.example_index = int(entry["id"])
            input_feature.unique_id = (input_feature.example_index +
                                       input_feature.doc_span_index)

            features = collections.OrderedDict()
            features["unique_ids"] = [input_feature.unique_id]
            features["example_index"] = [input_feature.example_index]
            features["input_ids"] = input_feature.input_ids
            features["input_mask"] = input_feature.input_mask
            features["segment_ids"] = input_feature.segment_ids
            features["language_id"] = [input_feature.language_id]

            if self.is_training:
                features["start_positions"] = [input_feature.start_position]
                features["end_positions"] = [input_feature.end_position]
                features["answer_types"] = [input_feature.answer_type]
            else:
                features["wp_start_offset"] = input_feature.wp_start_offset
                features["wp_end_offset"] = input_feature.wp_end_offset

            self.feature_lst.append(features)

        return self.feature_lst
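
The method returns plain Python feature dicts. If serialized `tf.train.Example` protos are needed downstream, a minimal sketch along the following lines could wrap each dict; `features_to_tf_example` is a hypothetical helper and assumes every feature value is a list of integers, which holds for the features built above.

import tensorflow as tf

def features_to_tf_example(features):
    # Hypothetical helper: wrap each integer list in an Int64List feature.
    feature_map = {
        name: tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
        for name, values in features.items()
    }
    return tf.train.Example(features=tf.train.Features(feature=feature_map))

# Usage sketch: serialize every feature dict returned by `process`.
# serialized = [features_to_tf_example(f).SerializeToString() for f in feature_lst]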
Example #2
def read_tydi_examples(input_file, is_training, max_passages, max_position,
                       fail_on_invalid, open_fn):
  """Read a TyDi json file into a list of `TyDiExample`.

  Delegates to `create_entry_from_json` to add special tokens to the
  input and handle character offset tracking.

  Args:
    input_file: Path or glob to input JSONL files to be read (possibly gzipped).
    is_training: Should we create training samples? (as opposed to eval
      samples).
    max_passages: See FLAGS.max_passages.
    max_position: See FLAGS.max_position.
    fail_on_invalid: Should we immediately stop processing if an error is
      encountered?
    open_fn: A function that returns a file object given a path. Usually
      `tf_io.gopen`; could be standard Python `open` if using this module
      outside TensorFlow.

  Yields:
    `TyDiExample`s
  """
  input_paths = glob.glob(input_file)
  if not input_paths:
    raise ValueError("No paths matching glob '{}'".format(input_file))

  non_valid_count = 0
  n = 0
  for path in input_paths:
    logging.info("Reading: %s", path)
    # Bind the file object to a new name so the `input_file` argument
    # (reused in the error message below) is not shadowed.
    with open_fn(path) as input_data:
      logging.info(path)
      for line in input_data:
        json_dict = json.loads(line, object_pairs_hook=collections.OrderedDict)
        entry = create_entry_from_json(
            json_dict,
            max_passages=max_passages,
            max_position=max_position,
            fail_on_invalid=fail_on_invalid)
        if entry:
          tydi_example = data.to_tydi_example(entry, is_training)
          n += 1
          yield tydi_example
        else:
          if fail_on_invalid:
            raise ValueError("Found invalid example.")
          non_valid_count += 1

  if n == 0:
    raise ValueError(
        "No surviving examples from input_file '{}'".format(input_file))

  logging.info("*** # surviving examples %d ***", n)
  logging.info("*** # pruned examples %d ***", non_valid_count)
def read_tydi_examples(input_file,
                       is_training=False,
                       max_passages=45,
                       max_position=45):
    """Simplified variant of `read_tydi_examples` above.

    Uses plain `open` (so the input must be uncompressed JSONL), drops
    `fail_on_invalid` and `open_fn`, and silently skips invalid entries.
    """
    input_paths = glob.glob(input_file)

    n = 0
    for path in input_paths:
        logging.info("Reading: %s", path)
        with open(path) as input_data:
            logging.info(path)
            for line in input_data:
                json_dict = json.loads(
                    line, object_pairs_hook=collections.OrderedDict)
                entry = create_entry_from_json(
                    json_dict,
                    max_passages=max_passages,
                    max_position=max_position)
                if entry:
                    tydi_example = data.to_tydi_example(entry, is_training)
                    n += 1
                    yield tydi_example
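
Because this simplified variant opens files with plain `open`, a gzipped TyDi file would need to be decompressed first. A minimal sketch with hypothetical paths:

import gzip
import shutil

# Decompress the gzipped JSONL once, then read it with the simplified reader.
with gzip.open("tydiqa-v1.0-dev.jsonl.gz", "rb") as src, \
        open("tydiqa-v1.0-dev.jsonl", "wb") as dst:
    shutil.copyfileobj(src, dst)

for example in read_tydi_examples("tydiqa-v1.0-dev.jsonl", is_training=False):
    pass  # process each TyDiExample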