Example #1
 def run_with_reraise(fn, k, example_data, tensor_info):
   try:
     return fn(example_data, tensor_info)
   except Exception as e:  # pylint: disable=broad-except
     utils.reraise(
         e, f"Error while serializing feature `{k}`: `{tensor_info}`: ",
     )
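
For orientation, here is a minimal sketch of what a `reraise` helper like the one called above typically does, assuming a simplified signature with optional `prefix` and `suffix` arguments; this is illustrative, not the actual `tfds.core.utils` implementation:

def reraise(e, prefix=None, suffix=None):
  # Augment the exception message with surrounding context, then
  # re-raise the same exception type chained to the original.
  msg = f'{prefix or ""}{e}{suffix or ""}'
  # Works for exception types whose constructor accepts a single
  # message argument; the real helper is more careful than this.
  raise type(e)(msg) from e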
Example #2
 def run_with_reraise(fn, k, example_data, tensor_info):
     try:
         return fn(example_data, tensor_info)
     except Exception as e:  # pylint: disable=broad-except
         utils.reraise(
             e, "Error while serializing feature `{}`: `{}`: ".format(
                 k, tensor_info))
Example #3
    def set_file_format(
        self,
        file_format: Union[None, str, file_adapters.FileFormat],
    ) -> None:
        """Internal function to define the file format.

    The file format is set during `FileReaderBuilder.__init__`,
    not `DatasetInfo.__init__`.

    Args:
      file_format: The file format.
    """
        # If the file format isn't already set, fall back to `DEFAULT_FILE_FORMAT`.
        file_format = (
            file_format  # Format explicitly given: tfds.builder(..., file_format=x)
            or self.file_format  # Format restored from dataset_info.json
            or file_adapters.DEFAULT_FILE_FORMAT)
        try:
            new_file_format = file_adapters.FileFormat(file_format)
        except ValueError as e:
            all_values = [f.value for f in file_adapters.FileFormat]
            utils.reraise(e, suffix=f". Valid file formats: {all_values}")

        # Once the file format has been set, it must stay consistent.
        if self.file_format and self.file_format != new_file_format:
            raise ValueError(
                f"File format is already set to {self.file_format}. "
                f"Got {new_file_format}")
        self.as_proto.file_format = new_file_format.value
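
The `except ValueError` branch above appends the list of valid formats to the original message. The same pattern, shown standalone with plain `raise ... from` and assumed enum members:

import enum

class FileFormat(enum.Enum):
  # Assumed members, for illustration only.
  TFRECORD = 'tfrecord'
  RIEGELI = 'riegeli'

def parse_file_format(value):
  try:
    return FileFormat(value)
  except ValueError as e:
    all_values = [f.value for f in FileFormat]
    # e.g. "'csv' is not a valid FileFormat. Valid file formats: ['tfrecord', 'riegeli']"
    raise ValueError(f'{e}. Valid file formats: {all_values}') from e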
Example #4
    def _build_from_generator(
        self,
        split_name: str,
        generator: Iterable[KeyExample],
        path: type_utils.PathLike,
    ) -> _SplitInfoFuture:
        """Split generator for example generators.

    Args:
      split_name: str,
      generator: Iterable[KeyExample],
      path: type_utils.PathLike,

    Returns:
      future: The future containing the `tfds.core.SplitInfo`.
    """
        if self._max_examples_per_split is not None:
            logging.warning('Splits capped at %s examples max.',
                            self._max_examples_per_split)
            generator = itertools.islice(generator,
                                         self._max_examples_per_split)
            total_num_examples = self._max_examples_per_split
        else:
            # If dataset info has been pre-downloaded from the internet,
            # we can use the pre-computed number of examples for the progress bar.
            split_info = self._split_dict.get(split_name)
            if split_info and split_info.num_examples:
                total_num_examples = split_info.num_examples
            else:
                total_num_examples = None

        writer = tfrecords_writer.Writer(
            example_specs=self._features.get_serialized_info(),
            path=path,
            hash_salt=split_name,
            file_format=self._file_format,
        )
        for key, example in utils.tqdm(
                generator,
                desc=f'Generating {split_name} examples...',
                unit=' examples',
                total=total_num_examples,
                leave=False,
        ):
            try:
                example = self._features.encode_example(example)
            except Exception as e:  # pylint: disable=broad-except
                utils.reraise(e,
                              prefix=f'Failed to encode example:\n{example}\n')
            writer.write(key, example)
        shard_lengths, total_size = writer.finalize()

        split_info = splits_lib.SplitInfo(
            name=split_name,
            shard_lengths=shard_lengths,
            num_bytes=total_size,
        )
        return _SplitInfoFuture(lambda: split_info)
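
The method returns `_SplitInfoFuture(lambda: split_info)`. A minimal future-like wrapper that would satisfy this call site could look like the sketch below; the real class may carry more state:

class _SplitInfoFuture:
  """Future-like wrapper whose `result()` invokes the stored callable."""

  def __init__(self, callback):
    self._callback = callback

  def result(self):
    return self._callback()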
Example #5
 def encode_example(self, example_dict):
   """See base class for details."""
   example = {}
   for k, (feature, example_value) in utils.zip_dict(self._feature_dict,
                                                     example_dict):
     try:
       example[k] = feature.encode_example(example_value)
     except Exception as e:  # pylint: disable=broad-except
       utils.reraise(
           e, prefix=f'In <{feature.__class__.__name__}> with name "{k}":\n')
   return example
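
`utils.zip_dict` pairs up the values of several dicts by key. A hedged sketch of such a helper, assuming all dicts share exactly the same keys:

def zip_dict(*dicts):
  # Yield (key, (value_from_each_dict, ...)) for every key of the
  # first dict; assumes all dicts have identical key sets.
  for key in dicts[0]:
    yield key, tuple(d[key] for d in dicts)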
Example #6
 def encode_example(self, audio_or_path_or_fobj):
     if isinstance(audio_or_path_or_fobj, (np.ndarray, list)):
         return audio_or_path_or_fobj
     elif isinstance(audio_or_path_or_fobj, epath.PathLikeCls):
         filename = os.fspath(audio_or_path_or_fobj)
         file_format = self._file_format or filename.split('.')[-1]
         with tf.io.gfile.GFile(filename, 'rb') as audio_f:
             try:
                 return self._encode_file(audio_f, file_format)
             except Exception as e:  # pylint: disable=broad-except
                 utils.reraise(e, prefix=f'Error for {filename}: ')
     else:
         return self._encode_file(audio_or_path_or_fobj, self._file_format)
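
When a path is given, the format is taken from the feature if configured, otherwise inferred from the file extension. A standalone sketch of that fallback (helper name assumed):

import os
from typing import Optional

def infer_format(path, explicit_format: Optional[str] = None) -> str:
  # Prefer an explicitly configured format; otherwise fall back to the
  # extension, mirroring `self._file_format or filename.split('.')[-1]`.
  filename = os.fspath(path)
  return explicit_format or filename.split('.')[-1]

assert infer_format('speech.wav') == 'wav'
assert infer_format('speech.wav', explicit_format='flac') == 'flac'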
Example #7
@contextlib.contextmanager
def reraise_with_context(error_cls: Type[Exception]) -> Iterator[None]:
    """Contextmanager which reraises an exception with an additional message.

  Args:
    error_cls: The exception to be reraised.

  Yields:
    None.
  """
    # If current_context_msg exists, we are already within the scope of the
    # session contextmanager.
    if hasattr(context_holder, 'current_context_msg'):
        yield
        return

    context_holder.current_context_msg = ErrorContext()
    try:
        yield
    except error_cls as e:
        context_msg = '\n'.join(context_holder.current_context_msg.messages)
        utils.reraise(e, suffix=context_msg)
    finally:
        del context_holder.current_context_msg
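
The snippet relies on a `context_holder` object and an `ErrorContext` class defined elsewhere. A minimal stand-in consistent with how they are used above, assuming `context_holder` is thread-local state:

import threading

class ErrorContext:
  """Accumulates context messages to append to a re-raised error."""

  def __init__(self):
    self.messages = []

context_holder = threading.local()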
Example #8
  def _build_from_generator(
      self,
      split_name: str,
      generator: Iterable[KeyExample],
      filename_template: naming.ShardedFileTemplate,
      disable_shuffling: bool,
  ) -> _SplitInfoFuture:
    """Split generator for example generators.

    Args:
      split_name: Name of the split to generate.
      generator: Iterable yielding `(key, example)` pairs.
      filename_template: Template to format the filename for a shard.
      disable_shuffling: If True, examples are kept in generation order
        instead of being shuffled.

    Returns:
      future: The future containing the `tfds.core.SplitInfo`.
    """
    if self._max_examples_per_split is not None:
      logging.warning('Splits capped at %s examples max.',
                      self._max_examples_per_split)
      generator = itertools.islice(generator, self._max_examples_per_split)
      total_num_examples = self._max_examples_per_split
    else:
      # If dataset info has been pre-downloaded from the internet,
      # we can use the pre-computed number of examples for the progress bar.
      split_info = self._split_dict.get(split_name)
      if split_info and split_info.num_examples:
        total_num_examples = split_info.num_examples
      else:
        total_num_examples = None

    writer = writer_lib.Writer(
        serializer=example_serializer.ExampleSerializer(
            self._features.get_serialized_info()),
        filename_template=filename_template,
        hash_salt=split_name,
        disable_shuffling=disable_shuffling,
        # TODO(weide) remove this because it's already in filename_template?
        file_format=self._file_format,
    )
    for key, example in utils.tqdm(
        generator,
        desc=f'Generating {split_name} examples...',
        unit=' examples',
        total=total_num_examples,
        leave=False,
    ):
      try:
        example = self._features.encode_example(example)
      except Exception as e:  # pylint: disable=broad-except
        utils.reraise(e, prefix=f'Failed to encode example:\n{example}\n')
      writer.write(key, example)
    shard_lengths, total_size = writer.finalize()

    split_info = splits_lib.SplitInfo(
        name=split_name,
        shard_lengths=shard_lengths,
        num_bytes=total_size,
        filename_template=filename_template,
    )
    return _SplitInfoFuture(lambda: split_info)
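
Both `_build_from_generator` variants pass `hash_salt=split_name` to the writer. A plausible reading, sketched below rather than taken from the writer's code: examples are ordered by a salted hash of their key, giving a deterministic shuffle that differs per split.

import hashlib

def shuffle_key(example_key: str, hash_salt: str) -> bytes:
  # Sorting examples by this digest yields a reproducible pseudo-random
  # order; different salts (split names) give different orders.
  return hashlib.sha256((hash_salt + example_key).encode()).digest()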