Example #1
 def builder(
     self,
     ns_name: Optional[str],
     builder_name: str,
     **builder_kwargs: Any,
 ) -> dataset_builder.DatasetBuilder:
     """Returns the dataset builder."""
     if 'data_dir' in builder_kwargs:
         raise ValueError(
             '`data_dir` cannot be set for data_dir-based community datasets. '
             'Dataset should already be generated.')
     if ns_name is None:
         raise AssertionError(f'No namespace found: {builder_name}')
     if ns_name not in self._ns2data_dir:
         close_matches = difflib.get_close_matches(ns_name,
                                                   self._ns2data_dir,
                                                   n=1)
         hint = f'\nDid you mean: {close_matches[0]}' if close_matches else ''
         raise KeyError(
             f'Namespace `{ns_name}` for `{builder_name}` not found. '
             f'Should be one of {sorted(self._ns2data_dir)}{hint}')
     return read_only_builder.builder_from_files(
         builder_name,
         data_dir=self._ns2data_dir[ns_name],
         **builder_kwargs,
     )
Example #2
 def builder(
     self,
     name: utils.DatasetName,
     **builder_kwargs: Any,
 ) -> dataset_builder.DatasetBuilder:
     """Returns the dataset builder."""
     data_dir = builder_kwargs.pop('data_dir', None)
     if data_dir:
         raise ValueError(
             '`data_dir` cannot be set for data_dir-based community datasets. '
             f'Dataset should already be generated. Got: {data_dir}')
     if name.namespace is None:
         raise AssertionError(f'No namespace found: {name}')
     if name.namespace not in self._ns2data_dir:  # pylint: disable=unsupported-membership-test
         close_matches = difflib.get_close_matches(name.namespace,
                                                   self._ns2data_dir,
                                                   n=1)
         hint = f'\nDid you mean: {close_matches[0]}' if close_matches else ''
         raise KeyError(
             f'Namespace `{name.namespace}` for `{name}` not found. '
             f'Should be one of {sorted(self._ns2data_dir)}{hint}')
     return read_only_builder.builder_from_files(
         name.name,
         data_dir=self._ns2data_dir[name.namespace],
         **builder_kwargs,
     )
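Both variants above build their `KeyError` hint with `difflib.get_close_matches`. A minimal, self-contained sketch of that lookup-with-suggestion pattern, using a made-up namespace mapping rather than the real register:

import difflib

_ns2data_dir = {'kaggle': '/data/kaggle', 'huggingface': '/data/huggingface'}

def resolve_namespace(ns_name: str) -> str:
    """Returns the data dir for `ns_name`, adding a 'did you mean' hint on failure."""
    if ns_name not in _ns2data_dir:
        close_matches = difflib.get_close_matches(ns_name, _ns2data_dir, n=1)
        hint = f'\nDid you mean: {close_matches[0]}' if close_matches else ''
        raise KeyError(
            f'Namespace `{ns_name}` not found. '
            f'Should be one of {sorted(_ns2data_dir)}{hint}')
    return _ns2data_dir[ns_name]

# resolve_namespace('kagle') raises a KeyError with the hint 'Did you mean: kaggle'.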
Example #3
def test_builder_from_files_multi_dir(
    code_builder: dataset_builder.DatasetBuilder,
    tmp_path: pathlib.Path,
):
  some_dir = tmp_path / 'other'
  some_dir.mkdir()

  builder = read_only_builder.builder_from_files(
      code_builder.name,
      data_dir=[
          code_builder._data_dir_root, some_dir, '/tmp/non-existing-dir/'
      ],
  )
  assert builder.name == code_builder.name
  assert builder.data_dir == code_builder.data_dir
Example #4
    def test_metadata(self):
        with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
            builder = RandomShapedImageGenerator(data_dir=tmp_dir)
            builder.download_and_prepare()
            # Metadata should have been created
            self.assertEqual(builder.info.metadata, {"some_key": 123})

            # Metadata should have been restored
            builder2 = RandomShapedImageGenerator(data_dir=tmp_dir)
            self.assertEqual(builder2.info.metadata, {"some_key": 123})

            # Metadata should have been restored even if the builder code was not
            # available and we restored from files.
            builder3 = read_only_builder.builder_from_files(
                builder.name,
                data_dir=tmp_dir,
            )
            self.assertEqual(builder3.info.metadata, {"some_key": 123})
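Example #4 exercises `RandomShapedImageGenerator`, which is not shown here. For context, a minimal sketch of a builder that populates `info.metadata` in the same way (class name, feature schema and split layout are illustrative, not the actual test fixture):

import tensorflow as tf
import tensorflow_datasets as tfds


class MetadataDemoBuilder(tfds.core.GeneratorBasedBuilder):
    """Toy builder whose metadata is persisted next to the generated data."""

    VERSION = tfds.core.Version('1.0.0')

    def _info(self) -> tfds.core.DatasetInfo:
        return tfds.core.DatasetInfo(
            builder=self,
            features=tfds.features.FeaturesDict({'value': tf.int64}),
            # An (initially empty) MetadataDict is serialized with the dataset.
            metadata=tfds.core.MetadataDict(),
        )

    def _split_generators(self, dl_manager):
        # Record metadata during generation; it is restored on later loads,
        # including loads through `read_only_builder.builder_from_files`.
        self.info.metadata['some_key'] = 123
        return {'train': self._generate_examples()}

    def _generate_examples(self):
        for i in range(3):
            yield i, {'value': i}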
Example #5
def builder(name: str,
            *,
            try_gcs: bool = False,
            **builder_kwargs: Any) -> dataset_builder.DatasetBuilder:
    """Fetches a `tfds.core.DatasetBuilder` by string name.

    Args:
      name: `str`, the registered name of the `DatasetBuilder` (the class name
        as camel or snake case: `MyDataset` or `my_dataset`).
        This can be either `'dataset_name'` or
        `'dataset_name/config_name'` for datasets with `BuilderConfig`s.
        As a convenience, this string may contain comma-separated keyword
        arguments for the builder. For example `'foo_bar/a=True,b=3'` would use
        the `FooBar` dataset passing the keyword arguments `a=True` and `b=3`
        (for builders with configs, it would be `'foo_bar/zoo/a=True,b=3'` to
        use the `'zoo'` config and pass to the builder keyword arguments `a=True`
        and `b=3`).
      try_gcs: `bool`, if True, tfds.load will see if the dataset exists on
        the public GCS bucket before building it locally.
      **builder_kwargs: `dict` of keyword arguments passed to the
        `tfds.core.DatasetBuilder`.

    Returns:
      A `tfds.core.DatasetBuilder`.

    Raises:
      DatasetNotFoundError: if `name` is unrecognized.
    """
    # 'kaggle:my_dataset:1.0.0' -> ('kaggle', 'my_dataset', {'version': '1.0.0'})
    ns_name, builder_name, builder_kwargs = naming.parse_builder_name_kwargs(
        name, **builder_kwargs)

    # `try_gcs` currently only supports non-community datasets
    if try_gcs and not ns_name and gcs_utils.is_dataset_on_gcs(builder_name):
        data_dir = builder_kwargs.get('data_dir')
        if data_dir:
            raise ValueError(
                f'Cannot have both `try_gcs=True` and `data_dir={data_dir}` '
                'explicitly set')
        builder_kwargs['data_dir'] = gcs_utils.gcs_path('datasets')

    # Community datasets
    if ns_name:
        raise NotImplementedError

    # First check whether code exists or not (imported datasets)
    try:
        cls = builder_cls(builder_name)
    except registered.DatasetNotFoundError as e:
        cls = None  # Class not found
        not_found_error = e  # Save the exception to reraise later if needed

    # When applicable, try loading from files first
    if _try_load_from_files_first(cls, **builder_kwargs):
        try:
            b = read_only_builder.builder_from_files(builder_name,
                                                     **builder_kwargs)
            return b
        except registered.DatasetNotFoundError as e:
            pass

    # If code exists and loading from files was skipped (e.g. files not found),
    # load from the source code.
    if cls:
        with py_utils.try_reraise(
                prefix=f'Failed to construct dataset {name}: '):
            return cls(**builder_kwargs)  # pytype: disable=not-instantiable

    # If neither the code nor the files are found, raise DatasetNotFoundError
    raise not_found_error
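For reference, typical calls into this entry point (assuming TFDS is installed and the named datasets are registered; the dataset and config names below are just common examples):

import tensorflow_datasets as tfds

# Plain registered name.
builder = tfds.builder('mnist')

# Dataset with a config, as described in the docstring.
builder = tfds.builder('wmt14_translate/de-en')

# Extra builder keyword arguments, e.g. pinning a version.
builder = tfds.builder('mnist', version='3.0.1')

# Reuse data already prepared on the public GCS bucket when available.
builder = tfds.builder('mnist', try_gcs=True)
builder.download_and_prepare()
ds = builder.as_dataset(split='train')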