Exemplo n.º 1
0
def test_builder_code_not_found(code_builder: dataset_builder.DatasetBuilder):
  """Checks that loading falls back to the files when the code is missing."""
  ds_name = code_builder.name
  missing_data_dir = '/tmp/non-existing/tfds/dir'
  not_registered = registered.DatasetNotFoundError(ds_name)

  # Make `load.builder_cls` raise, emulating a dataset that isn't registered.
  with mock.patch.object(load, 'builder_cls', side_effect=not_registered):
    # The files exist but the code does not: the builder is read-only.
    from_files = load.builder(ds_name)
    assert isinstance(from_files, read_only_builder.ReadOnlyBuilder)
    load.load(ds_name, split=[])  # Dataset found -> no error

    if code_builder.builder_config:
      # Without the code, the default config is inferred from `.config/`.
      default_config = code_builder.BUILDER_CONFIGS[0]
      assert from_files.builder_config.name == default_config.name

      # Explicitly passing a config should work too.
      config_name = f'{code_builder.name}/{code_builder.builder_config.name}'
      from_files = load.builder(config_name)
      assert isinstance(from_files, read_only_builder.ReadOnlyBuilder)

    # Neither code nor files are found: DatasetNotFoundError is raised.
    with pytest.raises(registered.DatasetNotFoundError):
      load.builder(ds_name, data_dir=missing_data_dir)

    with pytest.raises(registered.DatasetNotFoundError):
      load.load(ds_name, split=[], data_dir=missing_data_dir)
Exemplo n.º 2
0
  def builder_cls(
      self,
      name: naming.DatasetName,
  ) -> Type[dataset_builder.DatasetBuilder]:
    """Returns the builder class of the given dataset.

    Args:
      name: the name and namespace of the dataset to load the builder class for.

    Returns:
      The builder class returned by the first register able to load `name`.

    Raises:
      DatasetNotFoundError: if none of the registers can load the dataset.
    """
    candidates = self._get_registers(name)

    # With a single register (the typical case), delegate directly so that
    # its own, more informative, exception is the one that propagates.
    if len(candidates) == 1:
      return candidates[0].builder_cls(name)

    # With multiple registers, the first one that finds the dataset wins.
    for candidate in candidates:
      try:
        found_cls = candidate.builder_cls(name)
      except registered.DatasetNotFoundError:
        continue  # Not in this register, try the next one.
      else:
        return found_cls

    raise registered.DatasetNotFoundError(
        f'Namespace {name.namespace} found, '
        f'but could not load dataset {name.name}.'
        f'{self._get_list_builders_context(name)}')
def test_builder_code_not_found(code_builder: dataset_builder.DatasetBuilder):
    """Checks that loading falls back to the files when the code is missing."""
    missing_data_dir = '/tmp/non-existing/tfds/dir'
    not_registered = registered.DatasetNotFoundError(code_builder.name)

    # Make `load.builder_cls` raise, emulating a dataset that isn't registered.
    with mock.patch.object(load, 'builder_cls', side_effect=not_registered):
        # Without the code, loading a dataset requires the explicit config
        # name, e.g. tfds.builder('ds/config')
        config_name = code_builder.name
        if code_builder.builder_config:
            config_name = f'{config_name}/{code_builder.builder_config.name}'

        # The files exist but the code does not: the builder is read-only.
        from_files = load.builder(config_name)
        assert isinstance(from_files, read_only_builder.ReadOnlyBuilder)
        load.load(config_name, split=[])  # Dataset found -> no error

        # Neither code nor files are found: DatasetNotFoundError is raised.
        with pytest.raises(registered.DatasetNotFoundError):
            load.builder(config_name, data_dir=missing_data_dir)

        with pytest.raises(registered.DatasetNotFoundError):
            load.load(config_name, split=[], data_dir=missing_data_dir)
Exemplo n.º 4
0
 def builder_cls(
     self,
     name: utils.DatasetName,
 ) -> Type[dataset_builder.DatasetBuilder]:
     """Always raises: builder classes cannot be loaded from this register.

     Args:
         name: Name of the requested dataset; only its namespace is checked.

     Raises:
         DatasetNotFoundError: If the namespace of `name` is not one of
             `self.namespaces`.
         NotImplementedError: If the namespace is known, as `builder_cls` is
             not supported for data_dir-based community datasets.
     """
     namespace_known = name.namespace in self.namespaces  # pylint: disable=unsupported-membership-test
     if namespace_known:
         raise NotImplementedError(
             'builder_cls does not support data_dir-based community datasets. Got: '
             f'{name}')
     raise registered.DatasetNotFoundError(
         f'Namespace {name.namespace} not found. Should be one of: '
         f'{sorted(self.namespaces)}')
Exemplo n.º 5
0
def _download_or_reuse_cache(
    name: utils.DatasetName,
    package_index: _PackageIndex,
) -> _InstalledPackage:
  """Returns the installed package for `name`, downloading it if needed.

  The lookup goes through three cases:

  * Installed locally (in the cache) -> reuse it.
  * Not installed but present in the package index -> install it.
  * Not present in the package index -> raise an error.

  Args:
    name: Dataset name to load.
    package_index: Index of all community datasets. It is refreshed when
      `name` is not in it.

  Returns:
    The installed dataset information.

  Raises:
    DatasetNotFoundError: If the dataset can't be found in the package index.
  """
  # TODO(tfds): To force a download even if file already present, we
  # should add a `ignore_cache=True` option in `tfds.load`. Or should always
  # try to download the file ?
  cached_package = _get_last_installed_version(name)
  if cached_package:
    # Already downloaded: nothing else to do.
    return cached_package

  # Not cached, so the package has to be located in the index first. The
  # index is refreshed once when it does not list the dataset.
  if name not in package_index:
    package_index.refresh()

  package = package_index.get(name)
  if package:
    # Found in the index: download and cache it.
    return _download_and_cache(package)

  # Still unknown after the refresh.
  raise registered.DatasetNotFoundError(
      f'Could not find dataset {name}: Dataset not found among the '
      f'{len(package_index)} datasets of the community index.'
  )
Exemplo n.º 6
0
  def builder(
      self,
      name: naming.DatasetName,
      **builder_kwargs: Any,
  ) -> dataset_builder.DatasetBuilder:
    """Loads the builder of the given dataset.

    Args:
      name: the name and namespace of the dataset to load the builder for.
      **builder_kwargs: forwarded to the `builder` method of the register.

    Returns:
      The builder returned by the single register of the namespace.

    Raises:
      ValueError: if the namespace has more than one register.
      DatasetNotFoundError: if the namespace has no register.
    """
    registers = self._get_registers(name)
    num_registers = len(registers)

    if num_registers > 1:
      raise ValueError(f'Namespace {name.namespace} has multiple registers! '
                       f'This should not happen! Registers: {registers}')

    # Exactly one register (the typical case): delegate to it directly so
    # that its more informative exceptions are the ones raised.
    if num_registers == 1:
      return registers[0].builder(name, **builder_kwargs)

    raise registered.DatasetNotFoundError(
        f'Namespace {name.namespace} found with {len(registers)} registers, '
        f'but could not load dataset {name.name}.')
Exemplo n.º 7
0
  def _get_registers(
      self, name: naming.DatasetName) -> List[register_base.BaseRegister]:
    """Returns all available registers for the namespace of `name`.

    Args:
      name: the dataset name; only its `namespace` attribute is used.

    Returns:
      The entry of `self.registers_per_namespace` for the namespace.

    Raises:
      DatasetNotFoundError: if the namespace is not found. The message lists
        the known namespaces and, when there is one, the closest match.
    """
    if self.has_namespace(name.namespace):
      return self.registers_per_namespace[name.namespace]

    # Unknown namespace: assemble the error message piece by piece.
    message_parts = [
        f'\nNamespace {name.namespace} not found. ',
        (f'Note that the namespace should be one of: '
         f'{sorted(self.registers_per_namespace.keys())}.\n'),
    ]
    # At most one suggestion is requested (n=1).
    close_matches = difflib.get_close_matches(
        name.namespace, self.registers_per_namespace, n=1)
    if close_matches:
      message_parts.append(
          f'Did you mean: {name.namespace} -> {close_matches[0]} ?\n')
    raise registered.DatasetNotFoundError(''.join(message_parts))
def builder_from_files(
    name: str,
    **builder_kwargs: Any,
) -> dataset_builder.DatasetBuilder:
    """Loads a `tfds.core.DatasetBuilder` from files, auto-inferring location.

    This function is similar to `tfds.builder` (same signature), but creates
    the `tfds.core.DatasetBuilder` directly from files, without loading
    the original generation source code.

    It does not support:

     * namespaces (e.g. 'kaggle:dataset')
     * config objects (`dataset/config` valid, but not `config=MyConfig()`)
     * `version='experimental_latest'`

    Args:
        name: Dataset name.
        **builder_kwargs: `tfds.core.DatasetBuilder` kwargs.

    Returns:
        builder: The loaded dataset builder.

    Raises:
        DatasetNotFoundError: If the dataset cannot be loaded.
    """
    found_dir = _find_builder_dir(name, **builder_kwargs)
    if found_dir is None:
        # Nothing generated on disk: report every directory that was eligible.
        data_dirs = constants.list_data_dirs(
            given_data_dir=builder_kwargs.get('data_dir'))
        raise registered.DatasetNotFoundError(
            f'Could not find dataset files for: {name}. Make sure the dataset '
            f'has been generated in: {data_dirs}. If the dataset has configs, you '
            'might have to specify the config name.')

    # A generated dataset was found on disk.
    return builder_from_directory(found_dir)