# Example 1
  def _build_data_dir(self, given_data_dir):
    """Return the data directory for the current version.

    Args:
      given_data_dir: `Optional[str]`, root `data_dir` passed as
        `__init__` argument.

    Returns:
      data_dir_root: `str`, the root dir containing all datasets, downloads,...
      data_dir: `str`, the version data_dir
        (e.g. `<data_dir_root>/<ds_name>/<config>/<version>`)
    """
    builder_dir = self._relative_data_dir(with_version=False)
    version_dir = self._relative_data_dir(with_version=True)

    default_data_dir = constants.get_default_data_dir(
        given_data_dir=given_data_dir
    )
    all_data_dirs = constants.list_data_dirs(given_data_dir=given_data_dir)

    versions_found = set()
    dirs_with_requested_version = {}
    for root in all_data_dirs:
      # Collect every version of this dataset present under this root.
      versions_here = set(
          utils.version.list_all_versions(os.path.join(root, builder_dir))
      )
      versions_found |= versions_here
      # Remember roots that already contain the requested version.
      if self.version in versions_here:
        dirs_with_requested_version[root] = os.path.join(root, version_dir)

    num_found = len(dirs_with_requested_version)
    if num_found > 1:
      raise ValueError(
          "Dataset was found in more than one directory: {}. Please resolve "
          "the ambiguity by explicitly specifying `data_dir=`."
          "".format(dirs_with_requested_version.values())
      )
    if num_found == 1:
      # Exactly one existing copy of the requested version: reuse it.
      return next(iter(dirs_with_requested_version.items()))

    # Requested version absent everywhere: fall back to the default directory.
    data_dir = os.path.join(default_data_dir, version_dir)
    if versions_found:
      logging.warning(
          "Found a different version of the requested dataset:\n"
          "%s\n"
          "Using %s instead.",
          "\n".join(str(v) for v in sorted(versions_found)),
          data_dir
      )
    return default_data_dir, data_dir
# Example 2
def find_builder_dir(
    name: str,
    *,
    data_dir: Optional[str] = None,
) -> Optional[str]:
  """Search whether the given dataset is present on disk and return its path.

  Note:

   * If the dataset is present, but is legacy (no feature config file), None
     is returned.
   * If the config isn't specified, the function tries to infer the default
     config name from the original `DatasetBuilder`.
   * The function searches in all `data_dir` registered with
     `tfds.core.add_data_dir`. If the dataset exists in multiple dirs, an error
     is raised.

  Args:
    name: Builder name (e.g. `my_ds`, `my_ds/config`, `my_ds:1.2.0`,...)
    data_dir: Path where to search for the dataset
      (e.g. `~/tensorflow_datasets`).

  Returns:
    path: The dataset path found, or None if the dataset isn't found.
  """
  # Probe every registered data_dir and keep the ones holding the dataset.
  matches = []
  for candidate_dir in constants.list_data_dirs(given_data_dir=data_dir):
    found = _find_builder_dir_single_dir(name, data_dir=candidate_dir)
    if found:
      matches.append(found)

  if not matches:
    return None
  if len(matches) == 1:  # Unambiguous: a single location was found.
    return matches[0]
  # Rather than raising an error every time, we could potentially be smarter
  # and load the most recent version across all files, but should be
  # careful when a partial version is requested ('my_dataset:3.*.*').
  # Could add some `MultiDataDirManager` API:
  # ```
  # manager = MultiDataDirManager(given_data_dir=data_dir)
  # with manager.merge_data_dirs() as virtual_data_dir:
  #  virtual_builder_dir = _find_builder_dir(name, data_dir=virtual_data_dir)
  #  builder_dir = manager.resolve(virtual_builder_dir)
  # ```
  raise ValueError(
      f"Dataset {name} detected in multiple locations: {matches}. "
      "Please resolve the ambiguity by explicitly setting `data_dir=`."
  )
def builder_from_files(
    name: str,
    **builder_kwargs: Any,
) -> dataset_builder.DatasetBuilder:
    """Loads a `tfds.core.DatasetBuilder` from files, auto-inferring location.

    This function is similar to `tfds.builder` (same signature), but creates
    the `tfds.core.DatasetBuilder` directly from files, without loading
    the original generation source code.

    It does not support:

     * namespaces (e.g. 'kaggle:dataset')
     * config objects (`dataset/config` valid, but not `config=MyConfig()`)
     * `version='experimental_latest'`

    Args:
      name: Dataset name.
      **builder_kwargs: `tfds.core.DatasetBuilder` kwargs.

    Returns:
      builder: The loaded dataset builder.

    Raises:
      DatasetNotFoundError: If the dataset cannot be loaded.
    """
    # Locate an already-generated dataset on disk.
    builder_dir = _find_builder_dir(name, **builder_kwargs)
    if builder_dir is None:
        # Nothing found: report every directory that was searched.
        data_dirs = constants.list_data_dirs(
            given_data_dir=builder_kwargs.get('data_dir'))
        raise registered.DatasetNotFoundError(
            f'Could not find dataset files for: {name}. Make sure the dataset '
            f'has been generated in: {data_dirs}. If the dataset has configs, you '
            'might have to specify the config name.')
    return builder_from_directory(builder_dir)
# Example 4
def _find_builder_dir(name: str, **builder_kwargs: Any) -> Optional[str]:
    """Search whether the given dataset is present on disk and return its path.

    Note:

     * If the dataset is present, but is legacy (no feature config file), None
       is returned.
     * If the config isn't specified, the function tries to infer the default
       config name from the original `DatasetBuilder`.
     * The function searches in all `data_dir` registered with
       `tfds.core.add_data_dir`. If the dataset exists in multiple dirs, an
       error is raised.

    Args:
      name: Builder name (e.g. `my_ds`, `my_ds/config`, `my_ds:1.2.0`,...)
      **builder_kwargs: `tfds.core.DatasetBuilder` kwargs.

    Returns:
      path: The dataset path found, or None if the dataset isn't found.
    """
    # Normalize builder kwargs
    ns_name, ds_name, builder_kwargs = naming.parse_builder_name_kwargs(
        name, **builder_kwargs)
    version = builder_kwargs.pop('version', None)
    config = builder_kwargs.pop('config', None)
    data_dir = builder_kwargs.pop('data_dir', None)

    # The builder cannot be located from files when any of these is used:
    # * namespace
    # * version='experimental_latest'
    # * config objects (rather than `str`)
    # * custom DatasetBuilder.__init__ kwargs
    unsupported = (
        bool(ns_name)
        or version == 'experimental_latest'
        or isinstance(config, dataset_builder.BuilderConfig)
        or bool(builder_kwargs)
    )
    if unsupported:
        return None

    # Probe every registered data_dir and keep the ones holding the dataset.
    matches = []
    for candidate_dir in constants.list_data_dirs(given_data_dir=data_dir):
        found = _find_builder_dir_single_dir(
            ds_name,
            data_dir=candidate_dir,
            version_str=str(version) if version else None,
            config_name=config,
        )
        if found:
            matches.append(found)

    if not matches:
        return None
    if len(matches) == 1:  # Unambiguous: a single location was found.
        return matches[0]
    # Rather than raising an error every time, we could potentially be smarter
    # and load the most recent version across all files, but should be
    # careful when a partial version is requested ('my_dataset:3.*.*').
    # Could add some `MultiDataDirManager` API:
    # ```
    # manager = MultiDataDirManager(given_data_dir=data_dir)
    # with manager.merge_data_dirs() as virtual_data_dir:
    #  virtual_builder_dir = _find_builder_dir(name, data_dir=virtual_data_dir)
    #  builder_dir = manager.resolve(virtual_builder_dir)
    # ```
    raise ValueError(
        f'Dataset {name} detected in multiple locations: {matches}. '
        'Please resolve the ambiguity by explicitly setting `data_dir=`.')