Example #1
0
def builder_cls(name: str) -> Type[dataset_builder.DatasetBuilder]:
  """Fetches a `tfds.core.DatasetBuilder` class by string name.

  Args:
    name: `str`, the registered name of the `DatasetBuilder` (the class name
      as camel or snake case: `MyDataset` or `my_dataset`). Configs, versions
      and keyword arguments are rejected.

  Returns:
    A `tfds.core.DatasetBuilder` class.

  Raises:
    ValueError: if `name` carries kwargs or a namespace.
    DatasetNotFoundError: if `name` is unrecognized.
  """
  namespace, ds_name, extra_kwargs = naming.parse_builder_name_kwargs(name)
  # Only a bare dataset name is accepted here; anything else is a caller error.
  if extra_kwargs:
    raise ValueError(
        '`builder_cls` only accept the `dataset_name` without config, '
        f"version or arguments. Got: name='{name}', kwargs={extra_kwargs}")
  if namespace:
    raise ValueError(
        f'Namespaces not supported for `builder_cls`. Got: {namespace}')
  try:
    # Resolve among the datasets whose code has been imported.
    return typing.cast(
        Type[dataset_builder.DatasetBuilder],
        registered.imported_builder_cls(ds_name))
  except registered.DatasetNotFoundError as not_found:
    # Re-raise with the list of available builders for a friendlier message.
    _reraise_with_list_builders(
        not_found, ns_name=namespace, builder_name=ds_name)
Example #2
0
def builder_cls(name: str) -> Type[dataset_builder.DatasetBuilder]:
  """Fetches a `tfds.core.DatasetBuilder` class by string name.

  Args:
    name: `str`, the registered name of the `DatasetBuilder` (the class name
      as camel or snake case: `MyDataset` or `my_dataset`). May carry a
      namespace (`ns:dataset`), but no config/version/kwargs.

  Returns:
    A `tfds.core.DatasetBuilder` class.

  Raises:
    ValueError: if `name` carries kwargs, or names a community dataset while
      community datasets are disabled.
    DatasetNotFoundError: if `name` is unrecognized.
  """
  parsed_name, extra_kwargs = naming.parse_builder_name_kwargs(name)
  if extra_kwargs:
    raise ValueError(
        '`builder_cls` only accept the `dataset_name` without config, '
        f"version or arguments. Got: name='{name}', kwargs={extra_kwargs}")
  try:
    if not parsed_name.namespace:
      # Plain names resolve through the code-imported registry.
      return typing.cast(
          Type[dataset_builder.DatasetBuilder],
          registered.imported_builder_cls(str(parsed_name)))
    # `namespace:dataset` are loaded from the community register
    if not visibility.DatasetType.COMMUNITY_PUBLIC.is_available():
      raise ValueError(
          f'Cannot load {parsed_name} when community datasets are disabled'
      )
    return community.community_register.builder_cls(parsed_name)
  except registered.DatasetNotFoundError as e:
    # Re-raise with the list of available builders appended to the message.
    _reraise_with_list_builders(e, name=parsed_name)  # pytype: disable=bad-return-type
Example #3
0
 def from_tfds_name(
     cls,
     tfds_name: str,
     split_mapping: Optional[Mapping[str, str]] = None,
 ) -> "DatasetReference":
     """Returns the `DatasetReference` for the given TFDS dataset.

     Args:
       tfds_name: TFDS dataset name, optionally including config and/or
         version (e.g. `my_ds`, `my_ds/config:1.2.3`).
       split_mapping: optional mapping from split names used in this
         reference to the split names of the referenced dataset.

     Returns:
       A `DatasetReference` with the parsed dataset name, and the version and
       config extracted from `tfds_name` (each `None` when absent).
     """
     parsed_name, builder_kwargs = naming.parse_builder_name_kwargs(tfds_name)
     # `.get()` already returns `None` for missing keys, so no separate
     # defaulting step is needed (the previous `version, config = None, None`
     # assignments were dead code).
     return cls(dataset_name=parsed_name.name,
                version=builder_kwargs.get("version"),
                config=builder_kwargs.get("config"),
                split_mapping=split_mapping)
Example #4
0
File: load.py  Project: suvarnak/datasets
def dataset_collection(
    name: str,
    loader_kwargs: Optional[Dict[str, Any]] = None,
) -> DatasetCollectionLoader:
  """Instantiates a DatasetCollectionLoader.

  Args:
    name: The name of the dataset collection to load (optionally with a
      requested version, e.g. `collection:1.0.0`).
    loader_kwargs: `dict` (optional), keyword arguments to be passed to the
      `tfds.load` function. Refer to `tfds.load` documentation for a
      comprehensive overview of the different loading options.

  Returns:
    A DatasetCollectionLoader object.

  Raises:
    DatasetCollectionNotFoundError if dataset collection not found in registry.
  """
  parsed_name, builder_kwargs = naming.parse_builder_name_kwargs(name)
  if not registered.is_dataset_collection(parsed_name.name):
    available_collections = registered.list_imported_dataset_collections()
    raise registered.DatasetCollectionNotFoundError(
        f'Dataset collection {name} not found. '
        f'Available dataset collections: {available_collections}')

  # Look up and instantiate the registered collection class.
  collection_cls = typing.cast(
      Type[dataset_collection_builder.DatasetCollection],
      registered.imported_dataset_collection_cls(parsed_name.name))

  # `.get()` yields `None` when no version was requested in `name`.
  return DatasetCollectionLoader(
      collection_cls(),
      requested_version=builder_kwargs.get('version'),
      loader_kwargs=loader_kwargs)
Example #5
0
def test_parse_builder_name_kwargs(name, result):
  """Parsing `name` should yield exactly the expected (name, kwargs) result."""
  parsed = naming.parse_builder_name_kwargs(name)
  assert parsed == result
Example #6
0
def _find_builder_dir(name: str, **builder_kwargs: Any) -> Optional[str]:
    """Search whether the given dataset is present on disk and return its path.

    Note:

     * If the dataset is present, but is legacy (no feature config file), None
       is returned.
     * If the config isn't specified, the function tries to infer the default
       config name from the original `DatasetBuilder`.
     * The function searches in all `data_dir` registered with
       `tfds.core.add_data_dir`. If the dataset exists in multiple dirs, an
       error is raised.

    Args:
      name: Builder name (e.g. `my_ds`, `my_ds/config`, `my_ds:1.2.0`,...)
      **builder_kwargs: `tfds.core.DatasetBuilder` kwargs.

    Returns:
      path: The dataset path found, or None if the dataset isn't found.
    """
    # Normalize builder kwargs
    name, builder_kwargs = naming.parse_builder_name_kwargs(
        name, **builder_kwargs)
    version = builder_kwargs.pop('version', None)
    config = builder_kwargs.pop('config', None)
    data_dir = builder_kwargs.pop('data_dir', None)

    # An on-disk lookup is impossible for: namespaced datasets,
    # version='experimental_latest', config objects (rather than `str`), or
    # custom DatasetBuilder.__init__ kwargs.
    unresolvable_on_disk = (
        name.namespace
        or version == 'experimental_latest'
        or isinstance(config, dataset_builder.BuilderConfig)
        or bool(builder_kwargs)
    )
    if unresolvable_on_disk:
        return None

    # Search the dataset across all registered data_dirs.
    version_str = str(version) if version else None
    all_builder_dirs = [
        found_dir
        for candidate_dir in file_utils.list_data_dirs(given_data_dir=data_dir)
        if (found_dir := _find_builder_dir_single_dir(
            name.name,
            data_dir=candidate_dir,
            version_str=version_str,
            config_name=config,
        ))
    ]
    if not all_builder_dirs:
        return None
    if len(all_builder_dirs) > 1:
        # Rather than raising error every time, we could potentially be smarter
        # and load the most recent version across all files, but should be
        # careful when a partial version is requested ('my_dataset:3.*.*').
        # Could add some `MultiDataDirManager` API:
        # ```
        # manager = MultiDataDirManager(given_data_dir=data_dir)
        # with manager.merge_data_dirs() as virtual_data_dir:
        #  virtual_builder_dir = _find_builder_dir(name, data_dir=virtual_data_dir)
        #  builder_dir = manager.resolve(virtual_builder_dir)
        # ```
        raise ValueError(
            f'Dataset {name} detected in multiple locations: {all_builder_dirs}. '
            'Please resolve the ambiguity by explicitly setting `data_dir=`.')
    return all_builder_dirs[0]
Example #7
0
def builder(name: str,
            *,
            try_gcs: bool = False,
            **builder_kwargs: Any) -> dataset_builder.DatasetBuilder:
    """Fetches a `tfds.core.DatasetBuilder` by string name.

  Args:
    name: `str`, the registered name of the `DatasetBuilder` (the class name
      as camel or snake case: `MyDataset` or `my_dataset`).
      This can be either `'dataset_name'` or
      `'dataset_name/config_name'` for datasets with `BuilderConfig`s.
      As a convenience, this string may contain comma-separated keyword
      arguments for the builder. For example `'foo_bar/a=True,b=3'` would use
      the `FooBar` dataset passing the keyword arguments `a=True` and `b=3`
      (for builders with configs, it would be `'foo_bar/zoo/a=True,b=3'` to
      use the `'zoo'` config and pass to the builder keyword arguments `a=True`
      and `b=3`).
    try_gcs: `bool`, if True, tfds.load will see if the dataset exists on
      the public GCS bucket before building it locally.
    **builder_kwargs: `dict` of keyword arguments passed to the
      `tfds.core.DatasetBuilder`.

  Returns:
    A `tfds.core.DatasetBuilder`.

  Raises:
    DatasetNotFoundError: if `name` is unrecognized.
  """
    # 'kaggle:my_dataset:1.0.0' -> ('kaggle', 'my_dataset', {'version': '1.0.0'})
    ns_name, builder_name, builder_kwargs = naming.parse_builder_name_kwargs(
        name, **builder_kwargs)

    # `try_gcs` currently only support non-community datasets
    if try_gcs and not ns_name and gcs_utils.is_dataset_on_gcs(builder_name):
        if data_dir := builder_kwargs.get('data_dir'):
            raise ValueError(
                f'Cannot have both `try_gcs=True` and `data_dir={data_dir}` '
                'explicitly set')
        builder_kwargs['data_dir'] = gcs_utils.gcs_path('datasets')

    # Community datasets
    if ns_name:
        raise NotImplementedError

    # First check whether code exists or not (imported datasets)
    try:
        cls = builder_cls(builder_name)
    except registered.DatasetNotFoundError as e:
        cls = None  # Class not found
        not_found_error = e  # Save the exception to eventually reraise

    # Eventually try loading from files first
    if _try_load_from_files_first(cls, **builder_kwargs):
        try:
            return read_only_builder.builder_from_files(
                builder_name, **builder_kwargs)
        except registered.DatasetNotFoundError:
            # Files not found; fall through to building from source code.
            pass

    # If code exists and loading from files was skipped (e.g. files not found),
    # load from the source code.
    if cls:
        with py_utils.try_reraise(
                prefix=f'Failed to construct dataset {name}: '):
            return cls(**builder_kwargs)  # pytype: disable=not-instantiable

    # If neither the code nor the files are found, raise DatasetNotFoundError
    raise not_found_error