def builder_cls(name: str) -> Type[dataset_builder.DatasetBuilder]:
  """Fetches a `tfds.core.DatasetBuilder` class by string name.

  Args:
    name: `str`, the registered name of the `DatasetBuilder` (the class name
      as camel or snake case: `MyDataset` or `my_dataset`).

  Returns:
    A `tfds.core.DatasetBuilder` class.

  Raises:
    DatasetNotFoundError: if `name` is unrecognized.
  """
  namespace, ds_name, extra_kwargs = naming.parse_builder_name_kwargs(name)
  # Only a bare dataset name is accepted here: no config/version suffix and
  # no namespace prefix.
  if extra_kwargs:
    raise ValueError(
        '`builder_cls` only accept the `dataset_name` without config, '
        f"version or arguments. Got: name='{name}', kwargs={extra_kwargs}")
  if namespace:
    raise ValueError(
        f'Namespaces not supported for `builder_cls`. Got: {namespace}')

  try:
    # Resolve among datasets whose code has been imported.
    imported = registered.imported_builder_cls(ds_name)
    return typing.cast(Type[dataset_builder.DatasetBuilder], imported)
  except registered.DatasetNotFoundError as e:
    # Re-raise with the list of available builders attached for a better
    # error message.
    _reraise_with_list_builders(e, ns_name=namespace, builder_name=ds_name)
def builder_cls(name: str) -> Type[dataset_builder.DatasetBuilder]:
  """Fetches a `tfds.core.DatasetBuilder` class by string name.

  Args:
    name: `str`, the registered name of the `DatasetBuilder` (the class name
      as camel or snake case: `MyDataset` or `my_dataset`).

  Returns:
    A `tfds.core.DatasetBuilder` class.

  Raises:
    DatasetNotFoundError: if `name` is unrecognized.
  """
  parsed_name, extra_kwargs = naming.parse_builder_name_kwargs(name)
  # Config/version/kwargs suffixes are rejected: only the bare dataset name
  # (optionally namespaced) may be passed to `builder_cls`.
  if extra_kwargs:
    raise ValueError(
        '`builder_cls` only accept the `dataset_name` without config, '
        f"version or arguments. Got: name='{name}', kwargs={extra_kwargs}")

  try:
    if not parsed_name.namespace:
      # Plain names resolve among datasets whose code has been imported.
      imported = registered.imported_builder_cls(str(parsed_name))
      return typing.cast(Type[dataset_builder.DatasetBuilder], imported)
    # `namespace:dataset` names are loaded from the community register.
    if not visibility.DatasetType.COMMUNITY_PUBLIC.is_available():
      raise ValueError(
          f'Cannot load {parsed_name} when community datasets are disabled'
      )
    return community.community_register.builder_cls(parsed_name)
  except registered.DatasetNotFoundError as e:
    # Attach the list of available builders to the error message.
    _reraise_with_list_builders(e, name=parsed_name)  # pytype: disable=bad-return-type
def from_tfds_name(
    cls,
    tfds_name: str,
    split_mapping: Optional[Mapping[str, str]] = None,
) -> "DatasetReference":
  """Returns the `DatasetReference` for the given TFDS dataset.

  Args:
    tfds_name: Name of the TFDS dataset, optionally carrying a version
      and/or config suffix (parsed by `naming.parse_builder_name_kwargs`).
    split_mapping: Optional mapping between split names, forwarded as-is to
      the `DatasetReference` constructor.

  Returns:
    A `DatasetReference` with the parsed dataset name, version and config.
  """
  parsed_name, builder_kwargs = naming.parse_builder_name_kwargs(tfds_name)
  # `.get` already defaults to None when the version/config was not part of
  # the name, so no separate None-initialization is needed.
  return cls(
      dataset_name=parsed_name.name,
      version=builder_kwargs.get("version"),
      config=builder_kwargs.get("config"),
      split_mapping=split_mapping)
def dataset_collection(
    name: str,
    loader_kwargs: Optional[Dict[str, Any]] = None,
) -> DatasetCollectionLoader:
  """Instantiates a DatasetCollectionLoader.

  Args:
    name: The name of the dataset collection to load.
    loader_kwargs: `dict` (optional), keyword arguments to be passed to the
      `tfds.load` function. Refer to `tfds.load` documentation for a
      comprehensive overview of the different loading options.

  Returns:
    A DatasetCollectionLoader object.

  Raises:
    DatasetCollectionNotFoundError if dataset collection not found in registry.
  """
  parsed_name, builder_kwargs = naming.parse_builder_name_kwargs(name)
  if not registered.is_dataset_collection(parsed_name.name):
    available_collections = registered.list_imported_dataset_collections()
    raise registered.DatasetCollectionNotFoundError(
        f'Dataset collection {name} not found. '
        f'Available dataset collections: {available_collections}')

  dataset_collection_cls = registered.imported_dataset_collection_cls(
      parsed_name.name)
  dataset_collection_cls = typing.cast(
      Type[dataset_collection_builder.DatasetCollection],
      dataset_collection_cls)
  collection = dataset_collection_cls()

  # `.get` defaults to None when no version was embedded in `name`, which
  # matches the original "no requested version" behavior.
  requested_version = builder_kwargs.get('version')

  return DatasetCollectionLoader(
      collection,
      requested_version=requested_version,
      loader_kwargs=loader_kwargs)
def test_parse_builder_name_kwargs(name, result):
  """Checks that parsing a builder name yields the expected components."""
  parsed = naming.parse_builder_name_kwargs(name)
  assert parsed == result
def _find_builder_dir(name: str, **builder_kwargs: Any) -> Optional[str]:
  """Search whether the given dataset is present on disk and return its path.

  Note:

  * If the dataset is present, but is legacy (no feature config file), None
    is returned.
  * If the config isn't specified, the function tries to infer the default
    config name from the original `DatasetBuilder`.
  * The function searches in all `data_dir` registered with
    `tfds.core.add_data_dir`. If the dataset exists in multiple dirs, an
    error is raised.

  Args:
    name: Builder name (e.g. `my_ds`, `my_ds/config`, `my_ds:1.2.0`,...)
    **builder_kwargs: `tfds.core.DatasetBuilder` kwargs.

  Returns:
    path: The dataset path found, or None if the dataset isn't found.
  """
  # Normalize builder kwargs.
  parsed_name, builder_kwargs = naming.parse_builder_name_kwargs(
      name, **builder_kwargs)
  version = builder_kwargs.pop('version', None)
  config = builder_kwargs.pop('config', None)
  data_dir = builder_kwargs.pop('data_dir', None)

  # Builder cannot be found on disk if it uses:
  # * namespace
  # * version='experimental_latest'
  # * config objects (rather than `str`)
  # * custom DatasetBuilder.__init__ kwargs
  cannot_be_on_disk = (
      parsed_name.namespace
      or version == 'experimental_latest'
      or isinstance(config, dataset_builder.BuilderConfig)
      or bool(builder_kwargs)
  )
  if cannot_be_on_disk:
    return None

  # Search the dataset across all registered data_dirs.
  version_str = str(version) if version else None
  candidates = []
  for root_dir in file_utils.list_data_dirs(given_data_dir=data_dir):
    found = _find_builder_dir_single_dir(
        parsed_name.name,
        data_dir=root_dir,
        version_str=version_str,
        config_name=config,
    )
    if found:
      candidates.append(found)

  if len(candidates) > 1:
    # Rather than raising error every time, we could potentially be smarter
    # and load the most recent version across all files, but should be
    # carefull when partial version is requested ('my_dataset:3.*.*').
    # Could add some `MultiDataDirManager` API:
    # ```
    # manager = MultiDataDirManager(given_data_dir=data_dir)
    # with manager.merge_data_dirs() as virtual_data_dir:
    #  virtual_builder_dir = _find_builder_dir(name, data_dir=virtual_data_dir)
    #  builder_dir = manager.resolve(virtual_builder_dir)
    # ```
    raise ValueError(
        f'Dataset {parsed_name} detected in multiple locations: {candidates}. '
        'Please resolve the ambiguity by explicitly setting `data_dir=`.')
  return candidates[0] if candidates else None
def builder(
    name: str,
    *,
    try_gcs: bool = False,
    **builder_kwargs: Any,
) -> dataset_builder.DatasetBuilder:
  """Fetches a `tfds.core.DatasetBuilder` by string name.

  Args:
    name: `str`, the registered name of the `DatasetBuilder` (the class name
      as camel or snake case: `MyDataset` or `my_dataset`). This can be
      either `'dataset_name'` or `'dataset_name/config_name'` for datasets
      with `BuilderConfig`s. As a convenience, this string may contain
      comma-separated keyword arguments for the builder. For example
      `'foo_bar/a=True,b=3'` would use the `FooBar` dataset passing the
      keyword arguments `a=True` and `b=3` (for builders with configs, it
      would be `'foo_bar/zoo/a=True,b=3'` to use the `'zoo'` config and pass
      to the builder keyword arguments `a=True` and `b=3`).
    try_gcs: `bool`, if True, tfds.load will see if the dataset exists on the
      public GCS bucket before building it locally.
    **builder_kwargs: `dict` of keyword arguments passed to the
      `tfds.core.DatasetBuilder`.

  Returns:
    A `tfds.core.DatasetBuilder`.

  Raises:
    DatasetNotFoundError: if `name` is unrecognized.
  """
  # 'kaggle:my_dataset:1.0.0' -> ('kaggle', 'my_dataset', {'version': '1.0.0'})
  ns_name, builder_name, builder_kwargs = naming.parse_builder_name_kwargs(
      name, **builder_kwargs)

  # `try_gcs` currently only supports non-community datasets.
  if try_gcs and not ns_name and gcs_utils.is_dataset_on_gcs(builder_name):
    data_dir = builder_kwargs.get('data_dir')
    if data_dir:
      raise ValueError(
          f'Cannot have both `try_gcs=True` and `data_dir={data_dir}` '
          'explicitly set')
    builder_kwargs['data_dir'] = gcs_utils.gcs_path('datasets')

  # Community datasets are not supported through this entry point.
  if ns_name:
    raise NotImplementedError

  # First check whether code exists or not (imported datasets).
  not_found_error = None
  try:
    cls = builder_cls(builder_name)
  except registered.DatasetNotFoundError as e:
    cls = None  # Class not found
    not_found_error = e  # Save the exception to eventually reraise

  # Eventually try loading from files first.
  if _try_load_from_files_first(cls, **builder_kwargs):
    try:
      return read_only_builder.builder_from_files(builder_name,
                                                  **builder_kwargs)
    except registered.DatasetNotFoundError:
      # Files not usable; fall through to building from source code below.
      pass

  # If code exists and loading from files was skipped (e.g. files not found),
  # load from the source code.
  if cls:
    with py_utils.try_reraise(
        prefix=f'Failed to construct dataset {name}: '):
      return cls(**builder_kwargs)  # pytype: disable=not-instantiable

  # If neither the code nor the files are found, raise DatasetNotFoundError.
  raise not_found_error