def _build_data_dir(self, given_data_dir):
    """Returns the data directory for the current version.

    Args:
        given_data_dir: `Optional[str]`, root `data_dir` passed as `__init__`
            argument.

    Returns:
        data_dir_root: `str`, The root dir containing all datasets,
            downloads,...
        data_dir: `str`, The version data_dir (e.g.
            `<data_dir_root>/<ds_name>/<config>/<version>`)
    """
    builder_dir = self._relative_data_dir(with_version=False)
    version_dir = self._relative_data_dir(with_version=True)

    default_data_dir = constants.get_default_data_dir(
        given_data_dir=given_data_dir)

    # Maps each data_dir root containing the requested version to the full
    # version path under that root.
    found_version_dirs = {}
    # Every version encountered anywhere, used for the mismatch warning below.
    seen_versions = set()
    for root in constants.list_data_dirs(given_data_dir=given_data_dir):
        # Collect all versions present under this root.
        versions_here = set(
            utils.version.list_all_versions(os.path.join(root, builder_dir)))
        # Check for existence of the requested version in this root.
        if self.version in versions_here:
            found_version_dirs[root] = os.path.join(root, version_dir)
        seen_versions |= versions_here

    if len(found_version_dirs) > 1:
        raise ValueError(
            "Dataset was found in more than one directory: {}. Please resolve "
            "the ambiguity by explicitly specifying `data_dir=`."
            "".format(found_version_dirs.values())
        )
    if found_version_dirs:
        # Exactly one root contains the requested version: use it.
        return next(iter(found_version_dirs.items()))

    # Requested version absent everywhere: fall back to the default directory.
    data_dir = os.path.join(default_data_dir, version_dir)
    if seen_versions:
        logging.warning(
            "Found a different version of the requested dataset:\n"
            "%s\n"
            "Using %s instead.",
            "\n".join(str(v) for v in sorted(seen_versions)),
            data_dir
        )
    return default_data_dir, data_dir
def find_builder_dir(
    name: str,
    *,
    data_dir: Optional[str] = None,
) -> Optional[str]:
    """Searches whether the given dataset is present on disk and returns its path.

    Note:

    * If the dataset is present, but is legacy (no feature config file), None
      is returned.
    * If the config isn't specified, the function tries to infer the default
      config name from the original `DatasetBuilder`.
    * The function searches in all `data_dir` registered with
      `tfds.core.add_data_dir`. If the dataset exists in multiple dirs, an
      error is raised.

    Args:
        name: Builder name (e.g. `my_ds`, `my_ds/config`, `my_ds:1.2.0`,...)
        data_dir: Path where to search for the dataset
            (e.g. `~/tensorflow_datasets`).

    Returns:
        path: The dataset path found, or None if the dataset isn't found.
    """
    # Look for the dataset in every registered data_dir.
    matches = []
    for candidate_dir in constants.list_data_dirs(given_data_dir=data_dir):
        found = _find_builder_dir_single_dir(name, data_dir=candidate_dir)
        if found:
            matches.append(found)

    if not matches:
        return None
    if len(matches) > 1:
        # Rather than raising every time, we could potentially be smarter and
        # load the most recent version across all dirs, but should be careful
        # when a partial version is requested ('my_dataset:3.*.*').
        # Could add some `MultiDataDirManager` API:
        # ```
        # manager = MultiDataDirManager(given_data_dir=data_dir)
        # with manager.merge_data_dirs() as virtual_data_dir:
        #     virtual_builder_dir = _find_builder_dir(name, data_dir=virtual_data_dir)
        #     builder_dir = manager.resolve(virtual_builder_dir)
        # ```
        raise ValueError(
            f"Dataset {name} detected in multiple locations: {matches}. "
            "Please resolve the ambiguity by explicitly setting `data_dir=`."
        )
    return matches[0]  # Single match found.
def builder_from_files(
    name: str,
    **builder_kwargs: Any,
) -> dataset_builder.DatasetBuilder:
    """Loads a `tfds.core.DatasetBuilder` from files, auto-inferring location.

    This function is similar to `tfds.builder` (same signature), but creates
    the `tfds.core.DatasetBuilder` directly from files, without loading the
    original generation source code.

    It does not support:

    * namespaces (e.g. 'kaggle:dataset')
    * config objects (`dataset/config` valid, but not `config=MyConfig()`)
    * `version='experimental_latest'`

    Args:
        name: Dataset name.
        **builder_kwargs: `tfds.core.DatasetBuilder` kwargs.

    Returns:
        builder: The loaded dataset builder.

    Raises:
        DatasetNotFoundError: If the dataset cannot be loaded.
    """
    # Locate the generated dataset files on disk.
    builder_dir = _find_builder_dir(name, **builder_kwargs)
    if builder_dir is None:
        # Nothing on disk: report where we looked so the user can fix it.
        data_dirs = constants.list_data_dirs(
            given_data_dir=builder_kwargs.get('data_dir'))
        raise registered.DatasetNotFoundError(
            f'Could not find dataset files for: {name}. Make sure the dataset '
            f'has been generated in: {data_dirs}. If the dataset has configs, you '
            'might have to specify the config name.')
    return builder_from_directory(builder_dir)
def _find_builder_dir(name: str, **builder_kwargs: Any) -> Optional[str]:
    """Searches whether the given dataset is present on disk and returns its path.

    Note:

    * If the dataset is present, but is legacy (no feature config file), None
      is returned.
    * If the config isn't specified, the function tries to infer the default
      config name from the original `DatasetBuilder`.
    * The function searches in all `data_dir` registered with
      `tfds.core.add_data_dir`. If the dataset exists in multiple dirs, an
      error is raised.

    Args:
        name: Builder name (e.g. `my_ds`, `my_ds/config`, `my_ds:1.2.0`,...)
        **builder_kwargs: `tfds.core.DatasetBuilder` kwargs.

    Returns:
        path: The dataset path found, or None if the dataset isn't found.
    """
    # Normalize builder kwargs.
    ns_name, ds_name, builder_kwargs = naming.parse_builder_name_kwargs(
        name, **builder_kwargs)
    version = builder_kwargs.pop('version', None)
    config = builder_kwargs.pop('config', None)
    data_dir = builder_kwargs.pop('data_dir', None)

    # The builder cannot be located from files when the request uses:
    # * a namespace
    # * version='experimental_latest'
    # * a config object (rather than a `str`)
    # * custom DatasetBuilder.__init__ kwargs (anything left in builder_kwargs)
    unsupported = (
        ns_name
        or version == 'experimental_latest'
        or isinstance(config, dataset_builder.BuilderConfig)
        or builder_kwargs
    )
    if unsupported:
        return None

    # Look for the dataset in every registered data_dir.
    matches = []
    for candidate_dir in constants.list_data_dirs(given_data_dir=data_dir):
        found = _find_builder_dir_single_dir(
            ds_name,
            data_dir=candidate_dir,
            version_str=str(version) if version else None,
            config_name=config,
        )
        if found:
            matches.append(found)

    if not matches:
        return None
    if len(matches) > 1:
        # Rather than raising every time, we could potentially be smarter and
        # load the most recent version across all dirs, but should be careful
        # when a partial version is requested ('my_dataset:3.*.*').
        # Could add some `MultiDataDirManager` API:
        # ```
        # manager = MultiDataDirManager(given_data_dir=data_dir)
        # with manager.merge_data_dirs() as virtual_data_dir:
        #     virtual_builder_dir = _find_builder_dir(name, data_dir=virtual_data_dir)
        #     builder_dir = manager.resolve(virtual_builder_dir)
        # ```
        raise ValueError(
            f'Dataset {name} detected in multiple locations: {matches}. '
            'Please resolve the ambiguity by explicitly setting `data_dir=`.')
    return matches[0]  # Single match found.