def _find_builder_dir_single_dir(
    builder_name: str,
    *,
    data_dir: str,
    config_name: Optional[str] = None,
    version_str: Optional[str] = None,
) -> Optional[str]:
  """Same as `find_builder_dir` but requires an explicit dir."""
  # Construct the `ds_name/config/` path
  builder_dir = os.path.join(data_dir, builder_name)
  if not config_name:
    # If the BuilderConfig is not specified:
    # * Either the dataset doesn't have a config
    # * Or the default config should be used
    # Currently, in order to infer the default config, we still rely on the
    # code.
    # TODO(tfds): How to avoid code dependency and automatically infer the
    # config existence and name?
    config_name = _get_default_config_name(builder_dir, builder_name)

  # If there is a config (explicitly given or default), append it to the path.
  if config_name:
    builder_dir = os.path.join(builder_dir, config_name)

  # Extract the version
  version_str = _get_version_str(builder_dir, requested_version=version_str)

  if not version_str:  # Version not given or found
    return None

  builder_dir = os.path.join(builder_dir, version_str)

  # Check for builder dir existence
  try:
    if not tf.io.gfile.exists(builder_dir):
      return None
  except tf.errors.PermissionDeniedError:
    return None

  # Backward compatibility: in order to be a valid ReadOnlyBuilder, the folder
  # has to contain the feature configuration.
  if not tf.io.gfile.exists(feature_lib.make_config_path(builder_dir)):
    return None
  return builder_dir
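

# Illustrative usage sketch, not part of the original module: how the
# explicit-kwargs variant above might be called. The dataset name, config,
# version and data_dir are all hypothetical.
def _demo_find_builder_dir_single_dir() -> None:
  builder_dir = _find_builder_dir_single_dir(
      "my_ds",
      data_dir="/data/tensorflow_datasets",
      config_name="my_config",  # Optional: default config inferred if omitted.
      version_str="1.0.0",  # Optional: resolved by `_get_version_str` if omitted.
  )
  # Expected result: "/data/tensorflow_datasets/my_ds/my_config/1.0.0", or
  # None if that directory (or its feature configuration file) is missing or
  # unreadable.
  print(builder_dir)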
def _find_builder_dir_single_dir(
    name: str,
    *,
    data_dir: str,
) -> Optional[str]:
  """Same as `find_builder_dir` but requires an explicit dir."""
  builder_name, builder_kwargs = _dataset_name_and_kwargs_from_name_str(name)
  config_name = builder_kwargs.pop("config", None)
  version_str = builder_kwargs.pop("version", None)
  if builder_kwargs:
    # Datasets with additional kwargs require the original builder code.
    return None

  # Construct the `ds_name/config/` path
  builder_dir = os.path.join(data_dir, builder_name)
  if not config_name:
    # If the BuilderConfig is not specified:
    # * Either the dataset doesn't have a config
    # * Or the default config should be used
    # Currently, in order to infer the default config, we still rely on the
    # code.
    # TODO(tfds): How to avoid code dependency and automatically infer the
    # config existence and name?
    config_name = _get_default_config_name(builder_name)

  # If there is a config (explicitly given or default), append it to the path.
  if config_name:
    builder_dir = os.path.join(builder_dir, config_name)

  # Extract the version
  version_str = _get_version_str(builder_dir, requested_version=version_str)

  if not version_str:  # Version not given or found
    return None

  builder_dir = os.path.join(builder_dir, version_str)

  # Check for builder dir existence
  if not tf.io.gfile.exists(builder_dir):
    return None

  # Backward compatibility: in order to be a valid ReadOnlyBuilder, the folder
  # has to contain the feature configuration.
  if not tf.io.gfile.exists(feature_lib.make_config_path(builder_dir)):
    return None
  return builder_dir
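

# Illustrative sketch, not part of the original module: how the name string
# is expected to decompose in the variant above. The exact return shape of
# `_dataset_name_and_kwargs_from_name_str` is an assumption here.
def _demo_name_parsing() -> None:
  builder_name, builder_kwargs = _dataset_name_and_kwargs_from_name_str(
      "my_ds/my_config:1.2.0")
  # Assumed decomposition:
  #   builder_name == "my_ds"
  #   builder_kwargs == {"config": "my_config", "version": "1.2.0"}
  # Any leftover kwarg makes the lookup return None, since such datasets
  # require the original builder code.
  print(builder_name, builder_kwargs)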
def read_from_directory(self, dataset_info_dir):
  """Update DatasetInfo from the JSON file in `dataset_info_dir`.

  This function updates all the dynamically generated fields (num_examples,
  hash, time of creation,...) of the DatasetInfo.

  This will overwrite all previous metadata.

  Args:
    dataset_info_dir: `str` The directory containing the metadata file. This
      should be the root directory of a specific dataset version.

  Raises:
    FileNotFoundError: If the file can't be found.
  """
  logging.info("Load dataset info from %s", dataset_info_dir)

  json_filename = self._dataset_info_path(dataset_info_dir)
  if not tf.io.gfile.exists(json_filename):
    raise FileNotFoundError(
        "Tried to load `DatasetInfo` from a directory which does not exist "
        "or does not contain `dataset_info.json`. Please delete the "
        f"directory `{dataset_info_dir}` if you are trying to re-generate "
        "the dataset.")

  # Load the metadata from disk
  parsed_proto = read_from_json(json_filename)

  # Update splits
  split_dict = splits_lib.SplitDict.from_proto(self.name, parsed_proto.splits)
  self.set_splits(split_dict)

  # Restore the feature metadata (vocabulary, labels names,...)
  if self.features:
    self.features.load_metadata(dataset_info_dir)
  # For `ReadOnlyBuilder`, reconstruct the features from the config.
  elif tf.io.gfile.exists(feature_lib.make_config_path(dataset_info_dir)):
    self._features = feature_lib.FeatureConnector.from_config(
        dataset_info_dir)

  if self.metadata is not None:
    self.metadata.load_metadata(dataset_info_dir)

  # Update fields which are not defined in the code. This means that
  # the code will overwrite fields which are present in
  # dataset_info.json.
  for field_name, field in self.as_proto.DESCRIPTOR.fields_by_name.items():
    field_value = getattr(self._info_proto, field_name)
    field_value_restored = getattr(parsed_proto, field_name)

    try:
      is_defined = self._info_proto.HasField(field_name)
    except ValueError:
      is_defined = bool(field_value)

    try:
      is_defined_in_restored = parsed_proto.HasField(field_name)
    except ValueError:
      is_defined_in_restored = bool(field_value_restored)

    # If the field is defined in code, we ignore the restored value.
    if is_defined:
      if field_value != field_value_restored:
        logging.info(
            "Field info.%s from disk and from code do not match. Keeping "
            "the one from code.", field_name)
      continue
    # If the field is also not defined in the JSON file, we do nothing.
    if not is_defined_in_restored:
      continue
    # Otherwise, we restore the dataset_info.json value.
    if field.type == field.TYPE_MESSAGE:
      field_value.MergeFrom(field_value_restored)
    else:
      setattr(self._info_proto, field_name, field_value_restored)

  if self._builder._version != self.version:  # pylint: disable=protected-access
    raise AssertionError(
        "The constructed DatasetInfo instance and the restored proto version "
        "do not match. Builder version: {}. Proto version: {}".format(
            self._builder._version, self.version))  # pylint: disable=protected-access

  # Mark as fully initialized.
  self._fully_initialized = True
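

# Illustrative helper, not part of the original module: the `HasField`
# fallback used in the restore loop above, extracted for clarity. For proto
# fields without presence tracking (repeated fields, plain proto3 scalars),
# `HasField` raises ValueError, so truthiness of the value is used instead.
def _proto_field_is_defined(proto, field_name: str) -> bool:
  try:
    return proto.HasField(field_name)
  except ValueError:  # Field has no presence semantics (e.g. repeated).
    return bool(getattr(proto, field_name))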
def find_builder_dir(
    name: str,
    *,
    data_dir: str,
) -> Optional[str]:
  """Searches whether the given dataset is present on disk and returns its path.

  Note:

  * If the dataset is present but legacy (no feature config file), `None` is
    returned.
  * If the config isn't specified, the function tries to infer the default
    config name from the original `DatasetBuilder`.

  Args:
    name: Builder name (e.g. `my_ds`, `my_ds/config`, `my_ds:1.2.0`,...)
    data_dir: Path where to search for the dataset (e.g.
      `~/tensorflow_datasets`).

  Returns:
    path: The dataset path found, or None if the dataset isn't found.
  """
  builder_name, builder_kwargs = _dataset_name_and_kwargs_from_name_str(name)
  config_name = builder_kwargs.pop("config", None)
  version_str = builder_kwargs.pop("version", None)
  if builder_kwargs:
    # Datasets with additional kwargs require the original builder code.
    return None

  # Construct the `ds_name/config/` path
  builder_dir = os.path.join(data_dir, builder_name)
  if not config_name:
    # If the BuilderConfig is not specified:
    # * Either the dataset doesn't have a config
    # * Or the default config should be used
    # Currently, in order to infer the default config, we still rely on the
    # code.
    # TODO(tfds): How to avoid code dependency and automatically infer the
    # config existence and name?
    config_name = _get_default_config_name(builder_name)

  # If there is a config (explicitly given or default), append it to the path.
  if config_name:
    builder_dir = os.path.join(builder_dir, config_name)

  # If the version is not given, extract it from the directory.
  if not version_str:
    version_str = _get_last_version(builder_dir)

  if not version_str:  # No version given nor found
    return None

  builder_dir = os.path.join(builder_dir, version_str)

  # Check for builder dir existence
  if not tf.io.gfile.exists(builder_dir):
    return None

  # Backward compatibility: in order to be a valid ReadOnlyBuilder, the folder
  # has to contain the feature configuration.
  if not tf.io.gfile.exists(feature_lib.make_config_path(builder_dir)):
    return None
  return builder_dir
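

# Illustrative usage sketch, not part of the original module: looking up a
# previously generated dataset by name string. The data_dir is hypothetical.
def _demo_find_builder_dir() -> None:
  builder_dir = find_builder_dir(
      "my_ds/my_config:1.2.0", data_dir="/data/tensorflow_datasets")
  if builder_dir is None:
    # Dataset was never generated, is legacy (no feature config file), or
    # the name carried extra builder kwargs.
    print("Dataset not found on disk.")
  else:
    print(f"Found dataset at: {builder_dir}")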
def read_from_directory(self, dataset_info_dir: str) -> None:
  """Update DatasetInfo from the JSON files in `dataset_info_dir`.

  This function updates all the dynamically generated fields (num_examples,
  hash, time of creation,...) of the DatasetInfo.

  This will overwrite all previous metadata.

  Args:
    dataset_info_dir: `str` The directory containing the metadata file. This
      should be the root directory of a specific dataset version.

  Raises:
    FileNotFoundError: If the dataset_info.json can't be found.
  """
  logging.info("Load dataset info from %s", dataset_info_dir)

  json_filename = dataset_info_path(dataset_info_dir)
  if not tf.io.gfile.exists(json_filename):
    raise FileNotFoundError(
        "Tried to load `DatasetInfo` from a directory which does not exist "
        "or does not contain `dataset_info.json`. Please delete the "
        f"directory `{dataset_info_dir}` if you are trying to re-generate "
        "the dataset.")

  # Load the metadata from disk
  parsed_proto = read_from_json(json_filename)

  if str(self.version) != parsed_proto.version:
    raise AssertionError(
        "The constructed DatasetInfo instance and the restored proto version "
        "do not match. Builder version: {}. Proto version: {}".format(
            self.version, parsed_proto.version))

  self._identity = DatasetIdentity.from_proto(
      info_proto=parsed_proto, data_dir=dataset_info_dir)

  # Update splits
  filename_template = naming.ShardedFileTemplate(
      dataset_name=self.name,
      data_dir=self.data_dir,
      filetype_suffix=parsed_proto.file_format or "tfrecord")
  split_dict = splits_lib.SplitDict.from_proto(
      repeated_split_infos=parsed_proto.splits,
      filename_template=filename_template)
  self.set_splits(split_dict)

  # Restore the feature metadata (vocabulary, labels names,...)
  if self.features:
    self.features.load_metadata(dataset_info_dir)
  # For `ReadOnlyBuilder`, reconstruct the features from the config.
  elif tf.io.gfile.exists(feature_lib.make_config_path(dataset_info_dir)):
    self._features = feature_lib.FeatureConnector.from_config(
        dataset_info_dir)
  # Restore the MetadataDict from metadata.json if there is any
  if (self.metadata is not None or
      tf.io.gfile.exists(_metadata_filepath(dataset_info_dir))):
    # If the dataset was loaded from file, self.metadata will be `None`, so
    # we create a MetadataDict first.
    if self.metadata is None:
      self._metadata = MetadataDict()
    self.metadata.load_metadata(dataset_info_dir)

  # Update fields which are not defined in the code. This means that
  # the code will overwrite fields which are present in
  # dataset_info.json.
  for field_name, field in self.as_proto.DESCRIPTOR.fields_by_name.items():
    field_value = getattr(self._info_proto, field_name)
    field_value_restored = getattr(parsed_proto, field_name)

    try:
      is_defined = self._info_proto.HasField(field_name)
    except ValueError:
      is_defined = bool(field_value)

    try:
      is_defined_in_restored = parsed_proto.HasField(field_name)
    except ValueError:
      is_defined_in_restored = bool(field_value_restored)

    # If the field is defined in code, we ignore the restored value.
    if is_defined:
      if field_value != field_value_restored:
        logging.info(
            "Field info.%s from disk and from code do not match. "
            "Keeping the one from code.", field_name)
      continue
    # If the field is also not defined in the JSON file, we do nothing.
    if not is_defined_in_restored:
      continue
    # Otherwise, we restore the dataset_info.json value.
    if field.type == field.TYPE_MESSAGE:
      field_value.MergeFrom(field_value_restored)
    else:
      setattr(self._info_proto, field_name, field_value_restored)

  # Mark as fully initialized.
  self._fully_initialized = True
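

# Illustrative usage sketch, not part of the original module: restoring a
# `DatasetInfo` from a generated dataset directory. The path is hypothetical,
# and `info` is assumed to have been constructed with the same version as the
# one on disk (otherwise AssertionError is raised).
def _demo_read_from_directory(info) -> None:
  info.read_from_directory("/data/tensorflow_datasets/my_ds/my_config/1.2.0")
  # Splits, features and metadata now reflect what is on disk; fields already
  # defined in code keep their in-code values.
  print(info.splits)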
def _contains_dataset(dataset_dir: epath.PathLike) -> bool:
  """Returns whether `dataset_dir` contains a readable feature configuration."""
  try:
    return tf.io.gfile.exists(feature_lib.make_config_path(dataset_dir))
  except (OSError, tf.errors.PermissionDeniedError):
    return False
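

# Illustrative usage sketch, not part of the original module: filtering the
# version subdirectories of a (hypothetical) config directory down to those
# holding a readable feature configuration.
def _demo_contains_dataset() -> None:
  config_dir = "/data/tensorflow_datasets/my_ds/my_config"
  version_dirs = [
      os.path.join(config_dir, v) for v in tf.io.gfile.listdir(config_dir)
  ]
  print([d for d in version_dirs if _contains_dataset(d)])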