Example #1
def _find_builder_dir_single_dir(
    builder_name: str,
    *,
    data_dir: str,
    config_name: Optional[str] = None,
    version_str: Optional[str] = None,
) -> Optional[str]:
    """Same as `find_builder_dir` but requires explicit dir."""
    # Construct the `ds_name/config/` path
    builder_dir = os.path.join(data_dir, builder_name)
    if not config_name:
        # If the BuilderConfig is not specified:
        # * Either the dataset doesn't have a config
        # * Or the default config should be used
        # Currently, in order to infer the default config, we are still relying on
        # the code.
        # TODO(tfds): How to avoid code dependency and automatically infer the
        # config existence and name?
        config_name = _get_default_config_name(builder_dir, builder_name)

    # If there is a config (explicitly given or default), append it to the path
    if config_name:
        builder_dir = os.path.join(builder_dir, config_name)

    # Extract the version
    version_str = _get_version_str(builder_dir, requested_version=version_str)

    if not version_str:  # Version not given or found
        return None

    builder_dir = os.path.join(builder_dir, version_str)

    # Check for builder dir existence
    try:
        if not tf.io.gfile.exists(builder_dir):
            return None
    except tf.errors.PermissionDeniedError:
        return None

    # Backward compatibility: in order to be a valid ReadOnlyBuilder, the folder
    # has to contain the feature configuration.
    if not tf.io.gfile.exists(feature_lib.make_config_path(builder_dir)):
        return None

    return builder_dir
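
For reference, the path this helper resolves follows the on-disk layout `data_dir/<builder_name>/<config_name>/<version>/`, with a feature configuration file marking a usable dataset directory. Below is a minimal, self-contained sketch of that resolution using only the standard library; the latest-version pick and the `features.json` marker name are assumptions for illustration, not the actual `_get_version_str` / `feature_lib.make_config_path` behavior.

import os
import re
from typing import Optional

_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+$")


def _latest_version(builder_dir: str) -> Optional[str]:
    # Pick the highest `x.y.z` sub-directory (assumption: semantic-version
    # directory names, as used by TFDS dataset folders).
    if not os.path.isdir(builder_dir):
        return None
    versions = [d for d in os.listdir(builder_dir) if _VERSION_RE.match(d)]
    if not versions:
        return None
    return max(versions, key=lambda v: tuple(int(x) for x in v.split(".")))


def resolve_dataset_dir(data_dir, name, config=None, version=None):
    builder_dir = os.path.join(data_dir, name)
    if config:
        builder_dir = os.path.join(builder_dir, config)
    version = version or _latest_version(builder_dir)
    if not version:
        return None
    dataset_dir = os.path.join(builder_dir, version)
    # A version directory only qualifies if the feature configuration file is
    # present (hypothetical `features.json` name used here for illustration).
    if not os.path.exists(os.path.join(dataset_dir, "features.json")):
        return None
    return dataset_dir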
Example #2
def _find_builder_dir_single_dir(
    name: str,
    *,
    data_dir: str,
) -> Optional[str]:
  """Same as `find_builder_dir` but require explicit dir."""
  builder_name, builder_kwargs = _dataset_name_and_kwargs_from_name_str(name)
  config_name = builder_kwargs.pop("config", None)
  version_str = builder_kwargs.pop("version", None)
  if builder_kwargs:
    # Datasets with additional kwargs require the original builder code.
    return None

  # Construct the `ds_name/config/` path
  builder_dir = os.path.join(data_dir, builder_name)
  if not config_name:
    # If the BuilderConfig is not specified:
    # * Either the dataset doesn't have a config
    # * Or the default config should be used
    # Currently, in order to infer the default config, we are still relying on
    # the code.
    # TODO(tfds): How to avoid code dependency and automatically infer the
    # config existence and name?
    config_name = _get_default_config_name(builder_name)

  # If there is a config (explicitly given or default), append it to the path
  if config_name:
    builder_dir = os.path.join(builder_dir, config_name)

  # Extract the version
  version_str = _get_version_str(builder_dir, requested_version=version_str)

  if not version_str:  # Version not given or found
    return None

  builder_dir = os.path.join(builder_dir, version_str)

  # Check for builder dir existence
  if not tf.io.gfile.exists(builder_dir):
    return None
  # Backward compatibility: in order to be a valid ReadOnlyBuilder, the folder
  # has to contain the feature configuration.
  if not tf.io.gfile.exists(feature_lib.make_config_path(builder_dir)):
    return None
  return builder_dir
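
This variant additionally parses the name string into its builder, config, and version parts. A simplified stand-in for `_dataset_name_and_kwargs_from_name_str` (the real parser also handles extra builder kwargs, which are not reproduced here) might look like:

import re
from typing import Dict, Tuple

_NAME_RE = re.compile(
    r"^(?P<name>\w+)"
    r"(/(?P<config>[\w\-\.]+))?"
    r"(:(?P<version>[\d\.]+))?$")


def parse_name(name: str) -> Tuple[str, Dict[str, str]]:
  """Splits e.g. `my_ds/my_config:1.2.0` into its name, config and version."""
  match = _NAME_RE.match(name)
  if not match:
    raise ValueError(f"Unrecognized dataset name: {name!r}")
  kwargs = {k: v for k, v in match.groupdict().items()
            if k != "name" and v is not None}
  return match.group("name"), kwargs


# parse_name("my_ds/my_config:1.2.0")
# -> ("my_ds", {"config": "my_config", "version": "1.2.0"})
# parse_name("my_ds")
# -> ("my_ds", {})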
Example #3
    def read_from_directory(self, dataset_info_dir):
        """Update DatasetInfo from the JSON file in `dataset_info_dir`.

        This function updates all the dynamically generated fields
        (num_examples, hash, time of creation,...) of the DatasetInfo.

        This will overwrite all previous metadata.

        Args:
          dataset_info_dir: `str` The directory containing the metadata file.
            This should be the root directory of a specific dataset version.

        Raises:
          FileNotFoundError: If the file can't be found.
        """
        logging.info("Load dataset info from %s", dataset_info_dir)

        json_filename = self._dataset_info_path(dataset_info_dir)
        if not tf.io.gfile.exists(json_filename):
            raise FileNotFoundError(
                "Tried to load `DatasetInfo` from a directory which does not exist or "
                "does not contain `dataset_info.json`. Please delete the directory "
                f"`{dataset_info_dir}` if you are trying to re-generate the "
                "dataset.")

        # Load the metadata from disk
        parsed_proto = read_from_json(json_filename)

        # Update splits
        split_dict = splits_lib.SplitDict.from_proto(self.name,
                                                     parsed_proto.splits)
        self.set_splits(split_dict)

        # Restore the feature metadata (vocabulary, labels names,...)
        if self.features:
            self.features.load_metadata(dataset_info_dir)
        # For `ReadOnlyBuilder`, reconstruct the features from the config.
        elif tf.io.gfile.exists(
                feature_lib.make_config_path(dataset_info_dir)):
            self._features = feature_lib.FeatureConnector.from_config(
                dataset_info_dir)
        if self.metadata is not None:
            self.metadata.load_metadata(dataset_info_dir)

        # Update fields which are not defined in the code. This means that
        # the code will overwrite fields which are present in
        # dataset_info.json.
        for field_name, field in self.as_proto.DESCRIPTOR.fields_by_name.items():
            field_value = getattr(self._info_proto, field_name)
            field_value_restored = getattr(parsed_proto, field_name)

            try:
                is_defined = self._info_proto.HasField(field_name)
            except ValueError:
                is_defined = bool(field_value)

            try:
                is_defined_in_restored = parsed_proto.HasField(field_name)
            except ValueError:
                is_defined_in_restored = bool(field_value_restored)

            # If field is defined in code, we ignore the value
            if is_defined:
                if field_value != field_value_restored:
                    logging.info(
                        "Field info.%s from disk and from code do not match. Keeping "
                        "the one from code.", field_name)
                continue
            # If the field is also not defined in JSON file, we do nothing
            if not is_defined_in_restored:
                continue
            # Otherwise, we restore the dataset_info.json value
            if field.type == field.TYPE_MESSAGE:
                field_value.MergeFrom(field_value_restored)
            else:
                setattr(self._info_proto, field_name, field_value_restored)

        if self._builder._version != self.version:  # pylint: disable=protected-access
            raise AssertionError(
                "The constructed DatasetInfo instance and the restored proto version "
                "do not match. Builder version: {}. Proto version: {}".format(
                    self._builder._version, self.version))  # pylint: disable=protected-access

        # Mark as fully initialized.
        self._fully_initialized = True
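
The loop above implements a simple merge policy: values defined in code win over values restored from `dataset_info.json`, and disk values only fill fields the code leaves empty. A toy illustration of that policy, using plain dicts instead of protos:

from typing import Any, Dict


def merge_info(from_code: Dict[str, Any], from_disk: Dict[str, Any]) -> Dict[str, Any]:
    """Code-defined fields win; disk values only fill in the gaps."""
    merged = dict(from_code)
    for field, disk_value in from_disk.items():
        code_value = from_code.get(field)
        if code_value:
            # Defined in code: keep it (the real loop only logs a mismatch).
            continue
        if not disk_value:
            # Not defined on disk either: nothing to restore.
            continue
        merged[field] = disk_value
    return merged


# merge_info({"description": "from code", "citation": ""},
#            {"description": "stale", "citation": "@article{...}"})
# -> {"description": "from code", "citation": "@article{...}"}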
Example #4
def find_builder_dir(
    name: str,
    *,
    data_dir: str,
) -> Optional[str]:
    """Search whether the given dataset is present on disk and return its path.

    Note:

     * If the dataset is present, but is legacy (no feature config file), None
       is returned.
     * If the config isn't specified, the function tries to infer the default
       config name from the original `DatasetBuilder`.

    Args:
      name: Builder name (e.g. `my_ds`, `my_ds/config`, `my_ds:1.2.0`,...)
      data_dir: Path where to search for the dataset
        (e.g. `~/tensorflow_datasets`).

    Returns:
      path: The dataset path found, or None if the dataset isn't found.
    """
    builder_name, builder_kwargs = _dataset_name_and_kwargs_from_name_str(name)
    config_name = builder_kwargs.pop("config", None)
    version_str = builder_kwargs.pop("version", None)
    if builder_kwargs:
        # Datasets with additional kwargs require the original builder code.
        return None

    # Construct the `ds_name/config/` path
    builder_dir = os.path.join(data_dir, builder_name)
    if not config_name:
        # If the BuilderConfig is not specified:
        # * Either the dataset doesn't have a config
        # * Or the default config should be used
        # Currently, in order to infer the default config, we are still relying on
        # the code.
        # TODO(tfds): How to avoid code dependency and automatically infer the
        # config existence and name?
        config_name = _get_default_config_name(builder_name)

    # If there is a config (explicitly given or default), append it to the path
    if config_name:
        builder_dir = os.path.join(builder_dir, config_name)

    # If version not given, extract the version
    if not version_str:
        version_str = _get_last_version(builder_dir)

    if not version_str:  # No version given or found
        return None

    builder_dir = os.path.join(builder_dir, version_str)

    # Check for builder dir existence
    if not tf.io.gfile.exists(builder_dir):
        return None
    # Backward compatibility: in order to be a valid ReadOnlyBuilder, the folder
    # has to contain the feature configuration.
    if not tf.io.gfile.exists(feature_lib.make_config_path(builder_dir)):
        return None
    return builder_dir
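
A hedged usage sketch follows. The import path is an assumption (the internal module that defines `find_builder_dir` has moved between TFDS versions), and `tfds.builder_from_directory` is used to reconstruct a read-only builder from the returned path:

import os

import tensorflow_datasets as tfds
# Assumption: the module path below may differ between TFDS versions.
from tensorflow_datasets.core import read_only_builder

data_dir = os.path.expanduser("~/tensorflow_datasets")
builder_dir = read_only_builder.find_builder_dir("mnist", data_dir=data_dir)
if builder_dir:
    # Rebuild a read-only builder directly from the files on disk, without
    # needing the original dataset generation code.
    builder = tfds.builder_from_directory(builder_dir)
    print(builder.info.splits)
else:
    print(f"mnist (with its feature config) was not found under {data_dir}")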
Example #5
    def read_from_directory(self, dataset_info_dir: str) -> None:
        """Update DatasetInfo from the JSON files in `dataset_info_dir`.

        This function updates all the dynamically generated fields
        (num_examples, hash, time of creation,...) of the DatasetInfo.

        This will overwrite all previous metadata.

        Args:
          dataset_info_dir: `str` The directory containing the metadata file.
            This should be the root directory of a specific dataset version.

        Raises:
          FileNotFoundError: If the dataset_info.json can't be found.
        """
        logging.info("Load dataset info from %s", dataset_info_dir)

        json_filename = dataset_info_path(dataset_info_dir)
        if not tf.io.gfile.exists(json_filename):
            raise FileNotFoundError(
                "Tried to load `DatasetInfo` from a directory which does not exist or"
                " does not contain `dataset_info.json`. Please delete the directory "
                f"`{dataset_info_dir}`  if you are trying to re-generate the "
                "dataset.")

        # Load the metadata from disk
        parsed_proto = read_from_json(json_filename)

        if str(self.version) != parsed_proto.version:
            raise AssertionError(
                "The constructed DatasetInfo instance and the restored proto version "
                "do not match. Builder version: {}. Proto version: {}".format(
                    self.version, parsed_proto.version))

        self._identity = DatasetIdentity.from_proto(info_proto=parsed_proto,
                                                    data_dir=dataset_info_dir)

        # Update splits
        filename_template = naming.ShardedFileTemplate(
            dataset_name=self.name,
            data_dir=self.data_dir,
            filetype_suffix=parsed_proto.file_format or "tfrecord")
        split_dict = splits_lib.SplitDict.from_proto(
            repeated_split_infos=parsed_proto.splits,
            filename_template=filename_template)
        self.set_splits(split_dict)

        # Restore the feature metadata (vocabulary, labels names,...)
        if self.features:
            self.features.load_metadata(dataset_info_dir)
        # For `ReadOnlyBuilder`, reconstruct the features from the config.
        elif tf.io.gfile.exists(
                feature_lib.make_config_path(dataset_info_dir)):
            self._features = feature_lib.FeatureConnector.from_config(
                dataset_info_dir)
        # Restore the MetaDataDict from metadata.json if there is any
        if (self.metadata is not None
                or tf.io.gfile.exists(_metadata_filepath(dataset_info_dir))):
            # If the dataset was loaded from file, self.metadata will be `None`, so
            # we create a MetadataDict first.
            if self.metadata is None:
                self._metadata = MetadataDict()
            self.metadata.load_metadata(dataset_info_dir)

        # Update fields which are not defined in the code. This means that
        # the code will overwrite fields which are present in
        # dataset_info.json.
        for field_name, field in self.as_proto.DESCRIPTOR.fields_by_name.items():
            field_value = getattr(self._info_proto, field_name)
            field_value_restored = getattr(parsed_proto, field_name)

            try:
                is_defined = self._info_proto.HasField(field_name)
            except ValueError:
                is_defined = bool(field_value)

            try:
                is_defined_in_restored = parsed_proto.HasField(field_name)
            except ValueError:
                is_defined_in_restored = bool(field_value_restored)

            # If field is defined in code, we ignore the value.
            if is_defined:
                if field_value != field_value_restored:
                    logging.info(
                        "Field info.%s from disk and from code do not match. "
                        "Keeping the one from code.", field_name)
                continue
            # If the field is also not defined in JSON file, we do nothing
            if not is_defined_in_restored:
                continue
            # Otherwise, we restore the dataset_info.json value
            if field.type == field.TYPE_MESSAGE:
                field_value.MergeFrom(field_value_restored)
            else:
                setattr(self._info_proto, field_name, field_value_restored)

        # Mark as fully initialized.
        self._fully_initialized = True
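
The presence probe in the restore loop relies on `HasField`, which raises `ValueError` for fields that do not track presence (repeated fields and proto3 scalars), in which case the code falls back to truthiness. A self-contained illustration of that pattern, using a message that ships with `protobuf` (chosen for availability, not because TFDS uses it):

from google.protobuf import descriptor_pb2


def is_field_defined(msg, field_name: str) -> bool:
    """True if the field is set; falls back to truthiness when `HasField`
    is unsupported (repeated fields, proto3 scalars)."""
    try:
        return msg.HasField(field_name)
    except ValueError:
        return bool(getattr(msg, field_name))


proto = descriptor_pb2.FileDescriptorProto(name="a.proto")
print(is_field_defined(proto, "name"))        # True: singular field, set
print(is_field_defined(proto, "package"))     # False: singular field, unset
print(is_field_defined(proto, "dependency"))  # False: repeated, HasField raises
proto.dependency.append("b.proto")
print(is_field_defined(proto, "dependency"))  # True: repeated, non-empty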
Example #6
def _contains_dataset(dataset_dir: epath.PathLike) -> bool:
  try:
    return tf.io.gfile.exists(feature_lib.make_config_path(dataset_dir))
  except (OSError, tf.errors.PermissionDeniedError):
    return False
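
A hedged sketch of how such a predicate can be used to list the prepared versions of a dataset under a data dir, written against pathlib and a hypothetical `features.json` marker rather than the TFDS internals:

import pathlib
from typing import List


def contains_dataset(dataset_dir: pathlib.Path) -> bool:
  # Stand-in for `_contains_dataset`: a version directory is usable read-only
  # only if the feature configuration file is present (file name assumed).
  try:
    return (dataset_dir / "features.json").exists()
  except OSError:
    return False


def list_prepared_versions(data_dir: str, name: str, config: str = "") -> List[str]:
  root = pathlib.Path(data_dir, name, config) if config else pathlib.Path(data_dir, name)
  if not root.is_dir():
    return []
  return sorted(d.name for d in root.iterdir()
                if d.is_dir() and contains_dataset(d))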