Example #1
def test_builder_from_metadata(code_builder: dataset_builder.DatasetBuilder):
  features = features_dict.FeaturesDict({
      'a': tf.float32,
      'b': tf.string,
  })
  info_proto = dataset_info_pb2.DatasetInfo(
      name='abcd',
      description='efgh',
      config_name='en',
      config_description='something',
      version='0.1.0',
      release_notes={'0.1.0': 'release description'},
      citation='some citation',
      features=features.to_proto())
  builder = read_only_builder.builder_from_metadata(
      code_builder.data_dir, info_proto=info_proto)
  assert builder.name == info_proto.name
  assert builder.info.description == info_proto.description
  assert builder.info.citation == info_proto.citation
  assert builder.info.version == info_proto.version
  assert builder.builder_config
  assert builder.builder_config.name == info_proto.config_name
  assert builder.builder_config.version == info_proto.version
  assert builder.builder_config.description == info_proto.config_description
  assert builder.builder_config.release_notes == info_proto.release_notes
  assert str(builder.info.features) == str(features)
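For context, a hedged usage sketch of a builder constructed this way; the data_dir below is hypothetical, and `as_dataset` only works if prepared data actually exists under it:

# Hedged sketch: reuse the info_proto from the test above (hypothetical data_dir).
builder = read_only_builder.builder_from_metadata(
    '/data/abcd/en/0.1.0', info_proto=info_proto)
ds = builder.as_dataset(split='train')  # requires prepared data on disk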
Example #2
def test_dataset_info_from_proto():
    builder = RandomShapedImageGenerator(data_dir=testing.make_tmp_dir())
    train = dataset_info_pb2.SplitInfo(name="train",
                                       num_shards=2,
                                       shard_lengths=[4, 5])
    test = dataset_info_pb2.SplitInfo(name="test",
                                      num_shards=3,
                                      shard_lengths=[1, 2, 3])
    text_feature = feature_pb2.Feature(
        python_class_name="tensorflow_datasets.core.features.text_feature.Text",
        text=feature_pb2.TextFeature())
    proto = dataset_info_pb2.DatasetInfo(
        name="random_shaped_image_generator",
        version=str(builder.version),
        features=feature_pb2.Feature(
            python_class_name=
            "tensorflow_datasets.core.features.features_dict.FeaturesDict",
            features_dict=feature_pb2.FeaturesDict(
                features={"text": text_feature})),
        splits=[train, test])
    result = dataset_info.DatasetInfo.from_proto(builder=builder, proto=proto)
    assert result.splits["test"].shard_lengths == test.shard_lengths
    assert result.splits["train"].shard_lengths == train.shard_lengths
    assert set(result.features.keys()) == {"text"}
    assert result.version == builder.version
Example #3
    def _parse_dataset_info_proto(
            self, config_name: str,
            config: Mapping[str, Any]) -> dataset_info_pb2.DatasetInfo:
        """Parses a DatasetInfo proto from the given Json."""

        splits = []
        for name, details in config['splits'].items():
            splits.append(
                dataset_info_pb2.SplitInfo(
                    name=name,
                    num_shards=1,
                    shard_lengths=[details['num_examples']],
                    num_bytes=details['num_bytes']))

        if isinstance(config['version'], dict):
            version = config['version']['version_str']
        elif isinstance(config['version'], str):
            version = config['version']
        else:
            # Guard against an unbound `version` below.
            raise ValueError(
                f"Unsupported type for config['version']: "
                f"{type(config['version'])}")
        return dataset_info_pb2.DatasetInfo(
            name=config_name,
            module_name=config_name,
            description=config['description'],
            version=version,
            citation=config['citation'],
            redistribution_info=dataset_info_pb2.RedistributionInfo(
                license=config['license']),
            splits=splits,
            features=_get_huggingface_features(config),
        )
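To make the expected input concrete, here is a minimal sketch of a `config` mapping this parser would accept. All values are hypothetical, and the feature-related keys consumed by `_get_huggingface_features` are omitted:

# Hypothetical Hugging Face-style config; feature keys omitted for brevity.
config = {
    'description': 'A toy dataset.',
    'version': {'version_str': '1.0.0'},  # a plain '1.0.0' string also works
    'citation': '@misc{toy2020}',
    'license': 'CC-BY-4.0',
    'splits': {
        'train': {'num_examples': 9, 'num_bytes': 1024},
        'test': {'num_examples': 3, 'num_bytes': 512},
    },
}
info_proto = self._parse_dataset_info_proto('toy', config)  # called on the converter instance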
Example #4
def read_from_json(json_filename):
  """Read JSON-formatted proto into DatasetInfo proto."""
  with tf.io.gfile.GFile(json_filename) as f:
    dataset_info_json_str = f.read()
  # Parse it back into a proto.
  parsed_proto = json_format.Parse(dataset_info_json_str,
                                   dataset_info_pb2.DatasetInfo())
  return parsed_proto
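A minimal usage sketch, assuming a `dataset_info.json` previously written by TFDS exists at the (hypothetical) path below:

# Hedged sketch: path is hypothetical.
info_proto = read_from_json('/data/mnist/3.0.1/dataset_info.json')
print(info_proto.name, info_proto.version)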
Example #5
    def __init__(self,
                 builder,
                 description=None,
                 features=None,
                 supervised_keys=None,
                 urls=None,
                 citation=None,
                 metadata=None,
                 redistribution_info=None):
        """Constructs DatasetInfo.

    Args:
      builder: `DatasetBuilder`, dataset builder for this info.
      description: `str`, description of this dataset.
      features: `tfds.features.FeaturesDict`, Information on the feature dict
        of the `tf.data.Dataset()` object from the `builder.as_dataset()`
        method.
      supervised_keys: `tuple`, Specifies the input feature and the label for
        supervised learning, if applicable for the dataset.
      urls: `list(str)`, optional, the homepage(s) for this dataset.
      citation: `str`, optional, the citation to use for this dataset.
      metadata: `tfds.core.Metadata`, additional object which will be
        stored/restored with the dataset. This allows for storing additional
        information with the dataset.
      redistribution_info: `dict`, optional, information needed for
        redistribution, as specified in `dataset_info_pb2.RedistributionInfo`.
        The content of the `license` subfield will automatically be written to a
        LICENSE file stored with the dataset.
    """
        self._builder = builder

        self._info_proto = dataset_info_pb2.DatasetInfo(
            name=builder.name,
            description=description,
            version=str(builder._version),  # pylint: disable=protected-access
            citation=citation,
            redistribution_info=dataset_info_pb2.RedistributionInfo(
                **redistribution_info) if redistribution_info else None)
        if urls:
            self._info_proto.location.urls[:] = urls

        self._features = features
        self._splits = splits_lib.SplitDict()
        if supervised_keys is not None:
            assert isinstance(supervised_keys, tuple)
            assert len(supervised_keys) == 2
            self._info_proto.supervised_keys.input = supervised_keys[0]
            self._info_proto.supervised_keys.output = supervised_keys[1]

        if metadata and not isinstance(metadata, Metadata):
            raise ValueError(
                "Metadata should be a `tfds.core.Metadata` instance. Received "
                "{}".format(metadata))
        self._metadata = metadata

        # Is this object initialized with both the static and the dynamic data?
        self._fully_initialized = False
Example #6
    def __init__(self,
                 builder,
                 description=None,
                 features=None,
                 supervised_keys=None,
                 splits=None,
                 urls=None,
                 download_checksums=None,
                 size_in_bytes=0,
                 citation=None):
        """Constructs DatasetInfo.

    Args:
      builder: `DatasetBuilder`, dataset builder for this info.
      description: `str`, description of this dataset.
      features: `tfds.features.FeaturesDict`, Information on the feature dict
        of the `tf.data.Dataset()` object from the `builder.as_dataset()`
        method.
      supervised_keys: `tuple`, Specifies the input feature and the label for
        supervised learning, if applicable for the dataset.
      splits: `tfds.core.SplitDict`, the available splits for this dataset.
      urls: `list(str)`, optional, the homepage(s) for this dataset.
      download_checksums: `dict<str url, str sha256>`, URL to sha256 of file.
        If a URL is not listed, its checksum is not checked.
      size_in_bytes: `int`, optional, approximate raw size in bytes of the
        dataset that we will be downloading from the internet.
      citation: `str`, optional, the citation to use for this dataset.
    """
        self._builder = builder

        self._info_proto = dataset_info_pb2.DatasetInfo(
            name=builder.name,
            description=description,
            version=str(builder._version),  # pylint: disable=protected-access
            size_in_bytes=int(size_in_bytes),
            citation=citation)
        if urls:
            self._info_proto.location.urls[:] = urls
        self._info_proto.download_checksums.update(download_checksums or {})

        self._features = features
        self._splits = splits or splits_lib.SplitDict()
        if supervised_keys is not None:
            assert isinstance(supervised_keys, tuple)
            assert len(supervised_keys) == 2
            self._info_proto.supervised_keys.input = supervised_keys[0]
            self._info_proto.supervised_keys.output = supervised_keys[1]

        # Is this object initialized with both the static and the dynamic data?
        self._fully_initialized = False
Example #7
  def __init__(self,
               name=None,
               description=None,
               features=None,
               supervised_keys=None,
               splits=None,
               urls=None,
               size_in_bytes=0,
               citation=None):
    """Constructor of the DatasetInfo.

    Args:
      name: (`str`) Name of the dataset, usually set to builder.name.
      description: `str`, description of this dataset.
      features: (`tfds.features.FeaturesDict`) Information on the feature dict
        of the `tf.data.Dataset()` object from the `builder.as_dataset()`
        method.
      supervised_keys: (`tuple`) Specifies the input feature and the label for
        supervised learning, if applicable for the dataset.
      splits: `SplitDict`, the available Splits for this dataset.
      urls: `list(str)`, optional, the homepage(s) for this dataset.
      size_in_bytes: `integer`, optional, approximate raw size in bytes of the
        dataset that we will be downloading from the internet.
      citation: `str`, optional, the citation to use for this dataset.
    """
    self._info_proto = dataset_info_pb2.DatasetInfo(
        name=name,
        description=description,
        size_in_bytes=int(size_in_bytes),
        citation=citation)
    if urls:
      self._info_proto.location.urls[:] = urls

    self._features = features
    self._splits = splits or splits_lib.SplitDict()
    if supervised_keys is not None:
      assert isinstance(supervised_keys, tuple)
      assert len(supervised_keys) == 2
      self._info_proto.supervised_keys.input = supervised_keys[0]
      self._info_proto.supervised_keys.output = supervised_keys[1]

    # Is this object initialized with both the static and the dynamic data?
    self._fully_initialized = False
Example #8
    def __init__(self,
                 builder,
                 description=None,
                 features=None,
                 supervised_keys=None,
                 urls=None,
                 citation=None):
        """Constructs DatasetInfo.

    Args:
      builder: `DatasetBuilder`, dataset builder for this info.
      description: `str`, description of this dataset.
      features: `tfds.features.FeaturesDict`, Information on the feature dict
        of the `tf.data.Dataset()` object from the `builder.as_dataset()`
        method.
      supervised_keys: `tuple`, Specifies the input feature and the label for
        supervised learning, if applicable for the dataset.
      urls: `list(str)`, optional, the homepage(s) for this dataset.
      citation: `str`, optional, the citation to use for this dataset.
    """
        self._builder = builder

        self._info_proto = dataset_info_pb2.DatasetInfo(
            name=builder.name,
            description=description,
            version=str(builder._version),  # pylint: disable=protected-access
            citation=citation)
        if urls:
            self._info_proto.location.urls[:] = urls

        self._features = features
        self._splits = splits_lib.SplitDict()
        if supervised_keys is not None:
            assert isinstance(supervised_keys, tuple)
            assert len(supervised_keys) == 2
            self._info_proto.supervised_keys.input = supervised_keys[0]
            self._info_proto.supervised_keys.output = supervised_keys[1]

        # Is this object initialized with both the static and the dynamic data?
        self._fully_initialized = False
Example #9
  def read_from_directory(self, dataset_info_dir):
    """Update the DatasetInfo properties from the metadata file.

    This function updates all the dynamically generated fields (num_samples,
    hash, time of creation, ...) of the DatasetInfo by reading the metadata
    file in the dataset directory and exposing its contents.
    It is called after the data has been generated by
    `.download_and_prepare()`, and when loading data that already exists.

    This will overwrite all previous metadata.

    Args:
      dataset_info_dir: `str` The directory containing the metadata file. This
        should be the root directory of a specific dataset version.
    """
    if not dataset_info_dir:
      raise ValueError(
          "Calling read_from_directory with undefined dataset_info_dir.")

    json_filename = self._dataset_info_filename(dataset_info_dir)

    # Load the metadata from disk
    if not tf.gfile.Exists(json_filename):
      return

    with tf.gfile.Open(json_filename, "r") as f:
      dataset_info_json_str = f.read()

    # Parse it back into a proto.
    self._info_proto = json_format.Parse(dataset_info_json_str,
                                         dataset_info_pb2.DatasetInfo())

    # Restore the Splits
    self._splits = splits_lib.SplitDict.from_proto(self._info_proto.splits)

    # Mark as fully initialized.
    self._fully_initialized = True
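A hedged usage sketch, assuming a `builder` whose `info` is this DatasetInfo; the directory is hypothetical and must contain a previously written metadata file:

# Hedged sketch: refresh a DatasetInfo from an existing dataset directory.
builder.info.read_from_directory('/data/mnist/3.0.1')
print(builder.info.splits)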
Example #10
  def __init__(self,
               builder,
               description=None,
               features=None,
               supervised_keys=None,
               homepage=None,
               citation=None,
               metadata=None,
               redistribution_info=None):
    """Constructs DatasetInfo.

    Args:
      builder: `DatasetBuilder`, dataset builder for this info.
      description: `str`, description of this dataset.
      features: `tfds.features.FeaturesDict`, Information on the feature dict
        of the `tf.data.Dataset()` object from the `builder.as_dataset()`
        method.
      supervised_keys: `tuple` of `(input_key, target_key)`, Specifies the
        input feature and the label for supervised learning, if applicable for
        the dataset. The keys correspond to the feature names to select in
        `info.features`. When calling `tfds.core.DatasetBuilder.as_dataset()`
        with `as_supervised=True`, the `tf.data.Dataset` object will yield
        the (input, target) defined here.
      homepage: `str`, optional, the homepage for this dataset.
      citation: `str`, optional, the citation to use for this dataset.
      metadata: `tfds.core.Metadata`, additional object which will be
        stored/restored with the dataset. This allows for storing additional
        information with the dataset.
      redistribution_info: `dict`, optional, information needed for
        redistribution, as specified in `dataset_info_pb2.RedistributionInfo`.
        The content of the `license` subfield will automatically be written to a
        LICENSE file stored with the dataset.
    """
    self._builder = builder

    self._info_proto = dataset_info_pb2.DatasetInfo(
        name=builder.name,
        description=utils.dedent(description),
        version=str(builder._version),  # pylint: disable=protected-access
        citation=utils.dedent(citation),
        redistribution_info=dataset_info_pb2.RedistributionInfo(
            license=utils.dedent(redistribution_info.pop("license")),
            **redistribution_info) if redistribution_info else None)

    if homepage:
      self._info_proto.location.urls[:] = [homepage]

    if features:
      if not isinstance(features, top_level_feature.TopLevelFeature):
        raise ValueError(
            "DatasetInfo.features only supports FeaturesDict or Sequence at "
            "the top-level. Got {}".format(features))
      features._set_top_level()  # pylint: disable=protected-access
    self._features = features
    self._splits = splits_lib.SplitDict(self._builder.name)
    if supervised_keys is not None:
      assert isinstance(supervised_keys, tuple)
      assert len(supervised_keys) == 2
      self._info_proto.supervised_keys.input = supervised_keys[0]
      self._info_proto.supervised_keys.output = supervised_keys[1]

    if metadata and not isinstance(metadata, Metadata):
      raise ValueError(
          "Metadata should be a `tfds.core.Metadata` instance. Received "
          "{}".format(metadata))
    self._metadata = metadata

    # Is this object initialized with both the static and the dynamic data?
    self._fully_initialized = False
Example #11
def read_from_json(path: type_utils.PathLike) -> dataset_info_pb2.DatasetInfo:
    """Read JSON-formatted proto into DatasetInfo proto."""
    json_str = utils.as_path(path).read_text()
    # Parse it back into a proto.
    parsed_proto = json_format.Parse(json_str, dataset_info_pb2.DatasetInfo())
    return parsed_proto
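A hedged round-trip sketch: `json_format.MessageToJson` (the standard protobuf helper) serves as the inverse of `read_from_json`. The path is hypothetical and its parent directory is assumed to exist:

from google.protobuf import json_format

proto = dataset_info_pb2.DatasetInfo(name="demo", version="1.0.0")
path = utils.as_path("/tmp/demo/dataset_info.json")
path.write_text(json_format.MessageToJson(proto))  # parent dir assumed to exist
assert read_from_json(path).name == "demo"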
Example #12
    def __init__(
            self,
            *,
            builder: Union[DatasetIdentity, Any],
            description: Optional[str] = None,
            features: Optional[feature_lib.FeatureConnector] = None,
            supervised_keys: Optional[SupervisedKeysType] = None,
            disable_shuffling: bool = False,
            homepage: Optional[str] = None,
            citation: Optional[str] = None,
            metadata: Optional[Metadata] = None,
            license: Optional[str] = None,  # pylint: disable=redefined-builtin
            redistribution_info: Optional[Dict[str, str]] = None,
            split_dict: Optional[splits_lib.SplitDict] = None):
        # pyformat: disable
        """Constructs DatasetInfo.

    Args:
      builder: `DatasetBuilder` or `DatasetIdentity`. The dataset builder or
        identity will be used to populate this info.
      description: `str`, description of this dataset.
      features: `tfds.features.FeaturesDict`, Information on the feature dict of
        the `tf.data.Dataset()` object from the `builder.as_dataset()` method.
      supervised_keys: Specifies the input structure for supervised learning, if
        applicable for the dataset, used with "as_supervised". The keys
        correspond to the feature names to select in `info.features`. When
        calling `tfds.core.DatasetBuilder.as_dataset()` with
        `as_supervised=True`, the `tf.data.Dataset` object will yield the
        structure defined by the keys passed here, instead of that defined by
        the `features` argument. Typically this is an `(input_key, target_key)`
        tuple, and the dataset yields a tuple of `(input, target)` tensors.

        To yield a more complex structure, pass a tuple of `tf.nest` compatible
        structures of feature keys. The resulting `Dataset` will yield
        structures with each key replaced by the corresponding tensor. For
        example, passing a triple of keys would return a dataset
        that yields `(feature, target, sample_weights)` triples for keras.
        Using `supervised_keys=({'a':'a','b':'b'}, 'c')` would create a dataset
        yielding a tuple with a dictionary of features in the `features`
        position.

        Note that selecting features in nested `tfds.features.FeaturesDict`
        objects is not supported.
      disable_shuffling: `bool`, specify whether to shuffle the examples.
      homepage: `str`, optional, the homepage for this dataset.
      citation: `str`, optional, the citation to use for this dataset.
      metadata: `tfds.core.Metadata`, additional object which will be
        stored/restored with the dataset. This allows for storing additional
        information with the dataset.
      license: license of the dataset.
      redistribution_info: information needed for redistribution, as specified
        in `dataset_info_pb2.RedistributionInfo`. The content of the `license`
        subfield will automatically be written to a LICENSE file stored with the
        dataset.
      split_dict: information about the splits in this dataset.
    """
        # pyformat: enable
        self._builder_or_identity = builder
        if isinstance(builder, DatasetIdentity):
            self._identity = builder
        else:
            self._identity = DatasetIdentity.from_builder(builder)

        self._info_proto = dataset_info_pb2.DatasetInfo(
            name=self._identity.name,
            description=utils.dedent(description),
            version=str(self._identity.version),
            release_notes=self._identity.release_notes,
            disable_shuffling=disable_shuffling,
            config_name=self._identity.config_name,
            config_description=self._identity.config_description,
            citation=utils.dedent(citation),
            module_name=self._identity.module_name,
            redistribution_info=dataset_info_pb2.RedistributionInfo(
                license=utils.dedent(license
                                     or redistribution_info.pop("license")),
                **redistribution_info) if redistribution_info else None)

        if homepage:
            self._info_proto.location.urls[:] = [homepage]

        if features:
            if not isinstance(features, top_level_feature.TopLevelFeature):
                raise ValueError(
                    "DatasetInfo.features only supports FeaturesDict or Sequence at "
                    "the top-level. Got {}".format(features))
        self._features = features
        self._splits = splits_lib.SplitDict([])
        if split_dict:
            self.set_splits(split_dict)
        if supervised_keys is not None:
            self._info_proto.supervised_keys.CopyFrom(
                _supervised_keys_to_proto(supervised_keys))

        if metadata and not isinstance(metadata, Metadata):
            raise ValueError(
                "Metadata should be a `tfds.core.Metadata` instance. Received "
                "{}".format(metadata))
        self._metadata = metadata

        # Is this object initialized with both the static and the dynamic data?
        self._fully_initialized = False
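A hedged sketch of the nested `supervised_keys` form described in the docstring above; the builder and feature names are hypothetical, and module aliases follow Example #1:

# Hypothetical builder and features; illustrates the dict-form supervised_keys.
info = DatasetInfo(
    builder=my_builder,  # a DatasetBuilder or DatasetIdentity
    features=features_dict.FeaturesDict({
        "a": tf.float32, "b": tf.float32, "c": tf.int64}),
    supervised_keys=({"a": "a", "b": "b"}, "c"),
)
# With as_dataset(as_supervised=True), examples would then arrive as
# ({"a": ..., "b": ...}, c) tuples.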
Example #13
    def read_from_directory(self, dataset_info_dir, from_packaged_data=False):
        """Update DatasetInfo from the JSON file in `dataset_info_dir`.

    This function updates all the dynamically generated fields (num_examples,
    hash, time of creation,...) of the DatasetInfo.

    This will overwrite all previous metadata.

    Args:
      dataset_info_dir: `str` The directory containing the metadata file. This
        should be the root directory of a specific dataset version.
      from_packaged_data: `bool`, if data is restored from packaged data,
        then only the information not defined in the code is updated.

    Returns:
      True if we were able to initialize using `dataset_info_dir`, else False.
    """
        if not dataset_info_dir:
            raise ValueError(
                "Calling read_from_directory with undefined dataset_info_dir.")

        json_filename = self._dataset_info_filename(dataset_info_dir)

        # Load the metadata from disk
        if not tf.gfile.Exists(json_filename):
            return False

        with tf.gfile.Open(json_filename, "r") as f:
            dataset_info_json_str = f.read()

        # Parse it back into a proto.
        parsed_proto = json_format.Parse(dataset_info_json_str,
                                         dataset_info_pb2.DatasetInfo())

        # Update splits
        self.splits = splits_lib.SplitDict.from_proto(parsed_proto.splits)
        # Update schema
        self.as_proto.schema.CopyFrom(parsed_proto.schema)
        # Restore the feature metadata (vocabulary, labels names,...)
        if self.features:
            self.features.load_metadata(dataset_info_dir)
        # Restore download info
        self.download_checksums = parsed_proto.download_checksums
        self.size_in_bytes = parsed_proto.size_in_bytes

        # If we are restoring on-disk data, then we also restore all dataset
        # info from the previously saved proto.
        # If we are loading from packaged data (only possible when we do not
        # restore previous data), then do not restore the info that is already
        # defined in the code. Otherwise, we would overwrite code info.
        if not from_packaged_data:
            # Update the full proto
            self._info_proto = parsed_proto

        if self._builder._version != self.version:  # pylint: disable=protected-access
            raise AssertionError(
                "The constructed DatasetInfo instance and the restored proto version "
                "do not match. Builder version: {}. Proto version: {}".format(
                    self._builder._version, self.version))  # pylint: disable=protected-access

        # Mark as fully initialized.
        self._fully_initialized = True

        return True