示例#1
0
    def _parse_dataset_info_proto(
            self, config_name: str,
            config: Mapping[str, Any]) -> dataset_info_pb2.DatasetInfo:
        """Builds a `DatasetInfo` proto from one parsed JSON config entry.

        Args:
          config_name: Used as both the `name` and the `module_name` of the
            resulting proto.
          config: Parsed JSON mapping. This method reads the keys `splits`,
            `version`, `description`, `citation` and `license`. Every value
            of `splits` must provide `num_examples` and `num_bytes`.
            `version` is either a plain string or a dict holding a
            `version_str` entry. The whole mapping is also handed to
            `_get_huggingface_features` to build the features.

        Returns:
          The populated `dataset_info_pb2.DatasetInfo`.

        Raises:
          ValueError: If `config['version']` is neither a `str` nor a `dict`.
        """
        # Every split is recorded as one shard that holds all of its examples.
        splits = [
            dataset_info_pb2.SplitInfo(
                name=name,
                num_shards=1,
                shard_lengths=[details['num_examples']],
                num_bytes=details['num_bytes'])
            for name, details in config['splits'].items()
        ]

        raw_version = config['version']
        if isinstance(raw_version, dict):
            version = raw_version['version_str']
        elif isinstance(raw_version, str):
            version = raw_version
        else:
            # Without this branch `version` would stay unbound and the return
            # statement below would fail with an opaque UnboundLocalError.
            raise ValueError(
                "Unsupported 'version' entry for config {!r}: expected a str "
                "or a dict with a 'version_str' key, got {}.".format(
                    config_name, type(raw_version).__name__))

        return dataset_info_pb2.DatasetInfo(
            name=config_name,
            module_name=config_name,
            description=config['description'],
            version=version,
            citation=config['citation'],
            redistribution_info=dataset_info_pb2.RedistributionInfo(
                license=config['license']),
            splits=splits,
            features=_get_huggingface_features(config),
        )
示例#2
0
    def __init__(self,
                 builder,
                 description=None,
                 features=None,
                 supervised_keys=None,
                 urls=None,
                 citation=None,
                 metadata=None,
                 redistribution_info=None):
        """Creates the info object attached to a dataset builder.

        Args:
          builder: `DatasetBuilder` this info belongs to. Its `name` and
            `_version` are copied into the underlying proto.
          description: `str`, description of the dataset.
          features: `tfds.features.FeaturesDict` describing the feature dict
            of the `tf.data.Dataset()` returned by `builder.as_dataset()`.
          supervised_keys: `tuple` of two entries, the input feature and the
            label used for supervised learning, if applicable.
          urls: `list(str)`, optional, homepage(s) of the dataset.
          citation: `str`, optional, citation to use for the dataset.
          metadata: `tfds.core.Metadata`, additional object stored/restored
            together with the dataset.
          redistribution_info: `dict`, optional, keyword arguments for
            `dataset_info_pb2.RedistributionInfo`. The content of the
            `license` subfield will automatically be written to a LICENSE
            file stored with the dataset.

        Raises:
          ValueError: If `metadata` is truthy but not a `Metadata` instance.
        """
        self._builder = builder

        # Resolve the individual proto fields first, in the same order the
        # proto constructor would evaluate them.
        dataset_name = builder.name
        version_str = str(builder._version)  # pylint: disable=protected-access
        if redistribution_info:
            redistribution_proto = dataset_info_pb2.RedistributionInfo(
                **redistribution_info)
        else:
            redistribution_proto = None

        self._info_proto = dataset_info_pb2.DatasetInfo(
            name=dataset_name,
            description=description,
            version=version_str,
            citation=citation,
            redistribution_info=redistribution_proto)
        if urls:
            self._info_proto.location.urls[:] = urls

        self._features = features
        self._splits = splits_lib.SplitDict()

        if supervised_keys is not None:
            assert isinstance(supervised_keys, tuple)
            assert len(supervised_keys) == 2
            keys_proto = self._info_proto.supervised_keys
            keys_proto.input = supervised_keys[0]
            keys_proto.output = supervised_keys[1]

        # An absent (falsy) metadata is accepted as-is; anything else must be
        # a `Metadata` instance.
        metadata_is_acceptable = not metadata or isinstance(metadata, Metadata)
        if not metadata_is_acceptable:
            message = (
                "Metadata should be a `tfds.core.Metadata` instance. Received "
                "{}")
            raise ValueError(message.format(metadata))
        self._metadata = metadata

        # Tracks whether both the static and the dynamic data have been
        # loaded into this object; nothing dynamic is loaded at construction.
        self._fully_initialized = False
示例#3
0
  def __init__(self,
               builder,
               description=None,
               features=None,
               supervised_keys=None,
               homepage=None,
               citation=None,
               metadata=None,
               redistribution_info=None):
    """Constructs DatasetInfo.

    Args:
      builder: `DatasetBuilder`, dataset builder for this info.
      description: `str`, description of this dataset.
      features: `tfds.features.FeaturesDict`, Information on the feature dict
        of the `tf.data.Dataset()` object from the `builder.as_dataset()`
        method.
      supervised_keys: `tuple` of `(input_key, target_key)`, Specifies the
        input feature and the label for supervised learning, if applicable for
        the dataset. The keys correspond to the feature names to select in
        `info.features`. When calling `tfds.core.DatasetBuilder.as_dataset()`
        with `as_supervised=True`, the `tf.data.Dataset` object will yield
        the (input, target) defined here.
      homepage: `str`, optional, the homepage for this dataset.
      citation: `str`, optional, the citation to use for this dataset.
      metadata: `tfds.core.Metadata`, additional object which will be
        stored/restored with the dataset. This allows for storing additional
        information with the dataset.
      redistribution_info: `dict`, optional, information needed for
        redistribution, as specified in `dataset_info_pb2.RedistributionInfo`.
        The content of the `license` subfield will automatically be written to a
        LICENSE file stored with the dataset. The dict is not modified.

    Raises:
      ValueError: If `features` is not a `TopLevelFeature`, or if `metadata`
        is truthy but not a `Metadata` instance.
    """
    self._builder = builder

    redistribution_proto = None
    if redistribution_info:
      # Pop from a copy so the caller's dict keeps its `license` entry and can
      # be reused. A missing `license` is passed on as None, which
      # `utils.dedent` already receives for the optional description/citation.
      redistribution_kwargs = dict(redistribution_info)
      license_text = redistribution_kwargs.pop("license", None)
      redistribution_proto = dataset_info_pb2.RedistributionInfo(
          license=utils.dedent(license_text), **redistribution_kwargs)

    self._info_proto = dataset_info_pb2.DatasetInfo(
        name=builder.name,
        description=utils.dedent(description),
        version=str(builder._version),  # pylint: disable=protected-access
        citation=utils.dedent(citation),
        redistribution_info=redistribution_proto)

    if homepage:
      self._info_proto.location.urls[:] = [homepage]

    if features:
      if not isinstance(features, top_level_feature.TopLevelFeature):
        raise ValueError(
            "DatasetInfo.features only supports FeaturesDict or Sequence at "
            "the top-level. Got {}".format(features))
      features._set_top_level()  # pylint: disable=protected-access
    self._features = features
    self._splits = splits_lib.SplitDict(self._builder.name)
    if supervised_keys is not None:
      assert isinstance(supervised_keys, tuple)
      assert len(supervised_keys) == 2
      self._info_proto.supervised_keys.input = supervised_keys[0]
      self._info_proto.supervised_keys.output = supervised_keys[1]

    if metadata and not isinstance(metadata, Metadata):
      raise ValueError(
          "Metadata should be a `tfds.core.Metadata` instance. Received "
          "{}".format(metadata))
    self._metadata = metadata

    # Is this object initialized with both the static and the dynamic data?
    self._fully_initialized = False
示例#4
0
    def __init__(
            self,
            *,
            builder: Union[DatasetIdentity, Any],
            description: Optional[str] = None,
            features: Optional[feature_lib.FeatureConnector] = None,
            supervised_keys: Optional[SupervisedKeysType] = None,
            disable_shuffling: bool = False,
            homepage: Optional[str] = None,
            citation: Optional[str] = None,
            metadata: Optional[Metadata] = None,
            license: Optional[str] = None,  # pylint: disable=redefined-builtin
            redistribution_info: Optional[Dict[str, str]] = None,
            split_dict: Optional[splits_lib.SplitDict] = None):
        # pyformat: disable
        """Constructs DatasetInfo.

        Args:
          builder: `DatasetBuilder` or `DatasetIdentity`. The dataset builder
            or identity will be used to populate this info.
          description: `str`, description of this dataset.
          features: `tfds.features.FeaturesDict`, Information on the feature
            dict of the `tf.data.Dataset()` object from the
            `builder.as_dataset()` method.
          supervised_keys: Specifies the input structure for supervised
            learning, if applicable for the dataset, used with
            "as_supervised". The keys correspond to the feature names to
            select in `info.features`. When calling
            `tfds.core.DatasetBuilder.as_dataset()` with `as_supervised=True`,
            the `tf.data.Dataset` object will yield the structure defined by
            the keys passed here, instead of that defined by the `features`
            argument. Typically this is a `(input_key, target_key)` tuple, and
            the dataset yields a tuple of tensors `(input, target)`.

            To yield a more complex structure, pass a tuple of `tf.nest`
            compatible structures of feature keys. The resulting `Dataset`
            will yield structures with each key replaced by the corresponding
            tensor. For example, passing a triple of keys would return a
            dataset that yields `(feature, target, sample_weights)` triples
            for keras. Using `supervised_keys=({'a':'a','b':'b'}, 'c')` would
            create a dataset yielding a tuple with a dictionary of features in
            the `features` position.

            Note that selecting features in nested
            `tfds.features.FeaturesDict` objects is not supported.
          disable_shuffling: `bool`, whether shuffling of the examples is
            disabled.
          homepage: `str`, optional, the homepage for this dataset.
          citation: `str`, optional, the citation to use for this dataset.
          metadata: `tfds.core.Metadata`, additional object which will be
            stored/restored with the dataset. This allows for storing
            additional information with the dataset.
          license: license of the dataset. When given, it takes precedence
            over a `license` entry in `redistribution_info`.
          redistribution_info: information needed for redistribution, as
            specified in `dataset_info_pb2.RedistributionInfo`. The content of
            the `license` subfield will automatically be written to a LICENSE
            file stored with the dataset. The dict is not modified.
          split_dict: information about the splits in this dataset.

        Raises:
          ValueError: If `features` is not a `TopLevelFeature`, or if
            `metadata` is truthy but not a `Metadata` instance.
        """
        # pyformat: enable
        self._builder_or_identity = builder
        if isinstance(builder, DatasetIdentity):
            self._identity = builder
        else:
            self._identity = DatasetIdentity.from_builder(builder)

        # Build the redistribution proto whenever either source of license
        # information is given, so that `license` alone is not dropped.
        redistribution_proto = None
        if redistribution_info or license:
            # Work on a copy: the caller's dict stays untouched, and removing
            # `license` here avoids passing that keyword twice below.
            redistribution_kwargs = dict(redistribution_info or {})
            dict_license = redistribution_kwargs.pop("license", None)
            redistribution_proto = dataset_info_pb2.RedistributionInfo(
                license=utils.dedent(license or dict_license),
                **redistribution_kwargs)

        self._info_proto = dataset_info_pb2.DatasetInfo(
            name=self._identity.name,
            description=utils.dedent(description),
            version=str(self._identity.version),
            release_notes=self._identity.release_notes,
            disable_shuffling=disable_shuffling,
            config_name=self._identity.config_name,
            config_description=self._identity.config_description,
            citation=utils.dedent(citation),
            module_name=self._identity.module_name,
            redistribution_info=redistribution_proto)

        if homepage:
            self._info_proto.location.urls[:] = [homepage]

        if features:
            if not isinstance(features, top_level_feature.TopLevelFeature):
                raise ValueError(
                    "DatasetInfo.features only supports FeaturesDict or Sequence at "
                    "the top-level. Got {}".format(features))
        self._features = features
        self._splits = splits_lib.SplitDict([])
        if split_dict:
            self.set_splits(split_dict)
        if supervised_keys is not None:
            self._info_proto.supervised_keys.CopyFrom(
                _supervised_keys_to_proto(supervised_keys))

        if metadata and not isinstance(metadata, Metadata):
            raise ValueError(
                "Metadata should be a `tfds.core.Metadata` instance. Received "
                "{}".format(metadata))
        self._metadata = metadata

        # Is this object initialized with both the static and the dynamic data?
        self._fully_initialized = False