def test_bool(self):
  """Checks that a `SplitDict` is falsy when empty and truthy otherwise."""
  split_dict = splits.SplitDict("ds_name")
  # No split registered yet: the container must evaluate to False.
  self.assertFalse(split_dict)

  train_info = tfds.core.SplitInfo(name="train", num_shards=10)
  split_dict.add(train_info)
  # One split registered: the container must evaluate to True.
  self.assertTrue(split_dict)
def __init__(self,
             *,
             builder,
             description=None,
             features=None,
             supervised_keys=None,
             homepage=None,
             citation=None,
             metadata=None,
             redistribution_info=None):
  """Constructs DatasetInfo.

  Args:
    builder: `DatasetBuilder`, dataset builder for this info.
    description: `str`, description of this dataset.
    features: `tfds.features.FeaturesDict`, Information on the feature dict
      of the `tf.data.Dataset()` object from the `builder.as_dataset()`
      method.
    supervised_keys: `tuple` of `(input_key, target_key)`, Specifies the
      input feature and the label for supervised learning, if applicable for
      the dataset. The keys correspond to the feature names to select in
      `info.features`. When calling `tfds.core.DatasetBuilder.as_dataset()`
      with `as_supervised=True`, the `tf.data.Dataset` object will yield
      the (input, target) defined here.
    homepage: `str`, optional, the homepage for this dataset.
    citation: `str`, optional, the citation to use for this dataset.
    metadata: `tfds.core.Metadata`, additional object which will be
      stored/restored with the dataset. This allows for storing additional
      information with the dataset.
    redistribution_info: `dict`, optional, information needed for
      redistribution, as specified in `dataset_info_pb2.RedistributionInfo`.
      The content of the `license` subfield will automatically be written to
      a LICENSE file stored with the dataset. The dict is not modified.

  Raises:
    ValueError: If `features` is not a top-level feature, or if `metadata`
      is provided but is not a `Metadata` instance.
    KeyError: If `redistribution_info` is non-empty but has no `"license"`
      key.
  """
  self._builder = builder

  if redistribution_info:
    # Pop from a shallow copy: popping from the argument itself would strip
    # "license" from the caller's dict, so reusing that dict for a second
    # `DatasetInfo` would fail.
    redistribution_kwargs = dict(redistribution_info)
    redistribution_proto = dataset_info_pb2.RedistributionInfo(
        license=utils.dedent(redistribution_kwargs.pop("license")),
        **redistribution_kwargs)
  else:
    redistribution_proto = None

  self._info_proto = dataset_info_pb2.DatasetInfo(
      name=builder.name,
      description=utils.dedent(description),
      version=str(builder._version),  # pylint: disable=protected-access
      citation=utils.dedent(citation),
      redistribution_info=redistribution_proto)
  if homepage:
    self._info_proto.location.urls[:] = [homepage]

  if features:
    if not isinstance(features, top_level_feature.TopLevelFeature):
      raise ValueError(
          "DatasetInfo.features only supports FeaturesDict or Sequence at "
          "the top-level. Got {}".format(features))
    features._set_top_level()  # pylint: disable=protected-access
  self._features = features
  self._splits = splits_lib.SplitDict(self._builder.name)

  if supervised_keys is not None:
    # Kept as asserts so the exception type seen by callers is unchanged.
    assert isinstance(supervised_keys, tuple)
    assert len(supervised_keys) == 2
    self._info_proto.supervised_keys.input = supervised_keys[0]
    self._info_proto.supervised_keys.output = supervised_keys[1]

  if metadata and not isinstance(metadata, Metadata):
    raise ValueError(
        "Metadata should be a `tfds.core.Metadata` instance. Received "
        "{}".format(metadata))
  self._metadata = metadata

  # Is this object initialized with both the static and the dynamic data?
  self._fully_initialized = False
def test_bool(self):
  """Checks that a `SplitDict` is falsy when empty and truthy otherwise."""
  empty = splits.SplitDict([], dataset_name="ds_name")
  # No split infos: the container must evaluate to False.
  self.assertFalse(empty)

  train_info = tfds.core.SplitInfo(
      name="train", shard_lengths=[5], num_bytes=0)
  populated = splits.SplitDict([train_info], dataset_name="ds_name")
  # One split info: the container must evaluate to True.
  self.assertTrue(populated)
def split_dict(self):
  """Returns a `SplitDict` holding a 10-shard train and a 1-shard test split."""
  result = splits.SplitDict("ds_name")
  # Insertion order is preserved: "train" first, then "test".
  for split_name, shard_count in (("train", 10), ("test", 1)):
    result.add(tfds.core.SplitInfo(name=split_name, num_shards=shard_count))
  return result
def test_empty_split(self):
  """Checks that indexing an empty `SplitDict` raises a descriptive KeyError."""
  empty = splits.SplitDict([], dataset_name="ds_name")
  expected_message = "`splits` is empty"
  with self.assertRaisesWithPredicateMatch(KeyError, expected_message):
    empty["train"]  # pylint: disable=pointless-statement
def test_num_shards(self):
  """Checks that `num_shards` equals the number of entries in `shard_lengths`."""
  train_info = tfds.core.SplitInfo(
      name="train",
      shard_lengths=[1, 2, 3],
      num_bytes=0,
  )
  split_dict = splits.SplitDict([train_info], dataset_name="ds_name")
  train_split = split_dict["train"]
  self.assertEqual(train_split.num_shards, 3)
def _download_and_prepare(
    self,
    dl_manager: download.DownloadManager,
    download_config: download.DownloadConfig,
) -> None:
  """Generates all splits and stores the resulting split infos on `self.info`.

  Nothing is returned: the computed split infos are wrapped in a `SplitDict`
  and passed to `self.info.set_splits`. When
  `download_config.max_examples_per_split` is `0`, the method returns early
  and `self.info` is left untouched.

  Args:
    dl_manager: Download manager forwarded to `self._split_generators`.
    download_config: Configuration providing `max_examples_per_split`,
      `beam_options` and `beam_runner` for the split builder.
  """
  split_builder = split_builder_lib.SplitBuilder(
      split_dict=self.info.splits,
      features=self.info.features,
      max_examples_per_split=download_config.max_examples_per_split,
      beam_options=download_config.beam_options,
      beam_runner=download_config.beam_runner,
      file_format=self._file_format,
  )
  # Wrap the generation inside a context manager.
  # If `beam` is used during generation (when a pipeline gets created),
  # the context manager is equivalent to `with beam.Pipeline()`.
  # Otherwise, this is a no-op.
  # By auto-detecting Beam, the user only has to change `_generate_examples`
  # to go from non-beam to beam dataset:
  # https://www.tensorflow.org/datasets/beam_datasets#instructions
  with split_builder.maybe_beam_pipeline():
    # If the signature has a `pipeline` kwargs, create the pipeline now and
    # forward it to `self._split_generators`
    # We add this magic because the pipeline kwargs is only used by c4 and
    # we do not want to make the API more verbose for a single advanced case.
    signature = inspect.signature(self._split_generators)
    if "pipeline" in signature.parameters.keys():
      optional_pipeline_kwargs = dict(pipeline=split_builder.beam_pipeline)
    else:
      optional_pipeline_kwargs = {}
    split_generators = self._split_generators(  # pylint: disable=unexpected-keyword-arg
        dl_manager, **optional_pipeline_kwargs
    )
    # TODO(tfds): Could be removed once all datasets are migrated.
    # https://github.com/tensorflow/datasets/issues/2537
    # Legacy mode (eventually convert list[SplitGeneratorLegacy] -> dict)
    split_generators = split_builder.normalize_legacy_split_generators(
        split_generators=split_generators,
        generator_fn=self._generate_examples,
        is_beam=isinstance(self, BeamBasedBuilder),
    )

    # Ensure `all` isn't used as key.
    _check_split_names(split_generators.keys())

    # Writer fail if the number of example yield is `0`, so we return here.
    # Note that this skips the `set_splits` call at the end of the method.
    if download_config.max_examples_per_split == 0:
      return

    # Start generating data for all splits
    path_suffix = file_adapters.ADAPTER_FOR_FORMAT[
        self._file_format].FILE_SUFFIX

    # One future per split; each split is written to
    # `<data_path>/<dataset name>-<split name>.<suffix>`.
    split_info_futures = [
        split_builder.submit_split_generation(  # pylint: disable=g-complex-comprehension
            split_name=split_name,
            generator=generator,
            path=self.data_path / f"{self.name}-{split_name}.{path_suffix}",
        )
        for split_name, generator in utils.tqdm(
            split_generators.items(),
            desc="Generating splits...",
            unit=" splits",
            leave=False,
        )
    ]
  # Finalize the splits (after apache beam completed, if it was used)
  # NOTE(review): placed outside the `with` block on the basis of the comment
  # above; confirm against the original indentation.
  split_infos = [future.result() for future in split_info_futures]

  # Update the info object with the splits.
  split_dict = splits_lib.SplitDict(split_infos, dataset_name=self.name)
  self.info.set_splits(split_dict)
def __init__(
    self,
    *,
    builder: Union[DatasetIdentity, Any],
    description: Optional[str] = None,
    features: Optional[feature_lib.FeatureConnector] = None,
    supervised_keys: Optional[SupervisedKeysType] = None,
    disable_shuffling: bool = False,
    homepage: Optional[str] = None,
    citation: Optional[str] = None,
    metadata: Optional[Metadata] = None,
    license: Optional[str] = None,  # pylint: disable=redefined-builtin
    redistribution_info: Optional[Dict[str, str]] = None,
    split_dict: Optional[splits_lib.SplitDict] = None):
  # pyformat: disable
  """Constructs DatasetInfo.

  Args:
    builder: `DatasetBuilder` or `DatasetIdentity`. The dataset builder or
      identity will be used to populate this info.
    description: `str`, description of this dataset.
    features: `tfds.features.FeaturesDict`, Information on the feature dict
      of the `tf.data.Dataset()` object from the `builder.as_dataset()`
      method.
    supervised_keys: Specifies the input structure for supervised learning,
      if applicable for the dataset, used with "as_supervised". The keys
      correspond to the feature names to select in `info.features`. When
      calling `tfds.core.DatasetBuilder.as_dataset()` with
      `as_supervised=True`, the `tf.data.Dataset` object will yield the
      structure defined by the keys passed here, instead of that defined by
      the `features` argument. Typically this is a `(input_key, target_key)`
      tuple, and the dataset yields a tuple of `(input, target)` tensors.

      To yield a more complex structure, pass a tuple of `tf.nest` compatible
      structures of feature keys. The resulting `Dataset` will yield
      structures with each key replaced by the corresponding tensor. For
      example, passing a triple of keys would return a dataset
      that yields `(feature, target, sample_weights)` triples for keras.
      Using `supervised_keys=({'a':'a','b':'b'}, 'c')` would create a dataset
      yielding a tuple with a dictionary of features in the `features`
      position.

      Note that selecting features in nested `tfds.features.FeaturesDict`
      objects is not supported.
    disable_shuffling: `bool`, specify whether to shuffle the examples.
    homepage: `str`, optional, the homepage for this dataset.
    citation: `str`, optional, the citation to use for this dataset.
    metadata: `tfds.core.Metadata`, additional object which will be
      stored/restored with the dataset. This allows for storing additional
      information with the dataset.
    license: license of the dataset. When truthy, it takes precedence over a
      `"license"` entry in `redistribution_info`.
    redistribution_info: information needed for redistribution, as specified
      in `dataset_info_pb2.RedistributionInfo`. The content of the `license`
      subfield will automatically be written to a LICENSE file stored with
      the dataset. The dict is not modified.
    split_dict: information about the splits in this dataset.

  Raises:
    ValueError: If `features` is not a top-level feature, or if `metadata`
      is provided but is not a `Metadata` instance.
  """
  # pyformat: enable
  self._builder_or_identity = builder
  if isinstance(builder, DatasetIdentity):
    self._identity = builder
  else:
    self._identity = DatasetIdentity.from_builder(builder)

  # Resolve the license on a shallow copy of `redistribution_info`:
  #  * popping from the argument itself would mutate the caller's dict;
  #  * leaving "license" in the dict while also passing `license=` would
  #    supply the keyword twice to `RedistributionInfo` (TypeError);
  #  * a `license` given without `redistribution_info` must not be dropped.
  redistribution_kwargs = dict(redistribution_info or {})
  embedded_license = redistribution_kwargs.pop("license", None)
  license_text = license or embedded_license
  if license or redistribution_info:
    if license_text is not None:
      redistribution_kwargs["license"] = utils.dedent(license_text)
    redistribution_proto = dataset_info_pb2.RedistributionInfo(
        **redistribution_kwargs)
  else:
    redistribution_proto = None

  self._info_proto = dataset_info_pb2.DatasetInfo(
      name=self._identity.name,
      description=utils.dedent(description),
      version=str(self._identity.version),
      release_notes=self._identity.release_notes,
      disable_shuffling=disable_shuffling,
      config_name=self._identity.config_name,
      config_description=self._identity.config_description,
      citation=utils.dedent(citation),
      module_name=self._identity.module_name,
      redistribution_info=redistribution_proto)

  if homepage:
    self._info_proto.location.urls[:] = [homepage]

  if features:
    if not isinstance(features, top_level_feature.TopLevelFeature):
      raise ValueError(
          "DatasetInfo.features only supports FeaturesDict or Sequence at "
          "the top-level. Got {}".format(features))
  self._features = features

  # Start from an empty `SplitDict`; `set_splits` fills it in when the caller
  # supplied one.
  self._splits = splits_lib.SplitDict([])
  if split_dict:
    self.set_splits(split_dict)

  if supervised_keys is not None:
    self._info_proto.supervised_keys.CopyFrom(
        _supervised_keys_to_proto(supervised_keys))

  if metadata and not isinstance(metadata, Metadata):
    raise ValueError(
        "Metadata should be a `tfds.core.Metadata` instance. Received "
        "{}".format(metadata))
  self._metadata = metadata

  # Is this object initialized with both the static and the dynamic data?
  self._fully_initialized = False
def _split_dict(splits: Dict[str, Split]) -> splits_lib.SplitDict:
  """Builds a `SplitDict` from the `info` attribute of every split in `splits`."""
  # Only the values are needed; the mapping's keys are ignored.
  split_infos = [split.info for split in splits.values()]
  return splits_lib.SplitDict(split_infos)
def split_dict(self):
  """Returns a `SplitDict` holding a 10-shard train and a 1-shard test split."""
  result = splits.SplitDict()
  # Insertion order is preserved: "train" first, then "test".
  for split_name, shard_count in (("train", 10), ("test", 1)):
    result.add(SplitInfo(name=split_name, num_shards=shard_count))
  return result
def test_empty_split(self):
  """Checks that indexing an empty `SplitDict` raises a descriptive KeyError."""
  empty = splits.SplitDict([])
  expected_message = '`splits` is empty'
  with self.assertRaisesWithPredicateMatch(KeyError, expected_message):
    empty['train']  # pylint: disable=pointless-statement