def main(_):
  dataset_name = FLAGS.dataset
  dataset_type = FLAGS.type
  root_dir = FLAGS.tfds_dir
  if not root_dir:
    root_dir = py_utils.tfds_dir()
  data = dict(
      dataset_name=dataset_name,
      dataset_type=dataset_type,
      dataset_cls=naming.snake_to_camelcase(dataset_name),
      TODO='TODO({})'.format(dataset_name),
  )
  create_dataset_file(root_dir, data)
  add_the_init(root_dir, data)
  create_dataset_test_file(root_dir, data)
  create_fake_data(root_dir, data)
  create_checksum_file(root_dir, data)
  print(
      'Dataset generated in {}\n'
      'You can start by searching for TODO({}).\n'
      'Please check '
      '`https://github.com/tensorflow/datasets/blob/master/docs/add_dataset.md` '
      'for details.'.format(root_dir, dataset_name))
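# Hedged usage sketch: how this entry point might be invoked from the shell.
# The module path below is an assumption; the flag names mirror the FLAGS
# read above, and `my_dataset` is a hypothetical dataset name.
#
#   python -m tensorflow_datasets.scripts.create_new_dataset \
#       --dataset my_dataset --type image
#
# With dataset_name='my_dataset', `data` would carry
# dataset_cls='MyDataset' and TODO='TODO(my_dataset)' for the file templates.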
def __post_init__(self):
  self.cls_name = naming.snake_to_camelcase(self.name)
  self.tfds_api = (
      'tensorflow_datasets.public_api'
      if self.in_tfds else 'tensorflow_datasets')
  self.todo = f'TODO({self.name})'

  if self.in_tfds:
    # `/path/to/tensorflow_datasets/image/my_dataset`
    # -> `tensorflow_datasets.image.my_dataset`
    import_parts = itertools.dropwhile(
        lambda p: p != 'tensorflow_datasets', self.path.parts)
    ds_import = '.'.join(import_parts)
  else:
    # For external datasets, it's difficult to correctly infer the full
    # `from my_module.path.datasets.my_dataset import MyDataset`.
    # We could try to auto-infer the absolute import path from `setup.py`.
    # Instead, use a relative import for now: `from . import my_dataset`.
    ds_import = '.'
  self.ds_import = ds_import
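# A minimal, self-contained sketch of the import-path derivation above,
# with `pathlib.Path` standing in for `self.path` (the path itself is
# hypothetical):
import itertools
import pathlib

path = pathlib.Path('/path/to/tensorflow_datasets/image/my_dataset')
# Drop everything before the `tensorflow_datasets` package root, then join
# the remaining parts into a dotted import path.
import_parts = itertools.dropwhile(
    lambda p: p != 'tensorflow_datasets', path.parts)
assert '.'.join(import_parts) == 'tensorflow_datasets.image.my_dataset'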
def test_snake_to_camelcase(self, camel, snake):
  self.assertEqual(naming.snake_to_camelcase(snake), camel)
  # `camelcase_to_snakecase` is a no-op if the name is already snake_case.
  self.assertEqual(naming.camelcase_to_snakecase(snake), snake)
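# Hedged illustration of the (camel, snake) pairs this parameterized test
# might receive (the values are hypothetical, not the exact cases from the
# test suite):
#   naming.snake_to_camelcase('my_dataset')      # -> 'MyDataset'
#   naming.camelcase_to_snakecase('my_dataset')  # -> 'my_dataset' (no-op)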
def write_metadata(
    *,
    data_dir: epath.PathLike,
    features: features_lib.feature.FeatureConnectorArg,
    split_infos: Union[None, epath.PathLike, List[split_lib.SplitInfo]] = None,
    version: Union[None, str, utils.Version] = None,
    check_data: bool = True,
    **ds_info_kwargs,
) -> None:
  """Adds the metadata required to load the data with TFDS.

  See documentation for usage:
  https://www.tensorflow.org/datasets/external_tfrecord

  Args:
    data_dir: Dataset path on which to save the metadata.
    features: Dict of `tfds.features.FeatureConnector` matching the proto
      specs.
    split_infos: Can be either:
      * A path to the pre-computed split info values (the `out_dir` kwarg of
        `tfds.folder_dataset.compute_split_info`)
      * A list of `tfds.core.SplitInfo` (returned value of
        `tfds.folder_dataset.compute_split_info`)
      * `None` to auto-compute the split info.
    version: Optional dataset version (auto-inferred by default, with a
      fallback to `1.0.0`).
    check_data: If True, performs an additional check to validate that the
      data in `data_dir` is valid.
    **ds_info_kwargs: Additional metadata forwarded to `tfds.core.DatasetInfo`
      (description, homepage,...). Will appear in the documentation.
  """
  features = features_lib.features_dict.to_feature(features)
  data_dir = epath.Path(data_dir)

  # Extract the tf-record filenames.
  tfrecord_files = [
      f for f in data_dir.iterdir() if naming.FilenameInfo.is_valid(f.name)
  ]
  if not tfrecord_files:
    raise ValueError(
        f'Could not find tf-record (or compatible format) in {data_dir}. '
        'Make sure to follow the pattern: '
        '`<dataset_name>-<split_name>.<file-extension>-xxxxxx-of-yyyyyy`')
  file_infos = [naming.FilenameInfo.from_str(f.name) for f in tfrecord_files]

  # Use a set with tuple unpacking to ensure all names are consistent.
  snake_name, = {f.dataset_name for f in file_infos}
  camel_name = naming.snake_to_camelcase(snake_name)
  filetype_suffix, = {f.filetype_suffix for f in file_infos}
  file_format = file_adapters.file_format_from_suffix(filetype_suffix)

  cls = types.new_class(
      camel_name,
      bases=(_WriteBuilder,),
      kwds=dict(skip_registration=True),
      exec_body=None,
  )

  if version is None:  # Automatically detect the version.
    if utils.Version.is_valid(data_dir.name):
      version = data_dir.name
    else:
      version = '1.0.0'
  cls.VERSION = utils.Version(version)

  # Create a dummy builder (use a non-existent folder to make sure
  # `dataset_info.json` is not restored).
  builder = cls(file_format=file_format, data_dir='/tmp/non-existent-dir/')

  # Create the metadata.
  ds_info = dataset_info.DatasetInfo(
      builder=builder,
      features=features,
      **ds_info_kwargs,
  )
  ds_info.set_file_format(file_format)

  # Add the split infos.
  split_dict = _load_splits(
      data_dir=data_dir,
      split_infos=split_infos,
      file_infos=file_infos,
      filetype_suffix=filetype_suffix,
      builder=builder,
  )
  ds_info.set_splits(split_dict)

  # Save all metadata (dataset_info.json, features.json,...).
  ds_info.write_to_directory(data_dir)

  # Make sure that the data can be loaded (feature connectors match the
  # actual specs).
  if check_data:
    utils.print_notebook(
        'Metadata written. Testing by reading first example. '
        'Set check_data=False to skip.')
    builder = read_only_builder.builder_from_directory(data_dir)
    split_name = next(iter(builder.info.splits))
    _, = builder.as_dataset(split=f'{split_name}[:1]')  # Load the first example.
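# Hedged usage sketch of `write_metadata`, as exposed via
# `tfds.folder_dataset` (see the external_tfrecord guide linked in the
# docstring). The paths and the feature spec are hypothetical placeholders.
import tensorflow_datasets as tfds

features = tfds.features.FeaturesDict({
    'image': tfds.features.Image(shape=(None, None, 3)),
    'label': tfds.features.ClassLabel(names=['dog', 'cat']),
})

tfds.folder_dataset.write_metadata(
    data_dir='/path/to/my/data_dir',  # Folder containing the tf-record files.
    features=features,
    split_infos=None,  # `None` auto-computes the split info (see docstring).
)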