def _get_builder_names_single_namespace(
    ns_name: str,
    data_dir: utils.ReadOnlyPath,
) -> List[str]:
  """Returns the `ns:name` dataset names found inside `data_dir`."""
  # `data_dir` may also contain non-dataset folders. Validating each dataset
  # individually would cost too much, so filtering on the directory name
  # alone is an acceptable trade-off.
  names = []
  for builder_dir in _maybe_iterdir(data_dir):
    if _is_valid_dataset_name(builder_dir.name):
      names.append(
          str(utils.DatasetName(namespace=ns_name, name=builder_dir.name)))
  return names
def test_builder_cls(dummy_register):  # pylint: disable=redefined-outer-name
  """Loading a builder class installs it in the cache and reuses it after."""
  # The dataset will be installed in the cache.
  installed_path = cache.cache_path()
  installed_path /= 'modules/tfds_community/kaggle/dummy_dataset'
  assert not installed_path.exists()

  ds_name = utils.DatasetName('kaggle:dummy_dataset')
  builder_cls = dummy_register.builder_cls(ds_name)
  assert builder_cls.name == 'dummy_dataset'
  clshash = '1de59094bbe913e9a95aa0cff6f46bc06d813bd5c288eac34950b473e4ef199c'
  assert installed_path / f'{clshash}/dummy_dataset.py' == builder_cls.code_path
  assert 'kaggle' in builder_cls.code_path.parts
  assert issubclass(builder_cls, dataset_builder.DatasetBuilder)
  assert not builder_cls.url_infos  # No checksums installed with the package

  # Dataset installed in the cache; the filename should be deterministic.
  assert list(sorted(installed_path.iterdir())) == [installed_path / clshash]

  # Reusing the dataset should re-use the cache (any new download attempt
  # raises through the mocked side_effect).
  with mock.patch.object(
      register_package,
      '_download_and_cache',
      side_effect=ValueError('Dataset should have been cached already')):
    ds_name = utils.DatasetName('kaggle:dummy_dataset')
    builder_cls2 = dummy_register.builder_cls(ds_name)
  assert builder_cls is builder_cls2

  # Datasets from different namespace can have the same name.
  ds_name = utils.DatasetName('mlds:dummy_dataset')
  builder_cls = dummy_register.builder_cls(ds_name)
  assert 'mlds' in builder_cls.code_path.parts
  assert issubclass(builder_cls, dataset_builder.DatasetBuilder)
  # Checksums have been correctly installed.
  assert 'http://dummy.org/data.txt' in builder_cls.url_infos

  # Unknown dataset raises. Bug fix: the original called
  # `dummy_register.builder(...)` here, exercising the wrong method for a
  # `builder_cls` test.
  with pytest.raises(registered.DatasetNotFoundError):
    dummy_register.builder_cls(utils.DatasetName('other:ds0'))
def test_dataset_package():
  """Exports/imports operation should be identity."""
  # NOTE(review): an identically-named `test_dataset_package` defined later
  # in this file shadows this one, so pytest never collects it — confirm and
  # rename one of the two.
  pkg = register_package._DatasetPackage(
      name=utils.DatasetName('ns:ds'),
      source='github://...',
  )
  # JSON round-trip must reproduce an equal package.
  assert register_package._DatasetPackage.from_json(pkg.to_json()) == pkg
  pkg2 = register_package._InstalledPackage(
      package=pkg,
      filestem='dummy_dataset',
      # `instalation_date` [sic]: the keyword must match the (misspelled)
      # field name declared on `_InstalledPackage`.
      instalation_date=datetime.datetime.now(),
      hash='asdajhdadsadsad',
  )
  assert register_package._InstalledPackage.from_json(pkg2.to_json()) == pkg2
def test_dataset_package():
  """Serializing a package to json and back should be the identity."""
  src = dataset_sources.DatasetSource.from_json(
      'github://<owner>/<name>/tree/<branch>/my_ds/ds.py',
  )
  pkg = register_package.DatasetPackage(
      name=utils.DatasetName('ns:ds'),
      source=src,
  )
  assert register_package.DatasetPackage.from_json(pkg.to_json()) == pkg

  installed = register_package._InstalledPackage(
      package=pkg,
      instalation_date=datetime.datetime.now(),
      hash='asdajhdadsadsad',
  )
  restored = register_package._InstalledPackage.from_json(installed.to_json())
  assert restored == installed
def _iter_builder_names(
    ns2data_dir: Dict[str, utils.ReadOnlyPath],
) -> Iterator[str]:
  """Yields the `ns:name` dataset names found in each namespace data dir."""
  # Directory names which never correspond to datasets.
  skipped_dirnames = frozenset(('downloads',))
  # For better performance, the namespaces could be loaded asynchronously.
  for ns_name, data_dir in ns2data_dir.items():
    # `data_dir` may also contain non-dataset folders. Validating each
    # dataset individually would cost too much, so filtering on the
    # directory name alone is an acceptable trade-off.
    for builder_dir in _maybe_iterdir(data_dir):
      name = builder_dir.name
      if name not in skipped_dirnames and naming.is_valid_dataset_name(name):
        yield str(utils.DatasetName(namespace=ns_name, name=name))
def test_register_builder(dummy_register):  # pylint: disable=redefined-outer-name
  """`builder` resolves datasets per-namespace and validates its arguments."""
  kaggle_builder = dummy_register.builder(utils.DatasetName('kaggle:ds0'))
  assert 'kaggle' in kaggle_builder.data_path.parts

  # The same dataset name can be loaded from a different namespace.
  mlds_builder = dummy_register.builder(utils.DatasetName('mlds:ds0'))
  assert 'mlds' in mlds_builder.data_path.parts

  # Extra builder kwargs are forwarded (`data_dir` accepted only as None).
  mlds_builder = dummy_register.builder(
      utils.DatasetName('mlds:ds0'),
      data_dir=None,
      version='1.0.0',
  )
  assert 'mlds' in mlds_builder.data_path.parts

  # Passing an explicit `data_dir` is rejected.
  with pytest.raises(ValueError, match='`data_dir` cannot be set for'):
    dummy_register.builder(
        utils.DatasetName('mlds:ds0'), data_dir='/path/to/data_dir')

  # Unknown namespace.
  with pytest.raises(KeyError, match='Namespace .* not found.'):
    dummy_register.builder(utils.DatasetName('non-existing-namespace:ds0'))

  # Known namespace, but the dataset does not exist.
  with pytest.raises(registered.DatasetNotFoundError):
    dummy_register.builder(utils.DatasetName('other:ds0'))
def from_json(cls, data: utils.Json) -> 'DatasetPackage':
  """Factory building a `DatasetPackage` from its json representation."""
  # Presumably decorated as a `@classmethod` where it is declared — the
  # decorator is outside this view.
  name = utils.DatasetName(data['name'])
  source = dataset_sources_lib.DatasetSource.from_json(data['source'])
  return cls(name=name, source=source)