def _get_builder_names_single_namespace(
    ns_name: str,
    data_dir: utils.ReadOnlyPath,
) -> List[str]:
    """Lists the `ns:name` dataset names found inside one namespace directory."""
    # `data_dir` may contain folders which are not datasets; validating each
    # candidate dataset individually would be too slow, so only the folder
    # name is checked (an acceptable trade-off).
    names = []
    for builder_dir in _maybe_iterdir(data_dir):
        if _is_valid_dataset_name(builder_dir.name):
            names.append(
                str(utils.DatasetName(namespace=ns_name, name=builder_dir.name)))
    return names
# Example 2
def test_builder_cls(dummy_register):  # pylint: disable=redefined-outer-name
    """Loading a builder class installs it (exactly once) in the cache."""
    # Nothing has been installed yet.
    cache_dir = cache.cache_path() / 'modules/tfds_community/kaggle/dummy_dataset'
    assert not cache_dir.exists()

    kaggle_cls = dummy_register.builder_cls(
        utils.DatasetName('kaggle:dummy_dataset'))
    assert kaggle_cls.name == 'dummy_dataset'

    clshash = '1de59094bbe913e9a95aa0cff6f46bc06d813bd5c288eac34950b473e4ef199c'
    expected_code_path = cache_dir / f'{clshash}/dummy_dataset.py'
    assert expected_code_path == kaggle_cls.code_path
    assert 'kaggle' in kaggle_cls.code_path.parts
    assert issubclass(kaggle_cls, dataset_builder.DatasetBuilder)
    # No checksums are installed with the package.
    assert not kaggle_cls.url_infos

    # The dataset is now installed in the cache, under a deterministic
    # (hash-derived) filename: exactly one entry.
    assert sorted(cache_dir.iterdir()) == [cache_dir / clshash]

    # A second lookup must hit the cache: downloading again would raise here.
    with mock.patch.object(
            register_package,
            '_download_and_cache',
            side_effect=ValueError('Dataset should have been cached already')):
        cached_cls = dummy_register.builder_cls(
            utils.DatasetName('kaggle:dummy_dataset'))
    assert kaggle_cls is cached_cls

    # The same dataset name can exist under a different namespace.
    mlds_cls = dummy_register.builder_cls(utils.DatasetName('mlds:dummy_dataset'))
    assert 'mlds' in mlds_cls.code_path.parts
    assert issubclass(mlds_cls, dataset_builder.DatasetBuilder)
    # This package ships checksums, so they have been correctly installed.
    assert 'http://dummy.org/data.txt' in mlds_cls.url_infos

    # Unknown datasets are rejected.
    with pytest.raises(registered.DatasetNotFoundError):
        dummy_register.builder(utils.DatasetName('other:ds0'))
def test_dataset_package():
    """Exports/imports operation should be identity."""
    src_pkg = register_package._DatasetPackage(
        name=utils.DatasetName('ns:ds'),
        source='github://...',
    )
    # json round-trip returns an equal package.
    assert register_package._DatasetPackage.from_json(src_pkg.to_json()) == src_pkg

    installed = register_package._InstalledPackage(
        package=src_pkg,
        filestem='dummy_dataset',
        instalation_date=datetime.datetime.now(),  # [sic] field name in the API
        hash='asdajhdadsadsad',
    )
    # Same identity property for the installed-package wrapper.
    assert register_package._InstalledPackage.from_json(
        installed.to_json()) == installed
# Example 4
def test_dataset_package():
    """Exports/imports operation should be identity."""
    ds_source = dataset_sources.DatasetSource.from_json(
        'github://<owner>/<name>/tree/<branch>/my_ds/ds.py', )
    pkg = register_package.DatasetPackage(
        name=utils.DatasetName('ns:ds'),
        source=ds_source,
    )
    # json round-trip returns an equal package.
    assert register_package.DatasetPackage.from_json(pkg.to_json()) == pkg

    installed = register_package._InstalledPackage(
        package=pkg,
        instalation_date=datetime.datetime.now(),  # [sic] field name in the API
        hash='asdajhdadsadsad',
    )
    # Same identity property for the installed-package wrapper.
    assert register_package._InstalledPackage.from_json(
        installed.to_json()) == installed
# Example 5
def _iter_builder_names(
    ns2data_dir: Dict[str, utils.ReadOnlyPath], ) -> Iterator[str]:
    """Yields the `ns:name` dataset names."""
    skipped_dirnames = frozenset(('downloads', ))
    # For better performance, the namespaces could be loaded asynchronously.
    for ns_name, data_dir in ns2data_dir.items():
        # `data_dir` may contain folders which are not datasets; validating
        # each candidate dataset individually would be too slow, so only the
        # folder name is checked (an acceptable trade-off).
        valid_dirs = (
            d for d in _maybe_iterdir(data_dir)
            if d.name not in skipped_dirnames
            and naming.is_valid_dataset_name(d.name))
        for builder_dir in valid_dirs:
            yield str(utils.DatasetName(namespace=ns_name, name=builder_dir.name))
# Example 6
def test_register_builder(dummy_register):  # pylint: disable=redefined-outer-name
    """Builders resolve per namespace; invalid inputs raise."""
    kaggle_builder = dummy_register.builder(utils.DatasetName('kaggle:ds0'))
    assert 'kaggle' in kaggle_builder.data_path.parts

    # The same dataset name can be loaded from a different namespace.
    mlds_builder = dummy_register.builder(utils.DatasetName('mlds:ds0'))
    assert 'mlds' in mlds_builder.data_path.parts

    mlds_builder = dummy_register.builder(
        utils.DatasetName('mlds:ds0'),
        data_dir=None,  # data_dir can be passed only if None
        version='1.0.0',
    )
    assert 'mlds' in mlds_builder.data_path.parts

    # A non-None data_dir is rejected.
    with pytest.raises(ValueError, match='`data_dir` cannot be set for'):
        dummy_register.builder(
            utils.DatasetName('mlds:ds0'), data_dir='/path/to/data_dir')

    # Unknown namespace.
    with pytest.raises(KeyError, match='Namespace .* not found.'):
        dummy_register.builder(utils.DatasetName('non-existing-namespace:ds0'))

    # Unknown dataset inside a known namespace.
    with pytest.raises(registered.DatasetNotFoundError):
        dummy_register.builder(utils.DatasetName('other:ds0'))
 def from_json(cls, data: utils.Json) -> 'DatasetPackage':
     """Factory which creates the cls from json.

     Args:
       data: Json dict containing the `name` and `source` fields.

     Returns:
       The deserialized `DatasetPackage`.
     """
     name = utils.DatasetName(data['name'])
     source = dataset_sources_lib.DatasetSource.from_json(data['source'])
     return cls(name=name, source=source)