def test_resolve_patterns_in_dataset_repository_sorted_files():
    """Check that files resolved from a dataset repository come back sorted.

    The sibling names are deliberately listed out of order: with an
    already-sorted listing the final assertion would also hold for an
    implementation that simply preserves the repository listing order, so
    the test would not actually exercise the sorting.
    """
    unsorted_names = ["3.txt", "0.txt", "2.txt"]
    siblings = [{"rfilename": name} for name in unsorted_names]
    datasets_infos = DatasetInfo(id="test_unsorted_files", siblings=siblings, sha="foobar")
    resolved_data_files = resolve_patterns_in_dataset_repository(datasets_infos, ["*"])
    # Compare base names only; the resolved entries may carry a repository prefix.
    resolved_names = [os.path.basename(data_file) for data_file in resolved_data_files]
    assert resolved_names == sorted(unsorted_names)
def hub_dataset_info(complex_data_dir):
    """Build a ``DatasetInfo`` whose siblings mirror the files under ``complex_data_dir``.

    Every regular file found recursively below ``complex_data_dir`` becomes one
    sibling entry whose ``rfilename`` is the file's POSIX-style path relative
    to that directory. The ``sha`` and ``id`` values are fixed placeholders.
    """
    repo_files = []
    for entry in Path(complex_data_dir).rglob("*"):
        if not entry.is_file():
            # Directories are walked by rglob but are not listed as siblings.
            continue
        relative_name = entry.relative_to(complex_data_dir).as_posix()
        repo_files.append({"rfilename": relative_name})
    return DatasetInfo(siblings=repo_files, sha="foobarfoobar", id="foo")
def test_fail_resolve_data_files_in_dataset_repository(complex_data_dir):
    """Expect ``FileNotFoundError`` when resolving the pattern ``"blablabla"``.

    The repository listing is built from every regular file found recursively
    under ``complex_data_dir``; the pattern is presumably chosen so that it
    matches none of them.
    """
    root = Path(complex_data_dir)
    relative_names = (
        entry.relative_to(complex_data_dir).as_posix()
        for entry in root.rglob("*")
        if entry.is_file()
    )
    dataset_info = DatasetInfo(siblings=[{"rfilename": name} for name in relative_names])

    with pytest.raises(FileNotFoundError):
        _resolve_data_files_in_dataset_repository(dataset_info, "blablabla")
def test_resolve_data_files_in_dataset_repository_with_extensions(complex_data_dir, pattern, size, extensions):
    """Resolve ``pattern`` with an extension allow-list and check the match count.

    When ``size`` is positive, exactly ``size`` files must be resolved.
    Otherwise the resolution is expected to raise ``FileNotFoundError``.
    """
    siblings = []
    for entry in Path(complex_data_dir).rglob("*"):
        if entry.is_file():
            siblings.append({"rfilename": entry.relative_to(complex_data_dir).as_posix()})
    dataset_info = DatasetInfo(siblings=siblings)

    if size > 0:
        matched = _resolve_data_files_in_dataset_repository(
            dataset_info, pattern, allowed_extensions=extensions
        )
        assert len(matched) == size
        return

    # Nothing is expected to match: the call itself must raise.
    with pytest.raises(FileNotFoundError):
        _resolve_data_files_in_dataset_repository(dataset_info, pattern, allowed_extensions=extensions)
def test_resolve_data_files_in_dataset_repository(complex_data_dir, pattern, size):
    """Resolve ``pattern`` against a repository listing and compare with a local glob.

    The expected result is computed by globbing ``complex_data_dir`` with the
    same pattern, keeping regular files only and skipping ``.dummy`` and
    ``README.md``. The resolved files must match that set, number ``size``,
    be ``PurePath`` instances, and point at existing files once joined to
    ``complex_data_dir``.
    """
    root = Path(complex_data_dir)
    siblings = [
        {"rfilename": entry.relative_to(complex_data_dir).as_posix()}
        for entry in root.rglob("*")
        if entry.is_file()
    ]
    dataset_info = DatasetInfo(siblings=siblings)
    resolved_data_files = _resolve_data_files_in_dataset_repository(dataset_info, pattern)

    ignored_names = {".dummy", "README.md"}
    expected_paths = []
    for candidate in root.rglob(pattern):
        if candidate.name in ignored_names or not candidate.is_file():
            continue
        expected_paths.append(candidate.relative_to(complex_data_dir))

    assert len(resolved_data_files) == size
    assert sorted(resolved_data_files) == sorted(expected_paths)
    assert all(isinstance(path, PurePath) for path in resolved_data_files)
    assert all((root / path).is_file() for path in resolved_data_files)