def test_DataFilesDict_from_hf_repo_with_base_path(hub_dataset_info, pattern, size, base_path, split_name):
    if size > 0:
        data_files = DataFilesDict.from_hf_repo({split_name: [pattern]}, hub_dataset_info, base_path=base_path)
        assert len(data_files[split_name]) == size
    else:
        with pytest.raises(FileNotFoundError):
            data_files = DataFilesDict.from_hf_repo({split_name: [pattern]}, hub_dataset_info, base_path=base_path)

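# Note: the parametrize decorators for the test above are not part of this extract.
# A minimal sketch of how it could be driven, assuming hypothetical patterns and
# sizes (the values below are illustrative assumptions, not the repo's actual
# parameters):
#
# @pytest.mark.parametrize("split_name", ["train", "test"])
# @pytest.mark.parametrize(
#     "pattern,size,base_path",
#     [
#         ("**", 4, None),             # match everything under the repo root
#         ("**", 2, "data"),           # same pattern, resolved under a base path
#         ("nonexistent/*", 0, None),  # no match -> FileNotFoundError branch
#     ],
# )
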
@pytest.fixture
def data_files_with_two_splits_and_metadata(tmp_path, image_file):
    data_dir = tmp_path / "imagefolder_data_dir_with_metadata"
    data_dir.mkdir(parents=True, exist_ok=True)
    train_dir = data_dir / "train"
    train_dir.mkdir(parents=True, exist_ok=True)
    test_dir = data_dir / "test"
    test_dir.mkdir(parents=True, exist_ok=True)
    image_filename = train_dir / "image_rgb.jpg"  # train image
    shutil.copyfile(image_file, image_filename)
    image_filename2 = train_dir / "image_rgb2.jpg"  # train image
    shutil.copyfile(image_file, image_filename2)
    image_filename3 = test_dir / "image_rgb3.jpg"  # test image
    shutil.copyfile(image_file, image_filename3)
    train_image_metadata_filename = train_dir / "metadata.jsonl"
    image_metadata = textwrap.dedent(
        """\
        {"file_name": "image_rgb.jpg", "caption": "Nice train image"}
        {"file_name": "image_rgb2.jpg", "caption": "Nice second train image"}
        """
    )
    with open(train_image_metadata_filename, "w", encoding="utf-8") as f:
        f.write(image_metadata)
    test_image_metadata_filename = test_dir / "metadata.jsonl"
    image_metadata = textwrap.dedent(
        """\
        {"file_name": "image_rgb3.jpg", "caption": "Nice test image"}
        """
    )
    with open(test_image_metadata_filename, "w", encoding="utf-8") as f:
        f.write(image_metadata)
    data_files_with_two_splits_and_metadata = DataFilesDict.from_local_or_remote(
        get_patterns_locally(data_dir), data_dir
    )
    assert len(data_files_with_two_splits_and_metadata) == 2
    assert len(data_files_with_two_splits_and_metadata["train"]) == 3
    assert len(data_files_with_two_splits_and_metadata["test"]) == 2
    return data_files_with_two_splits_and_metadata

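# Layout produced by the fixture above; each metadata.jsonl counts as a data file,
# which is why "train" resolves to 3 files and "test" to 2:
#
# imagefolder_data_dir_with_metadata/
# ├── train/
# │   ├── image_rgb.jpg
# │   ├── image_rgb2.jpg
# │   └── metadata.jsonl
# └── test/
#     ├── image_rgb3.jpg
#     └── metadata.jsonl
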
@pytest.fixture
def data_files_with_one_split_and_metadata(tmp_path, image_file):
    data_dir = tmp_path / "imagefolder_data_dir_with_metadata"
    data_dir.mkdir(parents=True, exist_ok=True)
    subdir = data_dir / "subdir"
    subdir.mkdir(parents=True, exist_ok=True)
    image_filename = data_dir / "image_rgb.jpg"
    shutil.copyfile(image_file, image_filename)
    image_filename2 = data_dir / "image_rgb2.jpg"
    shutil.copyfile(image_file, image_filename2)
    image_filename3 = subdir / "image_rgb3.jpg"  # in subdir
    shutil.copyfile(image_file, image_filename3)
    image_metadata_filename = data_dir / "metadata.jsonl"
    image_metadata = textwrap.dedent(
        """\
        {"file_name": "image_rgb.jpg", "caption": "Nice image"}
        {"file_name": "image_rgb2.jpg", "caption": "Nice second image"}
        {"file_name": "subdir/image_rgb3.jpg", "caption": "Nice third image"}
        """
    )
    with open(image_metadata_filename, "w", encoding="utf-8") as f:
        f.write(image_metadata)
    data_files_with_one_split_and_metadata = DataFilesDict.from_local_or_remote(
        get_patterns_locally(data_dir), data_dir
    )
    assert len(data_files_with_one_split_and_metadata) == 1
    assert len(data_files_with_one_split_and_metadata["train"]) == 4
    return data_files_with_one_split_and_metadata

@pytest.fixture
def data_files_with_zip_archives(tmp_path, image_file):
    from PIL import Image, ImageOps

    data_dir = tmp_path / "imagefolder_data_dir_with_zip_archives"
    data_dir.mkdir(parents=True, exist_ok=True)
    archive_dir = data_dir / "archive"
    archive_dir.mkdir(parents=True, exist_ok=True)
    subdir = archive_dir / "subdir"
    subdir.mkdir(parents=True, exist_ok=True)
    image_filename = archive_dir / "image_rgb.jpg"
    shutil.copyfile(image_file, image_filename)
    image_filename2 = subdir / "image_rgb2.jpg"  # in subdir
    # make sure the two images are different, since we won't be able to compare
    # image.filename: the archive is not extracted in streaming mode
    ImageOps.flip(Image.open(image_file)).save(image_filename2)
    image_metadata_filename = archive_dir / "metadata.jsonl"
    image_metadata = textwrap.dedent(
        """\
        {"file_name": "image_rgb.jpg", "caption": "Nice image"}
        {"file_name": "subdir/image_rgb2.jpg", "caption": "Nice second image"}
        """
    )
    with open(image_metadata_filename, "w", encoding="utf-8") as f:
        f.write(image_metadata)
    shutil.make_archive(archive_dir, "zip", archive_dir)
    shutil.rmtree(str(archive_dir))
    data_files_with_zip_archives = DataFilesDict.from_local_or_remote(
        get_patterns_locally(data_dir), data_dir
    )
    assert len(data_files_with_zip_archives) == 1
    assert len(data_files_with_zip_archives["train"]) == 1
    return data_files_with_zip_archives

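# After make_archive + rmtree, data_dir contains only archive.zip, so the resolved
# "train" split holds a single data file (the archive itself):
#
# imagefolder_data_dir_with_zip_archives/
# └── archive.zip
#     ├── image_rgb.jpg
#     ├── metadata.jsonl
#     └── subdir/
#         └── image_rgb2.jpg
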
def test_DataFilesDict_from_local_or_remote(complex_data_dir, pattern_results, pattern):
    split_name = "train"
    try:
        data_files = DataFilesDict.from_local_or_remote({split_name: [pattern]}, complex_data_dir)
        assert all(isinstance(data_files_list, DataFilesList) for data_files_list in data_files.values())
        assert sorted(str(f) for f in data_files[split_name]) == pattern_results[pattern]
        assert all(isinstance(url, Path) for url in data_files[split_name])
    except FileNotFoundError:
        assert len(pattern_results[pattern]) == 0

def test_DataFilesDict_from_hf_repo(hub_dataset_info, hub_dataset_info_patterns_results, pattern):
    split_name = "train"
    try:
        data_files = DataFilesDict.from_hf_repo({split_name: [pattern]}, hub_dataset_info)
        assert all(isinstance(data_files_list, DataFilesList) for data_files_list in data_files.values())
        assert sorted(str(f) for f in data_files[split_name]) == hub_dataset_info_patterns_results[pattern]
        assert all(isinstance(url, Url) for url in data_files[split_name])
    except FileNotFoundError:
        assert len(hub_dataset_info_patterns_results[pattern]) == 0

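# The two tests above exercise the same resolution logic against different backends:
# local resolution yields Path entries while hub resolution yields Url entries, and
# in both cases a pattern with zero matches raises FileNotFoundError rather than
# returning an empty list, which is why the except branch checks that the expected
# result set is empty.
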
def test_data_files_with_wrong_image_file_name_column_in_metadata_file(cache_dir, tmp_path, image_file):
    data_dir = tmp_path / "data_dir_with_bad_metadata"
    data_dir.mkdir(parents=True, exist_ok=True)
    shutil.copyfile(image_file, data_dir / "image_rgb.jpg")
    image_metadata_filename = data_dir / "metadata.jsonl"
    image_metadata = textwrap.dedent(  # with bad column "bad_file_name" instead of "file_name"
        """\
        {"bad_file_name": "image_rgb.jpg", "caption": "Nice image"}
        """
    )
    with open(image_metadata_filename, "w", encoding="utf-8") as f:
        f.write(image_metadata)
    data_files_with_bad_metadata = DataFilesDict.from_local_or_remote(
        get_patterns_locally(data_dir), data_dir
    )
    imagefolder = ImageFolder(data_files=data_files_with_bad_metadata, cache_dir=cache_dir)
    with pytest.raises(ValueError) as exc_info:
        imagefolder.download_and_prepare()
    assert "`file_name` must be present" in str(exc_info.value)

def test_data_files_with_wrong_metadata_file_name(cache_dir, tmp_path, image_file):
    data_dir = tmp_path / "data_dir_with_bad_metadata"
    data_dir.mkdir(parents=True, exist_ok=True)
    shutil.copyfile(image_file, data_dir / "image_rgb.jpg")
    image_metadata_filename = data_dir / "bad_metadata.jsonl"  # bad file name
    image_metadata = textwrap.dedent(
        """\
        {"file_name": "image_rgb.jpg", "caption": "Nice image"}
        """
    )
    with open(image_metadata_filename, "w", encoding="utf-8") as f:
        f.write(image_metadata)
    data_files_with_bad_metadata = DataFilesDict.from_local_or_remote(
        get_patterns_locally(data_dir), data_dir
    )
    imagefolder = ImageFolder(data_files=data_files_with_bad_metadata, cache_dir=cache_dir)
    imagefolder.download_and_prepare()
    dataset = imagefolder.as_dataset(split="train")
    # check that there is no metadata, since the metadata file doesn't have the expected name
    assert "caption" not in dataset.column_names

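# Taken together, the two tests above cover both metadata failure modes: a correctly
# named metadata.jsonl with a wrong column fails loudly (ValueError at prepare time),
# while a wrongly named metadata file is silently ignored (the images load, but
# without the caption column).
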
def test_DataFilesDict_from_hf_repo_hashing(hub_dataset_info):
    patterns = {"train": ["**/train.txt"], "test": ["**/test.txt"]}
    data_files1 = DataFilesDict.from_hf_repo(patterns, hub_dataset_info)
    data_files2 = DataFilesDict.from_hf_repo(patterns, hub_dataset_info)
    assert Hasher.hash(data_files1) == Hasher.hash(data_files2)

    data_files2 = DataFilesDict(sorted(data_files1.items(), reverse=True))
    assert Hasher.hash(data_files1) == Hasher.hash(data_files2)

    patterns2 = {"train": ["data/train.txt"], "test": ["data/test.txt"]}
    data_files2 = DataFilesDict.from_hf_repo(patterns2, hub_dataset_info)
    assert Hasher.hash(data_files1) == Hasher.hash(data_files2)

    patterns2 = {"train": ["data/train.txt"], "test": ["data/train.txt"]}
    data_files2 = DataFilesDict.from_hf_repo(patterns2, hub_dataset_info)
    assert Hasher.hash(data_files1) != Hasher.hash(data_files2)

    with patch.object(hub_dataset_info, "id", "blabla"):
        data_files2 = DataFilesDict.from_hf_repo(patterns, hub_dataset_info)
        assert Hasher.hash(data_files1) != Hasher.hash(data_files2)

    with patch.object(hub_dataset_info, "sha", "blabla"):
        data_files2 = DataFilesDict.from_hf_repo(patterns, hub_dataset_info)
        assert Hasher.hash(data_files1) != Hasher.hash(data_files2)

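# The test above pins down the hashing contract for hub-resolved DataFilesDict:
# - resolving the same patterns twice gives the same hash (determinism)
# - key order does not matter (reordering the items preserves the hash)
# - two different patterns that resolve to the same files hash the same
# - the hash changes when the resolved files differ, or when the repo id or sha changes
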
def test_DataFilesDict_from_hf_local_or_remote_hashing(text_file):
    patterns = {"train": [_TEST_URL], "test": [str(text_file)]}
    data_files1 = DataFilesDict.from_local_or_remote(patterns)
    data_files2 = DataFilesDict.from_local_or_remote(patterns)
    assert Hasher.hash(data_files1) == Hasher.hash(data_files2)

    data_files2 = DataFilesDict(sorted(data_files1.items(), reverse=True))
    assert Hasher.hash(data_files1) == Hasher.hash(data_files2)

    patterns2 = {"train": [_TEST_URL], "test": [_TEST_URL]}
    data_files2 = DataFilesDict.from_local_or_remote(patterns2)
    assert Hasher.hash(data_files1) != Hasher.hash(data_files2)

    with patch("datasets.data_files.request_etag") as mock_request_etag:
        mock_request_etag.return_value = "blabla"
        data_files2 = DataFilesDict.from_local_or_remote(patterns)
        assert Hasher.hash(data_files1) != Hasher.hash(data_files2)

    with patch("datasets.data_files.os.path.getmtime") as mock_getmtime:
        mock_getmtime.return_value = 123
        data_files2 = DataFilesDict.from_local_or_remote(patterns)
        assert Hasher.hash(data_files1) != Hasher.hash(data_files2)

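# Counterpart of the hub hashing test, for local/remote resolution: the hash is
# deterministic and insensitive to key order, and it incorporates per-file state.
# For remote URLs that state is the ETag (request_etag); for local files it is
# os.path.getmtime. That is why mocking either one changes the hash.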