Example #1
def test_DataFilesDict_from_hf_repo_with_base_path(hub_dataset_info, pattern, size, base_path, split_name):
    if size > 0:
        data_files = DataFilesDict.from_hf_repo({split_name: [pattern]}, hub_dataset_info, base_path=base_path)
        assert len(data_files[split_name]) == size
    else:
        with pytest.raises(FileNotFoundError):
            data_files = DataFilesDict.from_hf_repo({split_name: [pattern]}, hub_dataset_info, base_path=base_path)
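The `pattern`, `size`, `base_path`, and `split_name` arguments come from a `@pytest.mark.parametrize` decorator that is not shown here. A minimal sketch of what it could look like, placed directly above the test function (the patterns and sizes below are illustrative assumptions, not the original fixture values):

@pytest.mark.parametrize(
    "pattern,size,base_path,split_name",
    [
        ("**", 4, None, "train"),                # resolve against the repo root
        ("data/**", 2, None, "train"),           # resolve files under a subdirectory
        ("**", 0, "non_existing_dir", "train"),  # no match: the FileNotFoundError branch
    ],
)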
Example #2
def data_files_with_two_splits_and_metadata(tmp_path, image_file):
    data_dir = tmp_path / "imagefolder_data_dir_with_metadata"
    data_dir.mkdir(parents=True, exist_ok=True)
    train_dir = data_dir / "train"
    train_dir.mkdir(parents=True, exist_ok=True)
    test_dir = data_dir / "test"
    test_dir.mkdir(parents=True, exist_ok=True)

    image_filename = train_dir / "image_rgb.jpg"  # train image
    shutil.copyfile(image_file, image_filename)
    image_filename2 = train_dir / "image_rgb2.jpg"  # train image
    shutil.copyfile(image_file, image_filename2)
    image_filename3 = test_dir / "image_rgb3.jpg"  # test image
    shutil.copyfile(image_file, image_filename3)

    train_image_metadata_filename = train_dir / "metadata.jsonl"
    image_metadata = textwrap.dedent("""\
        {"file_name": "image_rgb.jpg", "caption": "Nice train image"}
        {"file_name": "image_rgb2.jpg", "caption": "Nice second train image"}
        """)
    with open(train_image_metadata_filename, "w", encoding="utf-8") as f:
        f.write(image_metadata)
    test_image_metadata_filename = test_dir / "metadata.jsonl"
    image_metadata = textwrap.dedent("""\
        {"file_name": "image_rgb3.jpg", "caption": "Nice test image"}
        """)
    with open(test_image_metadata_filename, "w", encoding="utf-8") as f:
        f.write(image_metadata)
    data_files_with_two_splits_and_metadata = DataFilesDict.from_local_or_remote(
        get_patterns_locally(data_dir), data_dir)
    assert len(data_files_with_two_splits_and_metadata) == 2
    assert len(data_files_with_two_splits_and_metadata["train"]) == 3
    assert len(data_files_with_two_splits_and_metadata["test"]) == 2
    return data_files_with_two_splits_and_metadata
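For reference, the on-disk layout the fixture above builds (it is presumably registered with `@pytest.fixture` in the original module), which explains the 3/2 file counts asserted before returning:

imagefolder_data_dir_with_metadata/
├── train/
│   ├── image_rgb.jpg
│   ├── image_rgb2.jpg
│   └── metadata.jsonl      -> "train" split: 3 data files
└── test/
    ├── image_rgb3.jpg
    └── metadata.jsonl      -> "test" split: 2 data files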
Example #3
def data_files_with_one_split_and_metadata(tmp_path, image_file):
    data_dir = tmp_path / "imagefolder_data_dir_with_metadata"
    data_dir.mkdir(parents=True, exist_ok=True)
    subdir = data_dir / "subdir"
    subdir.mkdir(parents=True, exist_ok=True)

    image_filename = data_dir / "image_rgb.jpg"
    shutil.copyfile(image_file, image_filename)
    image_filename2 = data_dir / "image_rgb2.jpg"
    shutil.copyfile(image_file, image_filename2)
    image_filename3 = subdir / "image_rgb3.jpg"  # in subdir
    shutil.copyfile(image_file, image_filename3)

    image_metadata_filename = data_dir / "metadata.jsonl"
    image_metadata = textwrap.dedent("""\
        {"file_name": "image_rgb.jpg", "caption": "Nice image"}
        {"file_name": "image_rgb2.jpg", "caption": "Nice second image"}
        {"file_name": "subdir/image_rgb3.jpg", "caption": "Nice third image"}
        """)
    with open(image_metadata_filename, "w", encoding="utf-8") as f:
        f.write(image_metadata)
    data_files_with_one_split_and_metadata = DataFilesDict.from_local_or_remote(
        get_patterns_locally(data_dir), data_dir)
    assert len(data_files_with_one_split_and_metadata) == 1
    assert len(data_files_with_one_split_and_metadata["train"]) == 4
    return data_files_with_one_split_and_metadata
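A minimal sketch of how such a fixture is typically consumed, reusing the ImageFolder API from examples #7 and #8 below (the test name and the final assertion are illustrative assumptions, not from the original suite):

def test_imagefolder_with_metadata(data_files_with_one_split_and_metadata, cache_dir):
    imagefolder = ImageFolder(data_files=data_files_with_one_split_and_metadata,
                              cache_dir=cache_dir)
    imagefolder.download_and_prepare()
    dataset = imagefolder.as_dataset(split="train")
    # with a well-formed metadata.jsonl, the metadata columns should be loaded
    assert "caption" in dataset.column_names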
Example #4
def data_files_with_zip_archives(tmp_path, image_file):
    from PIL import Image, ImageOps

    data_dir = tmp_path / "imagefolder_data_dir_with_zip_archives"
    data_dir.mkdir(parents=True, exist_ok=True)
    archive_dir = data_dir / "archive"
    archive_dir.mkdir(parents=True, exist_ok=True)
    subdir = archive_dir / "subdir"
    subdir.mkdir(parents=True, exist_ok=True)

    image_filename = archive_dir / "image_rgb.jpg"
    shutil.copyfile(image_file, image_filename)
    image_filename2 = subdir / "image_rgb2.jpg"  # in subdir
    # make sure they're two different images
    # (we can't compare image.filename instead, since the archive is not extracted in streaming mode)
    ImageOps.flip(Image.open(image_file)).save(image_filename2)

    image_metadata_filename = archive_dir / "metadata.jsonl"
    image_metadata = textwrap.dedent("""\
        {"file_name": "image_rgb.jpg", "caption": "Nice image"}
        {"file_name": "subdir/image_rgb2.jpg", "caption": "Nice second image"}
        """)
    with open(image_metadata_filename, "w", encoding="utf-8") as f:
        f.write(image_metadata)

    # zip the contents of archive_dir into data_dir / "archive.zip", then drop the source dir
    shutil.make_archive(str(archive_dir), "zip", archive_dir)
    shutil.rmtree(str(archive_dir))

    data_files_with_zip_archives = DataFilesDict.from_local_or_remote(
        get_patterns_locally(data_dir), data_dir)

    assert len(data_files_with_zip_archives) == 1
    assert len(data_files_with_zip_archives["train"]) == 1
    return data_files_with_zip_archives
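After make_archive and rmtree, only the zip remains inside data_dir, which is why the resolved "train" split contains exactly one data file:

imagefolder_data_dir_with_zip_archives/
└── archive.zip
    ├── image_rgb.jpg
    ├── metadata.jsonl
    └── subdir/
        └── image_rgb2.jpg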
Example #5
def test_DataFilesDict_from_local_or_remote(complex_data_dir, pattern_results, pattern):
    split_name = "train"
    try:
        data_files = DataFilesDict.from_local_or_remote({split_name: [pattern]}, complex_data_dir)
        assert all(isinstance(data_files_list, DataFilesList) for data_files_list in data_files.values())
        assert sorted(str(f) for f in data_files[split_name]) == pattern_results[pattern]
        assert all(isinstance(data_file, Path) for data_file in data_files[split_name])
    except FileNotFoundError:
        assert len(pattern_results[pattern]) == 0
Example #6
def test_DataFilesDict_from_hf_repo(hub_dataset_info, hub_dataset_info_patterns_results, pattern):
    split_name = "train"
    try:
        data_files = DataFilesDict.from_hf_repo({split_name: [pattern]}, hub_dataset_info)
        assert all(isinstance(data_files_list, DataFilesList) for data_files_list in data_files.values())
        assert sorted(str(f) for f in data_files[split_name]) == hub_dataset_info_patterns_results[pattern]
        assert all(isinstance(url, Url) for url in data_files[split_name])
    except FileNotFoundError:
        assert len(hub_dataset_info_patterns_results[pattern]) == 0
Example #7
def test_data_files_with_wrong_image_file_name_column_in_metadata_file(
        cache_dir, tmp_path, image_file):
    data_dir = tmp_path / "data_dir_with_bad_metadata"
    data_dir.mkdir(parents=True, exist_ok=True)
    shutil.copyfile(image_file, data_dir / "image_rgb.jpg")
    image_metadata_filename = data_dir / "metadata.jsonl"
    image_metadata = textwrap.dedent(  # with bad column "bad_file_name" instead of "file_name"
        """\
        {"bad_file_name": "image_rgb.jpg", "caption": "Nice image"}
        """)
    with open(image_metadata_filename, "w", encoding="utf-8") as f:
        f.write(image_metadata)

    data_files_with_bad_metadata = DataFilesDict.from_local_or_remote(
        get_patterns_locally(data_dir), data_dir)
    imagefolder = ImageFolder(data_files=data_files_with_bad_metadata,
                              cache_dir=cache_dir)
    with pytest.raises(ValueError) as exc_info:
        imagefolder.download_and_prepare()
    assert "`file_name` must be present" in str(exc_info.value)
Example #8
def test_data_files_with_wrong_metadata_file_name(cache_dir, tmp_path,
                                                  image_file):
    data_dir = tmp_path / "data_dir_with_bad_metadata"
    data_dir.mkdir(parents=True, exist_ok=True)
    shutil.copyfile(image_file, data_dir / "image_rgb.jpg")
    image_metadata_filename = data_dir / "bad_metadata.jsonl"  # bad file
    image_metadata = textwrap.dedent("""\
        {"file_name": "image_rgb.jpg", "caption": "Nice image"}
        """)
    with open(image_metadata_filename, "w", encoding="utf-8") as f:
        f.write(image_metadata)

    data_files_with_bad_metadata = DataFilesDict.from_local_or_remote(
        get_patterns_locally(data_dir), data_dir)
    imagefolder = ImageFolder(data_files=data_files_with_bad_metadata,
                              cache_dir=cache_dir)
    imagefolder.download_and_prepare()
    dataset = imagefolder.as_dataset(split="train")
    # check that no metadata was loaded, since the metadata file does not have the expected name ("metadata.jsonl")
    assert "caption" not in dataset.column_names
Example #9
def test_DataFilesDict_from_hf_repo_hashing(hub_dataset_info):
    patterns = {"train": ["**/train.txt"], "test": ["**/test.txt"]}
    data_files1 = DataFilesDict.from_hf_repo(patterns, hub_dataset_info)
    data_files2 = DataFilesDict.from_hf_repo(patterns, hub_dataset_info)
    assert Hasher.hash(data_files1) == Hasher.hash(data_files2)

    data_files2 = DataFilesDict(sorted(data_files1.items(), reverse=True))
    assert Hasher.hash(data_files1) == Hasher.hash(data_files2)

    patterns2 = {"train": ["data/train.txt"], "test": ["data/test.txt"]}
    data_files2 = DataFilesDict.from_hf_repo(patterns2, hub_dataset_info)
    assert Hasher.hash(data_files1) == Hasher.hash(data_files2)

    patterns2 = {"train": ["data/train.txt"], "test": ["data/train.txt"]}
    data_files2 = DataFilesDict.from_hf_repo(patterns2, hub_dataset_info)
    assert Hasher.hash(data_files1) != Hasher.hash(data_files2)

    with patch.object(hub_dataset_info, "id", "blabla"):
        data_files2 = DataFilesDict.from_hf_repo(patterns, hub_dataset_info)
        assert Hasher.hash(data_files1) != Hasher.hash(data_files2)

    with patch.object(hub_dataset_info, "sha", "blabla"):
        data_files2 = DataFilesDict.from_hf_repo(patterns, hub_dataset_info)
        assert Hasher.hash(data_files1) != Hasher.hash(data_files2)
Example #10
def test_DataFilesDict_from_hf_local_or_remote_hashing(text_file):
    patterns = {"train": [_TEST_URL], "test": [str(text_file)]}
    data_files1 = DataFilesDict.from_local_or_remote(patterns)
    data_files2 = DataFilesDict.from_local_or_remote(patterns)
    assert Hasher.hash(data_files1) == Hasher.hash(data_files2)

    data_files2 = DataFilesDict(sorted(data_files1.items(), reverse=True))
    assert Hasher.hash(data_files1) == Hasher.hash(data_files2)

    patterns2 = {"train": [_TEST_URL], "test": [_TEST_URL]}
    data_files2 = DataFilesDict.from_local_or_remote(patterns2)
    assert Hasher.hash(data_files1) != Hasher.hash(data_files2)

    with patch("datasets.data_files.request_etag") as mock_request_etag:
        mock_request_etag.return_value = "blabla"
        data_files2 = DataFilesDict.from_local_or_remote(patterns)
        assert Hasher.hash(data_files1) != Hasher.hash(data_files2)

    with patch("datasets.data_files.os.path.getmtime") as mock_getmtime:
        mock_getmtime.return_value = 123
        data_files2 = DataFilesDict.from_local_or_remote(patterns)
        assert Hasher.hash(data_files1) != Hasher.hash(data_files2)
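A sketch of why the last two patches flip the hash (assumed internals, inferred from the patch targets above rather than stated in this section):

# DataFilesDict.from_local_or_remote resolves each pattern to concrete files and
# records per-file "origin" metadata alongside each path: os.path.getmtime(...)
# for local files and request_etag(...) for remote URLs. Hasher.hash folds that
# metadata in, so patching either value yields a different hash; this is what
# lets downstream caching detect that a local file or remote resource changed.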