Example #1
def hash_stored_photos(
    photos: Iterable[PhotoFile],
    directory: Union[str, PathLike],
    hash_algorithm: HashAlgorithm = DEFAULT_HASH_ALGO,
    storage_type: str = "HDD",
) -> dict[str, str]:
    """
    Checks the hashes of stored PhotoFiles

    :param photos: the PhotoFiles to check
    :param directory: the photo storage directory
    :param hash_algorithm: the HashAlgorithm to use
    :param storage_type: the type of media the photos are stored on
            (uses async if SSD or RAID)
    :return: A dict from filepath (absolute) to checksum
    """
    directory = Path(directory).expanduser().resolve()
    if storage_type in ("SSD", "RAID"):
        async_hashes = True
    else:
        # concurrent reads of sequential files can lead to thrashing
        async_hashes = False
    files, sizes = [], []
    for photo in photos:
        abs_store_path = directory / photo.sto
        if abs_store_path.exists():
            files.append(str(abs_store_path))
            sizes.append(photo.fsz)
    checksum_cache = AsyncFileHasher(
        algorithm=hash_algorithm,
        use_async=async_hashes,
    ).check_files(files, pbar_unit="B", file_sizes=sizes)
    return checksum_cache
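A minimal usage sketch for this helper. It assumes photos has already been loaded elsewhere (for example from a photomanager database) and that each PhotoFile records its expected digest in a chk attribute; both are assumptions about surrounding code not shown here.

from pathlib import Path

# photos: Iterable[PhotoFile], loaded elsewhere; `directory` holds the stored copies.
directory = "~/photos"
stored = hash_stored_photos(photos, directory=directory, storage_type="SSD")

# Flag stored files whose current hash no longer matches the recorded one.
# Using photo.chk as the recorded checksum is an assumption about the schema.
base = Path(directory).expanduser().resolve()
for photo in photos:
    path = str(base / photo.sto)
    if path in stored and stored[path] != photo.chk:
        print(f"checksum mismatch: {path}")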
Example #2
def test_async_file_hasher(
    tmpdir, caplog, hasher_kwargs, check_kwargs: Dict[str, Union[str, list]]
):
    """
    AsyncFileHasher returns the correct checksums and skips nonexistent files
    """
    caplog.set_level(logging.DEBUG)
    files, sizes = [], []
    for i, (s, c) in enumerate(checksums):
        filename = tmpdir / f"{i}.bin"
        with open(filename, "wb") as f:
            f.write(s)
        files.append(filename)
        sizes.append(len(s))
    files.append(tmpdir / "asdf.bin")
    sizes.append(0)
    check_kwargs.setdefault("file_sizes", sizes)
    checksum_cache = AsyncFileHasher(**hasher_kwargs).check_files(
        files, **check_kwargs
    )
    print(checksum_cache)
    assert len(checksum_cache) == len(checksums)
    for i, (s, c) in enumerate(checksums):
        filename = tmpdir / f"{i}.bin"
        assert filename in checksum_cache
        assert checksum_cache[filename] == c
    assert (tmpdir / "asdf.bin") not in checksum_cache
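The test above relies on a module-level checksums sequence of (data, expected digest) pairs plus pytest parametrization supplying hasher_kwargs and check_kwargs. Below is a hedged sketch of that scaffolding, assuming HashAlgorithm.BLAKE2B_256 corresponds to hashlib's blake2b with a 32-byte digest; the real test module's values and parameter sets may differ.

import hashlib

import pytest

# (file contents, expected hex digest) pairs; digests are computed, not hard-coded
checksums = [
    (data, hashlib.blake2b(data, digest_size=32).hexdigest())
    for data in (b"", b"test data", b"more test data\n" * 16)
]

@pytest.mark.parametrize(
    "hasher_kwargs,check_kwargs",
    [
        ({"algorithm": HashAlgorithm.BLAKE2B_256, "use_async": True}, {"pbar_unit": "B"}),
        ({"algorithm": HashAlgorithm.BLAKE2B_256, "use_async": False}, {"pbar_unit": "it"}),
    ],
)
def test_async_file_hasher(tmpdir, caplog, hasher_kwargs, check_kwargs):
    ...  # body as shown in the example above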
Example #3
def test_async_file_hasher_command_available(cmd):
    """
    AsyncFileHasher.cmd_available returns True for hash commands that exist
    and False for nonexistent commands
    """
    assert (
        AsyncFileHasher.cmd_available(cmd[0]) == cmd[1]
    ), f"{cmd[0]}{' not' if cmd[1] else ''} available"
Example #4
def test_async_checksum_path(checksum, tmpdir):
    files = [tmpdir / "test.bin"]
    with open(files[0], "wb") as f:
        f.write(checksum["bytes"])
    checksum_cache = AsyncFileHasher(
        algorithm=checksum["algorithm"], use_async=True
    ).check_files(files, pbar_unit="B")
    print(checksum_cache)
    assert len(checksum_cache) == len(files)
    assert files[0] in checksum_cache
    assert checksum_cache[files[0]] == checksum["checksum"]
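The checksum argument above behaves like a parametrized fixture providing algorithm, bytes, and checksum keys. One hedged way to build it, again assuming BLAKE2B_256 maps to hashlib's blake2b with a 32-byte digest; the real fixture may cover additional algorithms.

import hashlib

import pytest

@pytest.fixture(params=[b"", b"test data"])
def checksum(request):
    data = request.param
    return {
        "algorithm": HashAlgorithm.BLAKE2B_256,
        "bytes": data,
        "checksum": hashlib.blake2b(data, digest_size=32).hexdigest(),
    }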
Example #5
def test_async_file_hasher_interrupt(monkeypatch):
    async def nop_cse(*_, **__):
        loop = asyncio.events.get_event_loop()
        loop.set_debug(True)
        return AsyncNopProcess(b"checksum img.jpg\n", b"", final_delay=5)

    monkeypatch.setattr(subprocess_async, "create_subprocess_exec", nop_cse)
    hasher = AsyncFileHasher(
        algorithm=HashAlgorithm.BLAKE2B_256,
        use_async=True,
        batch_size=10,
    )

    async def join(_=None):
        await asyncio.sleep(0.01)
        hasher.terminate()

    monkeypatch.setattr(asyncio.Queue, "join", join)
    all_jobs = [FileHasherJob(file_paths=[b"img.jpg"])]
    checksum_cache = asyncio.run(hasher.execute_queue(all_jobs=all_jobs))
    print(checksum_cache)
    assert len(checksum_cache) == 0
Example #6
def test_file_hasher(tmpdir):
    files = []
    for i, (s, c) in enumerate(checksums):
        filename = tmpdir / f"{i}.bin"
        with open(filename, "wb") as f:
            f.write(s)
        files.append(filename)
    checksum_cache = AsyncFileHasher(
        algorithm=HashAlgorithm.BLAKE2B_256, use_async=False
    ).check_files(files, pbar_unit="B")
    print(checksum_cache)
    assert len(checksum_cache) == len(checksums)
    for i, (s, c) in enumerate(checksums):
        filename = tmpdir / f"{i}.bin"
        assert filename in checksum_cache
        assert checksum_cache[filename] == c
Example #7
def test_async_file_hasher_empty(monkeypatch, caplog):
    async def nop_cse(*_, **__):
        loop = asyncio.events.get_event_loop()
        loop.set_debug(True)
        return AsyncNopProcess(b"\n", b"")

    monkeypatch.setattr(subprocess_async, "create_subprocess_exec", nop_cse)
    caplog.set_level(logging.DEBUG)
    checksum_cache = AsyncFileHasher(
        algorithm=HashAlgorithm.BLAKE2B_256,
        use_async=True,
        batch_size=10,
    ).check_files(["asdf.bin"], pbar_unit="it")
    print([(r.levelname, r) for r in caplog.records])
    print(checksum_cache)
    assert not any(record.levelname == "WARNING" for record in caplog.records)
    assert len(checksum_cache) == 0
Example #8
def index_photos(
    files: Iterable[Union[str, PathLike]],
    priority: int = 10,
    hash_algorithm: HashAlgorithm = DEFAULT_HASH_ALGO,
    tz_default: Optional[tzinfo] = None,
    storage_type: str = "HDD",
) -> list[Optional[PhotoFile]]:
    """
    Indexes photo files

    :param files: the photo file paths to index
    :param priority: the photos' priority
    :param hash_algorithm: The hashing algorithm to use for file checksums
    :param tz_default: The time zone to use if none is set
        (defaults to local time)
    :param storage_type: the storage type being indexed (more async if SSD or RAID)
    :return: a list of PhotoFiles, with None entries for errors
    """
    logger = logging.getLogger(__name__)
    if storage_type in ("SSD", "RAID"):
        async_hashes = True
        async_exif = cpu_count()
    else:
        # concurrent reads of sequential files can lead to thrashing
        async_hashes = False
        # exiftool is partially CPU-bound and benefits from async
        async_exif = min(4, cpu_count())
    logger.info("Collecting media hashes")
    checksum_cache = AsyncFileHasher(
        algorithm=hash_algorithm, use_async=async_hashes
    ).check_files(files, pbar_unit="B")
    logger.info("Collecting media dates and times")
    datetime_cache = AsyncExifTool(num_workers=async_exif).get_best_datetime_batch(
        files
    )

    logger.info("Indexing media")
    photos = []
    exiftool = ExifTool()
    exiftool.start()
    for current_file in tqdm(files):
        if logger.isEnabledFor(logging.DEBUG):
            tqdm.write(f"Indexing {current_file}")
        try:
            pf = PhotoFile.from_file_cached(
                current_file,
                checksum_cache=checksum_cache,
                datetime_cache=datetime_cache,
                algorithm=hash_algorithm,
                tz_default=tz_default,
                priority=priority,
            )
            photos.append(pf)
        except Exception as e:
            tqdm.write(f"Error indexing {current_file}", file=sys.stderr)
            # positional arguments keep this compatible with Python 3.10+,
            # where the etype/value/tb keywords were removed
            tb_str = "".join(
                traceback.format_exception(type(e), e, e.__traceback__)
            )
            tqdm.write(tb_str, file=sys.stderr)
            photos.append(None)
    exiftool.terminate()
    return photos
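A short usage sketch for index_photos; the directory, extension filter, and priority below are illustrative values, not taken from the project.

from pathlib import Path

# Gather candidate media paths (the extension filter is illustrative only).
media_files = [
    str(p)
    for p in Path("~/photos/import").expanduser().rglob("*")
    if p.suffix.lower() in {".jpg", ".jpeg", ".png", ".heic"}
]

photos = index_photos(media_files, priority=10, storage_type="SSD")
indexed = [pf for pf in photos if pf is not None]
print(f"indexed {len(indexed)} of {len(media_files)} files")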
Example #9
def test_async_file_hasher_bad_algorithm():
    with pytest.raises(HasherException):
        AsyncFileHasher(algorithm="md5")