def hash_stored_photos(
    photos: Iterable[PhotoFile],
    directory: Union[str, PathLike],
    hash_algorithm: HashAlgorithm = DEFAULT_HASH_ALGO,
    storage_type: str = "HDD",
) -> dict[str, str]:
    """
    Checks the hashes of stored PhotoFiles

    :param photos: the PhotoFiles to check
    :param directory: the photo storage directory
    :param hash_algorithm: the HashAlgorithm to use
    :param storage_type: the type of media the photos are stored on
        (uses async if SSD or RAID)
    :return: a dict from filepath (absolute) to checksum
    """
    directory = Path(directory).expanduser().resolve()
    if storage_type in ("SSD", "RAID"):
        async_hashes = True
    else:
        # concurrent reads of sequential files can lead to thrashing
        async_hashes = False
    files, sizes = [], []
    for photo in photos:
        abs_store_path = directory / photo.sto
        if abs_store_path.exists():
            files.append(str(abs_store_path))
            sizes.append(photo.fsz)
    checksum_cache = AsyncFileHasher(
        algorithm=hash_algorithm,
        use_async=async_hashes,
    ).check_files(files, pbar_unit="B", file_sizes=sizes)
    return checksum_cache
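
# Usage sketch for hash_stored_photos (illustrative only: the storage
# directory is hypothetical, and `photos` is assumed to come from a
# previously indexed database, e.g. via index_photos below).
def example_verify_store(photos: Iterable[PhotoFile]) -> None:
    checksums = hash_stored_photos(
        photos,
        directory="~/photo_storage",  # expanded and resolved internally
        storage_type="SSD",  # "SSD"/"RAID" switch on async hashing
    )
    for path, checksum in checksums.items():
        print(path, checksum)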
def test_async_file_hasher(
    tmpdir, caplog, hasher_kwargs, check_kwargs: Dict[str, Union[str, list]]
):
    """
    AsyncFileHasher returns the correct checksums and skips nonexistent files
    """
    caplog.set_level(logging.DEBUG)
    files, sizes = [], []
    for i, (s, c) in enumerate(checksums):
        filename = tmpdir / f"{i}.bin"
        with open(filename, "wb") as f:
            f.write(s)
        files.append(filename)
        sizes.append(len(s))
    files.append(tmpdir / "asdf.bin")
    sizes.append(0)
    check_kwargs.setdefault("file_sizes", sizes)
    checksum_cache = AsyncFileHasher(**hasher_kwargs).check_files(files, **check_kwargs)
    print(checksum_cache)
    assert len(checksum_cache) == len(checksums)
    for i, (s, c) in enumerate(checksums):
        filename = tmpdir / f"{i}.bin"
        assert filename in checksum_cache
        assert checksum_cache[filename] == c
    assert (tmpdir / "asdf.bin") not in checksum_cache
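
# The test above and test_file_hasher below iterate over a module-level
# `checksums` table of (file contents, expected hex digest) pairs. A minimal
# stand-in is sketched here (payloads are hypothetical; the real table is
# defined elsewhere in the test module). blake2b with digest_size=32 matches
# HashAlgorithm.BLAKE2B_256.
import hashlib

_example_checksums = [
    (payload, hashlib.blake2b(payload, digest_size=32).hexdigest())
    for payload in (b"", b"asdf", b"photomanager")
]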
def test_async_file_hasher_command_available(cmd):
    """
    AsyncFileHasher.cmd_available returns True for existent hash commands
    and False for nonexistent commands
    """
    assert (
        AsyncFileHasher.cmd_available(cmd[0]) == cmd[1]
    ), f"{cmd[0]}{' not' if cmd[1] else ''} available"
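
# Sketch of the `cmd` parametrization consumed above: (command, expected
# availability) pairs. Values are illustrative and environment-dependent;
# the real list lives in the module's parametrize call.
_example_cmds = [
    ("sha256sum", True),  # assumes GNU coreutils is on PATH
    ("nonexistent-hash-command", False),
]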
def test_async_checksum_path(checksum, tmpdir):
    """
    AsyncFileHasher accepts path-like objects and returns their checksums
    """
    files = [tmpdir / "test.bin"]
    with open(files[0], "wb") as f:
        f.write(checksum["bytes"])
    checksum_cache = AsyncFileHasher(
        algorithm=checksum["algorithm"], use_async=True
    ).check_files(files, pbar_unit="B")
    print(checksum_cache)
    assert len(checksum_cache) == len(files)
    assert files[0] in checksum_cache
    assert checksum_cache[files[0]] == checksum["checksum"]
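
# Shape of the `checksum` fixture consumed above (illustrative values; the
# real parametrization lives elsewhere in the test module):
import hashlib

_example_checksum = {
    "algorithm": HashAlgorithm.BLAKE2B_256,
    "bytes": b"hello",
    "checksum": hashlib.blake2b(b"hello", digest_size=32).hexdigest(),
}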
def test_async_file_hasher_interrupt(monkeypatch):
    """
    AsyncFileHasher returns no checksums when terminated mid-queue
    """
    async def nop_cse(*_, **__):
        loop = asyncio.events.get_event_loop()
        loop.set_debug(True)
        return AsyncNopProcess(b"checksum img.jpg\n", b"", final_delay=5)

    monkeypatch.setattr(subprocess_async, "create_subprocess_exec", nop_cse)
    hasher = AsyncFileHasher(
        algorithm=HashAlgorithm.BLAKE2B_256,
        use_async=True,
        batch_size=10,
    )

    async def join(_=None):
        await asyncio.sleep(0.01)
        hasher.terminate()

    monkeypatch.setattr(asyncio.Queue, "join", join)
    all_jobs = [FileHasherJob(file_paths=[b"img.jpg"])]
    checksum_cache = asyncio.run(hasher.execute_queue(all_jobs=all_jobs))
    print(checksum_cache)
    assert len(checksum_cache) == 0
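
# AsyncNopProcess (used above and in test_async_file_hasher_empty) is a test
# double standing in for the asyncio subprocess handle. A minimal sketch is
# given below, assuming the hasher touches only stdin.write/drain/close,
# stdout.readline, and wait; the real helper is defined in the test suite
# and may differ.
class _ExampleAsyncNopProcess:
    class _Stdin:
        def write(self, _data: bytes) -> None:
            pass

        async def drain(self) -> None:
            pass

        def close(self) -> None:
            pass

    class _Stdout:
        def __init__(self, data: bytes):
            self._lines = data.splitlines(keepends=True)

        async def readline(self) -> bytes:
            # emit the canned output one line at a time, then EOF (b"")
            return self._lines.pop(0) if self._lines else b""

    def __init__(self, stdout: bytes, stderr: bytes, final_delay: float = 0.0):
        self.stdin = self._Stdin()
        self.stdout = self._Stdout(stdout)
        self.stderr = self._Stdout(stderr)
        self._final_delay = final_delay  # keeps the "process" alive so tests can interrupt

    async def wait(self) -> int:
        await asyncio.sleep(self._final_delay)
        return 0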
def test_file_hasher(tmpdir):
    """
    AsyncFileHasher returns the correct checksums when async is disabled
    """
    files = []
    for i, (s, c) in enumerate(checksums):
        filename = tmpdir / f"{i}.bin"
        with open(filename, "wb") as f:
            f.write(s)
        files.append(filename)
    checksum_cache = AsyncFileHasher(
        algorithm=HashAlgorithm.BLAKE2B_256, use_async=False
    ).check_files(files, pbar_unit="B")
    print(checksum_cache)
    assert len(checksum_cache) == len(checksums)
    for i, (s, c) in enumerate(checksums):
        filename = tmpdir / f"{i}.bin"
        assert filename in checksum_cache
        assert checksum_cache[filename] == c
def test_async_file_hasher_empty(monkeypatch, caplog):
    """
    AsyncFileHasher emits no warnings when the subprocess produces no checksums
    """
    async def nop_cse(*_, **__):
        loop = asyncio.events.get_event_loop()
        loop.set_debug(True)
        return AsyncNopProcess(b"\n", b"")

    monkeypatch.setattr(subprocess_async, "create_subprocess_exec", nop_cse)
    caplog.set_level(logging.DEBUG)
    checksum_cache = AsyncFileHasher(
        algorithm=HashAlgorithm.BLAKE2B_256,
        use_async=True,
        batch_size=10,
    ).check_files(["asdf.bin"], pbar_unit="it")
    print([(r.levelname, r) for r in caplog.records])
    print(checksum_cache)
    assert not any(record.levelname == "WARNING" for record in caplog.records)
    assert len(checksum_cache) == 0
def index_photos(
    files: Iterable[Union[str, PathLike]],
    priority: int = 10,
    hash_algorithm: HashAlgorithm = DEFAULT_HASH_ALGO,
    tz_default: Optional[tzinfo] = None,
    storage_type: str = "HDD",
) -> list[Optional[PhotoFile]]:
    """
    Indexes photo files

    :param files: the photo file paths to index
    :param priority: the photos' priority
    :param hash_algorithm: the hashing algorithm to use for file checksums
    :param tz_default: the time zone to use if none is set
        (defaults to local time)
    :param storage_type: the storage type being indexed (uses more async if SSD)
    :return: a list of PhotoFiles, with None entries for errors
    """
    logger = logging.getLogger(__name__)
    files = list(files)  # the iterable is consumed three times below
    if storage_type in ("SSD", "RAID"):
        async_hashes = True
        async_exif = cpu_count()
    else:
        # concurrent reads of sequential files can lead to thrashing
        async_hashes = False
        # exiftool is partially CPU-bound and benefits from async
        async_exif = min(4, cpu_count())
    logger.info("Collecting media hashes")
    checksum_cache = AsyncFileHasher(
        algorithm=hash_algorithm, use_async=async_hashes
    ).check_files(files, pbar_unit="B")
    logger.info("Collecting media dates and times")
    datetime_cache = AsyncExifTool(
        num_workers=async_exif
    ).get_best_datetime_batch(files)
    logger.info("Indexing media")
    photos = []
    exiftool = ExifTool()
    exiftool.start()
    for current_file in tqdm(files):
        if logger.isEnabledFor(logging.DEBUG):
            tqdm.write(f"Indexing {current_file}")
        try:
            pf = PhotoFile.from_file_cached(
                current_file,
                checksum_cache=checksum_cache,
                datetime_cache=datetime_cache,
                algorithm=hash_algorithm,
                tz_default=tz_default,
                priority=priority,
            )
            photos.append(pf)
        except Exception as e:
            tqdm.write(f"Error indexing {current_file}", file=sys.stderr)
            # pass the exception positionally: the etype= keyword was
            # removed from traceback.format_exception in Python 3.10
            tb_str = "".join(traceback.format_exception(type(e), e, e.__traceback__))
            tqdm.write(tb_str, file=sys.stderr)
            photos.append(None)
    exiftool.terminate()
    return photos
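
# Usage sketch for index_photos (hypothetical paths; in practice the file
# list would come from scanning a source directory):
def example_index() -> None:
    photos = index_photos(
        ["~/Pictures/IMG_0001.jpg", "~/Pictures/IMG_0002.jpg"],
        priority=10,
        storage_type="SSD",  # more async workers for hashing and exiftool
    )
    indexed = [p for p in photos if p is not None]  # drop files that errored
    print(f"indexed {len(indexed)} of {len(photos)} files")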
def test_async_file_hasher_bad_algorithm():
    """
    AsyncFileHasher raises HasherException for unsupported algorithms
    """
    with pytest.raises(HasherException):
        AsyncFileHasher(algorithm="md5")
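
# Suggested companion case (not in the original suite): constructing with a
# proper HashAlgorithm member, rather than a bare string, should not raise.
def test_async_file_hasher_good_algorithm():
    AsyncFileHasher(algorithm=HashAlgorithm.BLAKE2B_256)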