@classmethod
def from_file(
    cls: Type[PF],
    source_path: Union[str, PathLike],
    algorithm: HashAlgorithm = DEFAULT_HASH_ALGO,
    tz_default: Optional[tzinfo] = None,
    priority: int = 10,
) -> PF:
    """Create a PhotoFile for a given file.

    :param source_path: The path to the file
    :param algorithm: The hashing algorithm to use
    :param tz_default: The time zone to use if none is set
        (defaults to local time)
    :param priority: The photo's priority
    """
    photo_hash = file_checksum(source_path, algorithm)
    dt_str = get_media_datetime(source_path)
    dt = datetime_str_to_object(dt_str, tz_default=tz_default)
    tz = dt.utcoffset().total_seconds() if dt.tzinfo is not None else None
    timestamp = dt.timestamp()
    file_size = getsize(source_path)
    return cls(
        chk=photo_hash,
        src=str(source_path),
        dt=dt_str,
        ts=timestamp,
        fsz=file_size,
        sto="",
        prio=priority,
        tzo=tz,
    )
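
# Illustrative usage (added; not part of the original module): create a
# PhotoFile from a single image, pinning the fallback time zone to UTC so
# the result does not depend on the machine's local zone. PhotoFile and
# the example path are assumptions for this sketch.
def _example_from_file() -> None:
    from datetime import timezone

    pf = PhotoFile.from_file("photos/IMG_0001.jpg", tz_default=timezone.utc)
    # tzo is the datetime's UTC offset in seconds (None when no tzinfo
    # was attached to the parsed datetime)
    print(pf.chk, pf.dt, pf.ts, pf.tzo)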
@classmethod
def from_file_cached(
    cls: Type[PF],
    source_path: str,
    checksum_cache: dict[str, str],
    datetime_cache: dict[str, str],
    algorithm: HashAlgorithm = DEFAULT_HASH_ALGO,
    tz_default: Optional[tzinfo] = None,
    priority: int = 10,
) -> PF:
    """Create a PhotoFile for a given file.

    If source_path is in the checksum and datetime caches, use the
    cached values instead of reading from the file.

    :param source_path: The path to the file
    :param checksum_cache: A mapping of source paths to known checksums
    :param datetime_cache: A mapping of source paths to datetime strings
    :param algorithm: The hashing algorithm to use for new checksums
    :param tz_default: The time zone to use if none is set
        (defaults to local time)
    :param priority: The photo's priority
    """
    photo_hash = (
        checksum_cache[source_path]
        if source_path in checksum_cache
        else file_checksum(source_path, algorithm)
    )
    dt_str = (
        datetime_cache[source_path]
        if source_path in datetime_cache
        else get_media_datetime(source_path)
    )
    dt = datetime_str_to_object(dt_str, tz_default=tz_default)
    tz = dt.utcoffset().total_seconds() if dt.tzinfo is not None else None
    timestamp = dt.timestamp()
    file_size = getsize(source_path)
    return cls(
        chk=photo_hash,
        src=str(source_path),
        dt=dt_str,
        ts=timestamp,
        fsz=file_size,
        sto="",
        prio=priority,
        tzo=tz,
    )
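
# Illustrative usage (added): reuse checksums and datetime strings captured
# on an earlier pass so unchanged files are not re-read. The cached values
# below are hypothetical; the datetime string mimics the EXIF-style format
# used elsewhere in this module.
def _example_from_file_cached() -> None:
    checksum_cache = {"photos/IMG_0001.jpg": "928b20366943e2af"}  # fake hash
    datetime_cache = {"photos/IMG_0001.jpg": "2015:08:27 04:09:36"}
    pf = PhotoFile.from_file_cached(
        "photos/IMG_0001.jpg",
        checksum_cache=checksum_cache,
        datetime_cache=datetime_cache,
    )
    # A cache hit skips both hashing and metadata extraction for the file
    print(pf.chk, pf.dt)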
def make_hash_map(
    database: Database,
    new_algo: HashAlgorithm,
    hash_map: Optional[dict[str, str]] = None,
    destination: Optional[Union[str, PathLike]] = None,
) -> dict[str, str]:  # pragma: no cover
    """Make a map of file checksums in order to migrate hashing algorithms.

    Checks source file hashes using the old algorithm to make sure the new
    hashes are correct. If a source file's old hash is incorrect, its
    checksum is not remapped; instead the mapped value is set to
    '{old_checksum}:{old_algorithm}'.

    Note: This method is not accessed by the CLI or covered by testing
    and is intended to be used interactively, i.e. in a Jupyter notebook
    or other environment, with spot checking of the output hash map.

    :param database: the Database
    :param new_algo: the new algorithm to use
    :param hash_map: a map from old hashes to new hashes; will be updated
        with new mappings as they are found
    :param destination: the library storage destination
    :return: the hash map
    """
    if hash_map is None:
        hash_map = {}
    old_algo = database.hash_algorithm
    print(f"Converting {old_algo} to {new_algo}")
    num_correct_photos = num_incorrect_photos = 0
    num_missing_photos = num_skipped_photos = 0
    all_photos = [photo for photos in database.photo_db.values() for photo in photos]
    for photo in tqdm(all_photos):
        if photo.chk in hash_map:
            num_skipped_photos += 1
        elif exists(photo.src):
            if photo.chk == file_checksum(photo.src, old_algo):
                hash_map[photo.chk] = file_checksum(photo.src, new_algo)
                num_correct_photos += 1
            else:
                tqdm.write(f"Incorrect checksum: {photo.src}")
                hash_map[photo.chk] = photo.chk + f":{old_algo}"
                num_incorrect_photos += 1
        elif destination:
            sto_path = Path(destination).expanduser().resolve() / photo.sto
            if exists(sto_path) and photo.chk == file_checksum(sto_path, old_algo):
                hash_map[photo.chk] = file_checksum(sto_path, new_algo)
                num_correct_photos += 1
            else:
                num_missing_photos += 1
        else:
            num_missing_photos += 1
    print(f"Mapped {num_correct_photos} items")
    if num_skipped_photos:
        print(f"Skipped {num_skipped_photos} items")
    if num_incorrect_photos or num_missing_photos:
        print(
            f"Found {num_incorrect_photos} incorrect and "
            f"{num_missing_photos} missing items"
        )
    return hash_map
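
# Interactive sketch (added), mirroring the docstring's intended workflow:
# build a map from the database's current algorithm to BLAKE2b-256, then
# spot check the unverifiable entries before applying it. `db` is an
# already-loaded Database.
def _example_make_hash_map(db: Database) -> None:
    hash_map = make_hash_map(db, new_algo=HashAlgorithm.BLAKE2B_256)
    # Entries mapped to "{old_checksum}:{old_algorithm}" failed verification;
    # hex digests never contain ":", so the colon identifies them reliably
    suspect = {old: new for old, new in hash_map.items() if ":" in new}
    print(f"{len(suspect)} unverifiable mappings to review by hand")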
def update_stored_filename_hashes(
    database: Database,
    destination: Union[str, PathLike],
    verify: bool = True,
    dry_run: bool = False,
) -> dict[str, str]:  # pragma: no cover
    """Update stored filenames to match checksums.

    Run after the database's hashes have been migrated to a new algorithm
    (see make_hash_map()). Skips files whose filename checksum already
    matches the stored checksum.

    Note: This method is not accessed by the CLI or covered by testing
    and is intended to be used interactively, i.e. in a Jupyter notebook
    or other environment, with dry runs and spot checking of proposed
    changes before they are performed. Use at your own risk.

    :param database: the Database
    :param destination: the photo storage directory
    :param verify: if True, verify that file checksums match
    :param dry_run: if True, perform a dry run and do not move photos
    :return: the mapping of files moved
    """
    num_correct_photos = num_skipped_photos = 0
    num_incorrect_photos = num_missing_photos = 0
    destination = Path(destination).expanduser().resolve()
    stored_photos = [
        photo for photos in database.photo_db.values() for photo in photos if photo.sto
    ]
    total_file_size = sum(photo.fsz for photo in stored_photos)
    print(f"Updating {len(stored_photos)} filename hashes")
    print(f"Total file size: {sizeof_fmt(total_file_size)}")
    logger = logging.getLogger()
    file_map = {}
    for photo in tqdm(stored_photos):
        abs_store_path = destination / photo.sto
        new_store_path = f"{photo.sto[:32]}{photo.chk[:7]}{photo.sto[39:]}"
        new_abs_store_path = destination / new_store_path
        if new_abs_store_path.exists():
            num_skipped_photos += 1
        elif not abs_store_path.exists():
            tqdm.write(f"Missing photo: {abs_store_path}")
            num_missing_photos += 1
        elif photo.sto[32:39] == photo.chk[:7]:
            num_skipped_photos += 1
        elif (
            not verify
            or file_checksum(abs_store_path, database.hash_algorithm) == photo.chk
        ):
            if logger.isEnabledFor(logging.DEBUG):
                tqdm.write(
                    f"{'Will move' if dry_run else 'Moving'} {abs_store_path} "
                    f"to {new_abs_store_path}"
                )
            file_map[str(abs_store_path)] = str(new_abs_store_path)
            if not dry_run:
                rename(abs_store_path, new_abs_store_path)
                photo.sto = new_store_path
            num_correct_photos += 1
        else:
            tqdm.write(f"Incorrect checksum: {abs_store_path}")
            num_incorrect_photos += 1
    print(f"{'Would move' if dry_run else 'Moved'} {num_correct_photos} items")
    if num_skipped_photos:
        print(f"Skipped {num_skipped_photos} items")
    if num_incorrect_photos or num_missing_photos:
        print(
            f"Found {num_incorrect_photos} incorrect and "
            f"{num_missing_photos} missing items"
        )
    return file_map
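
# Interactive sketch (added): always dry-run first, inspect a sample of the
# proposed renames, and only then re-run with dry_run=False. The library
# path is hypothetical; `db` is an already-loaded Database.
def _example_update_stored_filename_hashes(db: Database) -> None:
    proposed = update_stored_filename_hashes(db, "~/photos", dry_run=True)
    for src, dst in list(proposed.items())[:10]:  # spot check ten moves
        print(f"{src} -> {dst}")
    # Once satisfied with the sample:
    # update_stored_filename_hashes(db, "~/photos", dry_run=False)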
import random
from io import BytesIO

import pytest

from photomanager.hasher import (
    AsyncFileHasher,
    HashAlgorithm,
    HasherException,
    file_checksum,
)

checksums = [
    (b"", "0e5751c026e543b2e8ab2eb06099daa1d1e5df47778f7787faab45cdf12fe3a8"),
    (
        b"\xff\xd8\xff\xe0",
        "7d13007a8afed521cfc13306cbd6747bbc59556e3ca9514c8d94f900fbb56230",
    ),
    (b"test", "928b20366943e2afd11ebc0eae2e53a93bf177a4fcf35bcc64d503704e65e202"),
]
for _ in range(100):
    st = bytes([random.randint(0, 255) for _ in range(1000)])
    with BytesIO(st) as fd:
        ck = file_checksum(fd, algorithm=HashAlgorithm.BLAKE2B_256)
    checksums.append((st, ck))


def test_file_hasher(tmpdir):
    files = []
    for i, (s, c) in enumerate(checksums):
        filename = tmpdir / f"{i}.bin"
        with open(filename, "wb") as f:
            f.write(s)
        files.append(filename)
    checksum_cache = AsyncFileHasher(
        algorithm=HashAlgorithm.BLAKE2B_256, use_async=False
    ).check_files(files, pbar_unit="B")
    print(checksum_cache)
    assert len(checksum_cache) == len(checksums)
def test_file_checksum_bad_algorithm():
    with pytest.raises(HasherException):
        file_checksum("asdf.txt", algorithm="md5")
def test_file_checksum_path(checksum, tmpdir):
    with open(tmpdir / "test.bin", "wb") as f:
        f.write(checksum["bytes"])
    assert (
        file_checksum(tmpdir / "test.bin", algorithm=checksum["algorithm"])
        == checksum["checksum"]
    )
def test_file_checksum_fd(checksum):
    with BytesIO(checksum["bytes"]) as f:
        assert file_checksum(f, algorithm=checksum["algorithm"]) == checksum["checksum"]
        assert not f.closed