def cleanup_archive(prefix=None):
    """Garbage-collect unreferenced blobs from the archive behind aleph.

    Files inside the archive are keyed on their SHA1 checksum, but the
    archive itself does not know which entities or exports link to a
    given blob. So this walks candidate hashes and removes only those
    that no part of the database or index still references. It's a
    messy process and it should be applied carefully.
    """
    for hash_batch in _chunked_hashes(prefix):
        for content_hash, usage in checksums_count(hash_batch):
            # Anything with a non-zero usage count is still referenced.
            if usage > 0:
                continue
            # In theory this is redundant with the count above; in
            # practice it protects seed data in the documents table
            # from being deleted by accident.
            if Document.by_content_hash(content_hash).count() > 0:
                continue
            # Exports may hold onto a blob independently of documents.
            if Export.by_content_hash(content_hash).count() > 0:
                continue
            log.info("Dangling hash: %s", content_hash)
            archive.delete_file(content_hash)
def delete_expired_exports():
    """Delete export files from the archive after their time limit
    has expired, and mark the corresponding exports as deleted.

    Every expired export is flagged ``deleted`` so that
    ``Export.get_expired(deleted=False)`` stops returning it; the
    archived file itself is removed only for publications whose blob
    is no longer referenced anywhere else.
    """
    expired_exports = Export.get_expired(deleted=False)
    for export in expired_exports:
        log.info("Deleting expired export: %r", export)
        if export.should_delete_publication():
            # Guard: an export may have no file attached. The original
            # code indexed counts[0] unconditionally, which raises
            # IndexError when content_hash is None or checksums_count
            # yields no row for the hash.
            if export.content_hash is not None:
                counts = list(checksums_count([export.content_hash]))
                # Only remove the blob when nothing else references it.
                if counts and counts[0][1] == 0:
                    archive.delete_file(export.content_hash)
        export.deleted = True
        db.session.add(export)
    # One commit for the whole batch, after all exports are flagged.
    db.session.commit()