Пример #1
0
def cleanup_archive(prefix=None):
    """Garbage-collect archive blobs that nothing refers to any more.

    The archive keys its files on their SHA1 checksum but keeps no record of
    which entities or exports a blob belongs to. Every hash is therefore
    checked against the count reported by ``checksums_count``, the documents
    table and the exports table, and the file is deleted only when none of
    them reference it. This is a messy, destructive process and should be
    applied carefully.

    Args:
        prefix: Forwarded unchanged to ``_chunked_hashes``; presumably limits
            the run to hashes with this prefix -- TODO confirm.
    """
    for hash_batch in _chunked_hashes(prefix):
        for content_hash, usage_count in checksums_count(hash_batch):
            # The lookups short-circuit in this order: reported count, then
            # documents, then exports. The table lookups are redundant in
            # theory, but they guard against deleting seed data that only
            # exists in the docs table (or a blob still backing an export).
            still_referenced = (
                usage_count > 0
                or Document.by_content_hash(content_hash).count() > 0
                or Export.by_content_hash(content_hash).count() > 0
            )
            if still_referenced:
                continue
            log.info("Dangling hash: %s", content_hash)
            archive.delete_file(content_hash)
Пример #2
0
def delete_expired_exports():
    """Mark expired exports as deleted and remove their archive files.

    Every export returned by ``Export.get_expired(deleted=False)`` is flagged
    ``deleted`` and added to the session; the session is committed once after
    the loop. When ``export.should_delete_publication()`` is true, the
    export's file is also removed from the archive, but only if the export
    has a content hash and ``checksums_count`` explicitly reports zero uses
    of that hash.
    """
    expired_exports = Export.get_expired(deleted=False)
    for export in expired_exports:
        log.info("Deleting expired export: %r", export)
        content_hash = export.content_hash
        # An export without a content hash has no archive file to remove;
        # skip the archive step but still mark the export as deleted below.
        if export.should_delete_publication() and content_hash is not None:
            counts = list(checksums_count([content_hash]))
            # Delete only on an explicit zero count. If no count comes back,
            # leave the blob alone rather than raising and aborting the whole
            # run before the commit.
            if counts and counts[0][1] == 0:
                archive.delete_file(content_hash)
        export.deleted = True
        db.session.add(export)
    db.session.commit()