Exemplo n.º 1
0
def garbage_collect_storage(storage_id_whitelist):
    """Garbage collect the orphaned storages among the given candidate IDs.

    The candidates are passed to ``_orphaned_storage_query``. For the storages it
    reports as orphaned, the placement, torrent info, signature and storage rows are
    deleted inside a single transaction; afterwards the corresponding paths are
    removed from ``config.store`` outside of that transaction.

    Returns the orphaned storage IDs as produced by ``_orphaned_storage_query``, or an
    empty list when the whitelist is empty or nothing was orphaned.
    """
    if not len(storage_id_whitelist):
        return []

    def removable_paths(placements):
        """Build the set of ``(location name, layer path)`` pairs that may be deleted.

        Non-CAS placements are always included. CAS placements are only included when
        their content checksum is no longer present in ``ImageStorage`` or ``ApprBlob``.
        """
        with ensure_under_transaction():
            if not placements:
                return set()

            # Only CAS-addressed storages can share a path, so only their checksums
            # need to be checked for remaining references.
            cas_checksums = {
                placement.storage.content_checksum
                for placement in placements
                if placement.storage.cas_path
            }

            deletable_checksums = set()
            if cas_checksums:
                # Checksums still present on an image storage row.
                storage_rows = ImageStorage.select(ImageStorage.content_checksum).where(
                    ImageStorage.content_checksum << list(cas_checksums))
                held_by_storage = {row.content_checksum for row in storage_rows}
                if held_by_storage:
                    logger.warning(
                        'GC attempted to remove CAS checksums %s, which are still IS referenced',
                        held_by_storage)

                # Checksums still present as an ApprBlob digest.
                blob_rows = ApprBlob.select(ApprBlob.digest).where(
                    ApprBlob.digest << list(cas_checksums))
                held_by_appr = {row.digest for row in blob_rows}
                if held_by_appr:
                    logger.warning(
                        'GC attempted to remove CAS checksums %s, which are ApprBlob referenced',
                        held_by_appr)

                deletable_checksums = cas_checksums.difference(
                    held_by_appr, held_by_storage)

            # Keep every non-CAS placement, plus the CAS placements whose checksum is
            # not referenced anywhere else.
            selected = set()
            for placement in placements:
                if (placement.storage.cas_path and
                        placement.storage.content_checksum not in deletable_checksums):
                    continue
                selected.add((
                    get_image_location_for_id(placement.location_id).name,
                    get_layer_path(placement.storage),
                ))
            return selected

    # Note: the placement and storage deletes have to share one transaction
    # (unfortunately), because a storage without any placement is invalid and a
    # placement cannot exist without a storage.
    # TODO: Allowing null storages on placements would let us delete the storages
    # first and then delete the placements outside of a transaction.
    logger.debug('Garbage collecting storages from candidates: %s',
                 storage_id_whitelist)
    with db_transaction():
        orphaned_storage_ids = _orphaned_storage_query(storage_id_whitelist)
        if not len(orphaned_storage_ids):
            # Nothing to GC.
            return []

        orphan_placements = list(
            ImageStoragePlacement
            .select(ImageStoragePlacement, ImageStorage)
            .join(ImageStorage)
            .where(ImageStorage.id << orphaned_storage_ids))

        # Drop the placements that point at orphaned storages.
        if orphan_placements:
            orphan_placement_ids = [placement.id for placement in orphan_placements]
            placement_delete = ImageStoragePlacement.delete().where(
                ImageStoragePlacement.id << orphan_placement_ids)
            logger.debug('Removed %s image storage placements',
                         placement_delete.execute())

        # Drop the rows hanging off the orphaned storages, then the storages themselves.
        torrent_delete = TorrentInfo.delete().where(
            TorrentInfo.storage << orphaned_storage_ids)
        logger.debug('Removed %s torrent info records', torrent_delete.execute())

        signature_delete = ImageStorageSignature.delete().where(
            ImageStorageSignature.storage << orphaned_storage_ids)
        logger.debug('Removed %s image storage signatures',
                     signature_delete.execute())

        storage_delete = ImageStorage.delete().where(
            ImageStorage.id << orphaned_storage_ids)
        logger.debug('Removed %s image storage records', storage_delete.execute())

        # Work out which paths can go. Removing every path of the deleted storages is
        # not safe because CAS storages can share a path, so the candidates are
        # filtered against the checksums still present in the database.
        paths_to_remove = removable_paths(orphan_placements)

    # Blobs are deliberately deleted outside of the transaction. This may leave
    # garbage behind in s3, trading that off for higher availability in the database.
    for location_name, image_path in paths_to_remove:
        logger.debug('Removing %s from %s', image_path, location_name)
        config.store.remove({location_name}, image_path)

    return orphaned_storage_ids
Exemplo n.º 2
0
def garbage_collect_storage(storage_id_whitelist):
    """
    Garbage collects the orphaned storages among the given candidate IDs.

    Every candidate is examined in its own transaction: when ``_is_storage_orphaned``
    reports it as orphaned, its placement, torrent info, signature and storage rows are
    deleted. The paths of the deleted storages are then removed from ``config.store``
    outside of any transaction, after one last check that the blob is not needed.

    Returns the set of storage IDs that were found orphaned. Note that an empty
    whitelist yields an empty list rather than an empty set.
    """
    if not len(storage_id_whitelist):
        return []

    def removable_blob_entries(placements):
        """
        Builds the set of ``(location name, layer path, content checksum)`` entries that
        may be deleted for the given placements.

        Non-CAS placements are always included; CAS placements only when their content
        checksum is no longer present in ``ImageStorage`` or ``ApprBlob``.
        """
        if not placements:
            return set()

        with ensure_under_transaction():
            # Only CAS-addressed storages can share a path, so only their checksums
            # need to be checked for remaining references.
            cas_checksums = {
                placement.storage.content_checksum
                for placement in placements
                if placement.storage.cas_path
            }

            deletable_checksums = set()
            if cas_checksums:
                # Checksums still present on an image storage row.
                storage_rows = ImageStorage.select(ImageStorage.content_checksum).where(
                    ImageStorage.content_checksum << list(cas_checksums)
                )
                held_by_storage = {row.content_checksum for row in storage_rows}
                if held_by_storage:
                    logger.warning(
                        "GC attempted to remove CAS checksums %s, which are still IS referenced",
                        held_by_storage,
                    )

                # Checksums still present as an ApprBlob digest.
                blob_rows = ApprBlob.select(ApprBlob.digest).where(
                    ApprBlob.digest << list(cas_checksums)
                )
                held_by_appr = {row.digest for row in blob_rows}
                if held_by_appr:
                    logger.warning(
                        "GC attempted to remove CAS checksums %s, which are ApprBlob referenced",
                        held_by_appr,
                    )

                deletable_checksums = cas_checksums - (held_by_appr | held_by_storage)

            # Keep every non-CAS placement, plus the CAS placements whose checksum is
            # not referenced anywhere else.
            entries = set()
            for placement in placements:
                if (
                    placement.storage.cas_path
                    and placement.storage.content_checksum not in deletable_checksums
                ):
                    continue
                entries.add(
                    (
                        get_image_location_for_id(placement.location_id).name,
                        get_layer_path(placement.storage),
                        placement.storage.content_checksum,
                    )
                )
            return entries

    # Note: the placement and storage deletes have to share one transaction
    # (unfortunately), because a storage without any placement is invalid and a
    # placement cannot exist without a storage.
    # TODO: Allowing null storages on placements would let us delete the storages
    # first and then delete the placements outside of a transaction.
    logger.debug("Garbage collecting storages from candidates: %s", storage_id_whitelist)
    orphaned_storage_ids = set()
    pending_entries = []
    for candidate_id in storage_id_whitelist:
        logger.debug("Garbage collecting storage %s", candidate_id)

        with db_transaction():
            if not _is_storage_orphaned(candidate_id):
                continue

            orphaned_storage_ids.add(candidate_id)

            candidate_placements = list(
                ImageStoragePlacement.select(ImageStoragePlacement, ImageStorage)
                .join(ImageStorage)
                .where(ImageStorage.id == candidate_id)
            )

            # Drop the placements that point at the orphaned storage.
            if candidate_placements:
                placement_delete = ImageStoragePlacement.delete().where(
                    ImageStoragePlacement.storage == candidate_id
                )
                placement_delete.execute()

            # Drop the rows hanging off the storage, then the storage itself.
            torrent_delete = TorrentInfo.delete().where(TorrentInfo.storage == candidate_id)
            torrent_delete.execute()

            signature_delete = ImageStorageSignature.delete().where(
                ImageStorageSignature.storage == candidate_id
            )
            signature_delete.execute()

            storage_delete = ImageStorage.delete().where(ImageStorage.id == candidate_id)
            storage_delete.execute()

            # Work out which paths can go. Removing every path of the deleted storage is
            # not safe because CAS storages can share a path, so the candidates are
            # filtered against the checksums still present in the database.
            pending_entries.extend(removable_blob_entries(candidate_placements))

    # Blobs are deliberately deleted outside of the transactions. This may leave
    # garbage behind in s3, trading that off for higher availability in the database.
    for location_name, image_path, storage_checksum in set(pending_entries):
        if storage_checksum:
            # Specialized blob digests that are known to be needed are never removed.
            if storage_checksum in SPECIAL_BLOB_DIGESTS:
                continue

            # Last check: skip the blob if any storage row carries this checksum by now.
            still_needed = (
                ImageStorage.select()
                .where(ImageStorage.content_checksum == storage_checksum)
                .exists()
            )
            if still_needed:
                continue

        logger.debug("Removing %s from %s", image_path, location_name)
        config.store.remove({location_name}, image_path)

    return orphaned_storage_ids