def test_images_shared_cas(default_tag_policy, initialized_db):
    """
    A repository with two tags, each pointing to a distinct image, whose image
    storage rows share the same *CAS path* but are *distinct records*.

    Deleting the first tag should delete the first image and its storage record,
    but must NOT remove the file in storage, since it shares its CAS path with
    the surviving record.
    """
    with assert_gc_integrity(expect_storage_removed=True):
        repository = create_repository()

        # Create two image storage records with the same content checksum.
        # BUG FIX: hashlib.sha256 requires bytes; the original passed a str,
        # which raises TypeError on Python 3 (siblings here use b"hello world").
        content = b"hello world"
        digest = "sha256:" + hashlib.sha256(content).hexdigest()
        preferred = storage.preferred_locations[0]
        storage.put_content({preferred}, storage.blob_path(digest), content)

        is1 = database.ImageStorage.create(content_checksum=digest, uploading=False)
        is2 = database.ImageStorage.create(content_checksum=digest, uploading=False)

        location = database.ImageStorageLocation.get(name=preferred)

        database.ImageStoragePlacement.create(location=location, storage=is1)
        database.ImageStoragePlacement.create(location=location, storage=is2)

        # Ensure the CAS path exists.
        assert storage.exists({preferred}, storage.blob_path(digest))

        # Create two images in the repository, and two tags, each pointing to
        # one of the storages.
        first_image = Image.create(
            docker_image_id="i1", repository=repository, storage=is1, ancestors="/"
        )
        second_image = Image.create(
            docker_image_id="i2", repository=repository, storage=is2, ancestors="/"
        )

        store_tag_manifest(
            repository.namespace_user.username,
            repository.name,
            "first",
            first_image.docker_image_id,
        )

        store_tag_manifest(
            repository.namespace_user.username,
            repository.name,
            "second",
            second_image.docker_image_id,
        )

        assert_not_deleted(repository, "i1", "i2")

        # Delete the first tag.
        delete_tag(repository, "first")
        assert_deleted(repository, "i1")
        assert_not_deleted(repository, "i2")

        # Ensure the CAS path still exists.
        assert storage.exists({preferred}, storage.blob_path(digest))
def test_image_with_cas(default_tag_policy, initialized_db):
    """
    A repository with a tag pointing to an image backed by CAS.

    Deleting and GCing the tag should result in the storage and its CAS data
    being removed.
    """
    with assert_gc_integrity(expect_storage_removed=True):
        repository = create_repository()

        # Create an image storage record under CAS, with the backing bytes
        # written at the CAS path derived from the digest.
        content = b"hello world"
        digest = "sha256:" + hashlib.sha256(content).hexdigest()
        preferred = storage.preferred_locations[0]
        storage.put_content({preferred}, storage.blob_path(digest), content)

        image_storage = database.ImageStorage.create(content_checksum=digest)
        location = database.ImageStorageLocation.get(name=preferred)
        database.ImageStoragePlacement.create(location=location, storage=image_storage)

        # Temp link so the blob is reachable while the manifest is written.
        model.blob.store_blob_record_and_temp_link_in_repo(
            repository, digest, location, len(content), 120
        )

        # Ensure the CAS path exists.
        assert storage.exists({preferred}, storage.blob_path(digest))

        # Store a manifest pointing to that path.
        builder = DockerSchema1ManifestBuilder(
            repository.namespace_user.username, repository.name, "first"
        )
        builder.insert_layer(
            digest,
            json.dumps(
                {
                    "id": "i1",
                }
            ),
        )

        # Store the manifest and point the `first` tag at it.
        manifest = builder.build(docker_v2_signing_key)

        repo_ref = RepositoryReference.for_repo_obj(repository)
        registry_model.create_manifest_and_retarget_tag(
            repo_ref, manifest, "first", storage, raise_on_error=True
        )

        # Delete the temp reference so only the tag keeps the blob alive.
        _delete_temp_links(repository)

        # Delete the tag; GC should now remove both the storage row and the file.
        delete_tag(repository, "first")
        assert_deleted(repository, "i1")

        # Ensure the CAS path is gone.
        assert not storage.exists({preferred}, storage.blob_path(digest))
def test_purge_repository_storage_blob(default_tag_policy, initialized_db):
    """
    Purging every repository should remove a blob's backing object from storage
    once no other repository references it (via ManifestBlob, a legacy Image
    row, or another UploadedBlob).
    """
    with populate_storage_for_gc():
        expected_blobs_removed_from_storage = set()
        preferred = storage.preferred_locations[0]

        # Check that each existing UploadedBlob has a backing object in storage.
        for repo in database.Repository.select().order_by(database.Repository.id):
            for uploadedblob in UploadedBlob.select().where(UploadedBlob.repository == repo):
                assert storage.exists(
                    {preferred}, storage.blob_path(uploadedblob.blob.content_checksum)
                )

        # Remove everything.
        for repo in database.Repository.select():
            for uploadedblob in UploadedBlob.select().where(UploadedBlob.repository == repo):
                # Check if only this repository is referencing the uploadedblob.
                # If so, the blob should be removed from storage after the purge.
                has_dependent_manifestblob = (
                    ManifestBlob.select()
                    .where(
                        ManifestBlob.blob == uploadedblob.blob,
                        ManifestBlob.repository != repo,
                    )
                    .count()
                )

                has_dependent_image = (
                    Image.select()
                    .where(
                        Image.storage == uploadedblob.blob,
                        Image.repository != repo,
                    )
                    .count()
                )

                # BUG FIX: the original wrote `UploadedBlob == uploadedblob`,
                # comparing the model class itself to a row instead of building
                # a column expression; the intended check — mirroring the two
                # above — is for other repositories referencing the same blob.
                has_dependent_uploadedblobs = (
                    UploadedBlob.select()
                    .where(
                        UploadedBlob.blob == uploadedblob.blob,
                        UploadedBlob.repository != repo,
                    )
                    .count()
                )

                if (
                    not has_dependent_manifestblob
                    and not has_dependent_image
                    and not has_dependent_uploadedblobs
                ):
                    expected_blobs_removed_from_storage.add(uploadedblob.blob)

            assert model.gc.purge_repository(repo, force=True)

        # Every blob we predicted to be unreferenced must now be gone.
        for removed_blob_from_storage in expected_blobs_removed_from_storage:
            assert not storage.exists(
                {preferred}, storage.blob_path(removed_blob_from_storage.content_checksum)
            )
def populate_storage_for_gc():
    """
    Write a dummy payload into FakeStorage for every ImageStorage row, then yield.

    NOTE(review): call sites use this in a `with` statement — presumably it is
    wrapped with @contextmanager above this definition; confirm.
    """
    location = storage.preferred_locations[0]
    payload = b"hello world"
    for row in ImageStorage.select():
        path = storage.blob_path(row.content_checksum)
        storage.put_content({location}, path, payload)
        assert storage.exists({location}, path)
    yield
def test_images_shared_cas_with_new_blob_table(default_tag_policy, initialized_db):
    """
    A repository with a tag and image that shares its CAS path with a record in
    the new Blob (ApprBlob) table.

    Deleting the first tag should delete the first image and its storage record,
    but must NOT remove the file in storage, as it shares its CAS path with the
    blob row.
    """
    with assert_gc_integrity(expect_storage_removed=True):
        repository = create_repository()

        # Create an image storage record and an ApprBlob with the same checksum.
        # BUG FIX: hashlib.sha256 requires bytes; the original passed a str,
        # which raises TypeError on Python 3.
        content = b"hello world"
        digest = "sha256:" + hashlib.sha256(content).hexdigest()
        preferred = storage.preferred_locations[0]
        storage.put_content({preferred}, storage.blob_path(digest), content)

        media_type = database.MediaType.get(name="text/plain")

        is1 = database.ImageStorage.create(content_checksum=digest, uploading=False)
        database.ApprBlob.create(digest=digest, size=0, media_type=media_type)

        location = database.ImageStorageLocation.get(name=preferred)
        database.ImageStoragePlacement.create(location=location, storage=is1)

        # Ensure the CAS path exists.
        assert storage.exists({preferred}, storage.blob_path(digest))

        # Create the image in the repository, and the tag.
        first_image = Image.create(
            docker_image_id="i1", repository=repository, storage=is1, ancestors="/"
        )
        store_tag_manifest(
            repository.namespace_user.username,
            repository.name,
            "first",
            first_image.docker_image_id,
        )

        assert_not_deleted(repository, "i1")

        # Delete the tag.
        delete_tag(repository, "first")
        assert_deleted(repository, "i1")

        # Ensure the CAS path still exists, as it is referenced by the Blob table.
        assert storage.exists({preferred}, storage.blob_path(digest))
def test_manifest_v2_shared_config_and_blobs(app, default_tag_policy):
    """
    GCing a tag that refers to a V2 manifest sharing its config and some blobs
    with another manifest must NOT GC the config blob or the shared blobs.
    """
    repo = model.repository.create_repository("devtable", "newrepo", None)

    manifest_a, built_a = create_manifest_for_testing(
        repo, differentiation_field="1", include_shared_blob=True
    )
    manifest_b, built_b = create_manifest_for_testing(
        repo, differentiation_field="2", include_shared_blob=True
    )

    # Sanity check: the two manifests really do share blobs and the config.
    assert set(built_a.local_blob_digests) & set(built_b.local_blob_digests)
    assert built_a.config.digest == built_b.config.digest

    # Point a tag at each manifest.
    model.oci.tag.retarget_tag("tag1", manifest_a)
    model.oci.tag.retarget_tag("tag2", manifest_b)

    with assert_gc_integrity(expect_storage_removed=True):
        # Delete tag2 and garbage collect.
        model.oci.tag.delete_tag(repo, "tag2")
        assert gc_now(repo)

        # Every blob referenced by the surviving manifest must still be readable.
        preferred = storage.preferred_locations[0]
        for digest in built_a.local_blob_digests:
            row = ImageStorage.get(content_checksum=digest)
            assert row.cas_path
            storage.get_content({preferred}, storage.blob_path(row.content_checksum))
def _populate_blob(repo, content):
    """
    Store `content` in storage under its SHA-256 digest and register a
    temp-linked blob record for `repo`.

    Returns a (blob, digest) tuple.
    """
    digest = str(sha256_digest(content))
    location = ImageStorageLocation.get(name="local_us")
    # BUG FIX: the stored bytes must be the content the digest and length were
    # computed from; the original wrote the literal "somecontent", so the CAS
    # object never matched its registered checksum/size.
    storage.put_content(["local_us"], storage.blob_path(digest), content)
    blob = model.blob.store_blob_record_and_temp_link_in_repo(
        repo, digest, location, len(content), 120
    )
    return blob, digest
def _populate_blob(repo, content):
    """
    Store `content` (bytes) under its SHA-256 digest and register a temp-linked
    blob record for `repo`. Returns a (blob, digest) tuple.
    """
    assert isinstance(content, bytes)
    checksum = sha256_digest(content)
    placement = ImageStorageLocation.get(name="local_us")
    storage.put_content(["local_us"], storage.blob_path(checksum), content)
    record = model.blob.store_blob_record_and_temp_link_in_repo(
        repo, checksum, placement, len(content), 120
    )
    return record, checksum
def test_image_with_cas(default_tag_policy, initialized_db):
    """
    A repository with a tag pointing to an image backed by CAS.

    Deleting and GCing the tag should result in the storage and its CAS data
    being removed.
    """
    with assert_gc_integrity(expect_storage_removed=True):
        repository = create_repository()

        # Create an image storage record under CAS.
        # BUG FIX: hashlib.sha256 requires bytes; the original passed a str,
        # which raises TypeError on Python 3.
        content = b"hello world"
        digest = "sha256:" + hashlib.sha256(content).hexdigest()
        preferred = storage.preferred_locations[0]
        storage.put_content({preferred}, storage.blob_path(digest), content)

        image_storage = database.ImageStorage.create(content_checksum=digest, uploading=False)
        location = database.ImageStorageLocation.get(name=preferred)
        database.ImageStoragePlacement.create(location=location, storage=image_storage)

        # Ensure the CAS path exists.
        assert storage.exists({preferred}, storage.blob_path(digest))

        # Create the image and the tag.
        first_image = Image.create(
            docker_image_id="i1", repository=repository, storage=image_storage, ancestors="/"
        )
        store_tag_manifest(
            repository.namespace_user.username,
            repository.name,
            "first",
            first_image.docker_image_id,
        )

        assert_not_deleted(repository, "i1")

        # Delete the tag.
        delete_tag(repository, "first")
        assert_deleted(repository, "i1")

        # Ensure the CAS path is gone.
        assert not storage.exists({preferred}, storage.blob_path(digest))
def test_garbage_collect_storage(default_tag_policy, initialized_db):
    """
    Running storage GC over blobs that are still referenced must leave their
    backing objects in place.
    """
    with populate_storage_for_gc():
        preferred = storage.preferred_locations[0]

        # Pick a random, non-empty sample of the uploaded blobs.
        all_uploaded = list(UploadedBlob.select())
        sample_size = random.randrange(1, len(all_uploaded) + 1)
        sampled = random.sample(all_uploaded, sample_size)

        model.storage.garbage_collect_storage([entry.blob.id for entry in sampled])

        # Nothing was actually collectible, so every sampled blob must survive.
        for entry in sampled:
            assert storage.exists(
                {preferred}, storage.blob_path(entry.blob.content_checksum)
            )
def test_images_shared_cas(default_tag_policy, initialized_db):
    """
    A repository, each two tags, pointing to the same image, which has image
    storage with the same *CAS path*, but *distinct records*.

    Deleting the first tag should delete the first image, and its storage, but
    not the file in storage, as it shares its CAS path.
    """
    with assert_gc_integrity(expect_storage_removed=True):
        repository = create_repository()

        # Create two image storage records with the same content checksum.
        content = b"hello world"
        digest = "sha256:" + hashlib.sha256(content).hexdigest()
        preferred = storage.preferred_locations[0]
        storage.put_content({preferred}, storage.blob_path(digest), content)

        is1 = database.ImageStorage.create(content_checksum=digest)
        is2 = database.ImageStorage.create(content_checksum=digest)

        location = database.ImageStorageLocation.get(name=preferred)

        database.ImageStoragePlacement.create(location=location, storage=is1)
        database.ImageStoragePlacement.create(location=location, storage=is2)

        # Temp link so the blob stays reachable while the manifests are written.
        model.blob.store_blob_record_and_temp_link_in_repo(
            repository, digest, location, len(content), 120
        )

        # Ensure the CAS path exists.
        assert storage.exists({preferred}, storage.blob_path(digest))

        repo_ref = RepositoryReference.for_repo_obj(repository)

        # Store a manifest pointing to that path as `first`.
        builder = DockerSchema1ManifestBuilder(
            repository.namespace_user.username, repository.name, "first"
        )
        builder.insert_layer(
            digest,
            json.dumps(
                {
                    "id": "i1",
                }
            ),
        )
        manifest = builder.build(docker_v2_signing_key)
        registry_model.create_manifest_and_retarget_tag(
            repo_ref, manifest, "first", storage, raise_on_error=True
        )

        tag_ref = registry_model.get_repo_tag(repo_ref, "first")
        manifest_ref = registry_model.get_manifest_for_tag(tag_ref)
        registry_model.populate_legacy_images_for_testing(manifest_ref, storage)

        # Store another as `second`.
        builder = DockerSchema1ManifestBuilder(
            repository.namespace_user.username, repository.name, "second"
        )
        builder.insert_layer(
            digest,
            json.dumps(
                {
                    "id": "i2",
                }
            ),
        )
        manifest = builder.build(docker_v2_signing_key)
        created, _ = registry_model.create_manifest_and_retarget_tag(
            repo_ref, manifest, "second", storage, raise_on_error=True
        )

        tag_ref = registry_model.get_repo_tag(repo_ref, "second")
        manifest_ref = registry_model.get_manifest_for_tag(tag_ref)
        registry_model.populate_legacy_images_for_testing(manifest_ref, storage)

        # Manually retarget the second manifest's blob to the second row, so the
        # two manifests end up referencing distinct ImageStorage rows (either
        # row may have been linked first, hence the try/except on both orders).
        try:
            second_blob = ManifestBlob.get(manifest=created._db_id, blob=is1)
            second_blob.blob = is2
            second_blob.save()
        except ManifestBlob.DoesNotExist:
            second_blob = ManifestBlob.get(manifest=created._db_id, blob=is2)
            second_blob.blob = is1
            second_blob.save()

        # Delete the temp reference.
        _delete_temp_links(repository)

        # Ensure the legacy images exist.
        assert_not_deleted(repository, "i1", "i2")

        # Delete the first tag.
        delete_tag(repository, "first")
        assert_deleted(repository, "i1")
        assert_not_deleted(repository, "i2")

        # Ensure the CAS path still exists (the second row still references it).
        assert storage.exists({preferred}, storage.blob_path(digest))
def assert_gc_integrity(expect_storage_removed=True): """ Specialized assertion for ensuring that GC cleans up all dangling storages and labels, invokes the callback for images removed and doesn't invoke the callback for images *not* removed. """ # Add a callback for when images are removed. removed_image_storages = [] remove_callback = model.config.register_image_cleanup_callback( removed_image_storages.extend) # Store existing storages. We won't verify these for existence because they # were likely created as test data. existing_digests = set() for storage_row in ImageStorage.select(): if storage_row.cas_path: existing_digests.add(storage_row.content_checksum) for blob_row in ApprBlob.select(): existing_digests.add(blob_row.digest) # Store the number of dangling objects. existing_storage_count = _get_dangling_storage_count() existing_label_count = _get_dangling_label_count() existing_manifest_count = _get_dangling_manifest_count() # Yield to the GC test. with check_transitive_modifications(): try: yield finally: remove_callback() # Ensure the number of dangling storages, manifests and labels has not changed. updated_storage_count = _get_dangling_storage_count() assert updated_storage_count == existing_storage_count updated_label_count = _get_dangling_label_count() assert updated_label_count == existing_label_count, _get_dangling_labels() updated_manifest_count = _get_dangling_manifest_count() assert updated_manifest_count == existing_manifest_count # Ensure that for each call to the image+storage cleanup callback, the image and its # storage is not found *anywhere* in the database. for removed_image_and_storage in removed_image_storages: assert isinstance(removed_image_and_storage, Image) try: # NOTE: SQLite can and will reuse AUTOINCREMENT IDs occasionally, so if we find a row # with the same ID, make sure it does not have the same Docker Image ID. 
# See: https://www.sqlite.org/autoinc.html found_image = Image.get(id=removed_image_and_storage.id) assert (found_image.docker_image_id != removed_image_and_storage.docker_image_id ), "Found unexpected removed image %s under repo %s" % ( found_image.id, found_image.repository, ) except Image.DoesNotExist: pass # Ensure that image storages are only removed if not shared. shared = Image.select().where( Image.storage == removed_image_and_storage.storage_id).count() if shared == 0: shared = (ManifestBlob.select().where( ManifestBlob.blob == removed_image_and_storage.storage_id).count()) if shared == 0: shared = (UploadedBlob.select().where( UploadedBlob.blob == removed_image_and_storage.storage_id).count()) if shared == 0: with pytest.raises(ImageStorage.DoesNotExist): ImageStorage.get(id=removed_image_and_storage.storage_id) with pytest.raises(ImageStorage.DoesNotExist): ImageStorage.get(uuid=removed_image_and_storage.storage.uuid) # Ensure all CAS storage is in the storage engine. preferred = storage.preferred_locations[0] for storage_row in ImageStorage.select(): if storage_row.content_checksum in existing_digests: continue if storage_row.cas_path: storage.get_content({preferred}, storage.blob_path( storage_row.content_checksum)) for blob_row in ApprBlob.select(): if blob_row.digest in existing_digests: continue storage.get_content({preferred}, storage.blob_path(blob_row.digest)) # Ensure all tags have valid manifests. for manifest in {t.manifest for t in Tag.select()}: # Ensure that the manifest's blobs all exist. found_blobs = { b.blob.content_checksum for b in ManifestBlob.select().where( ManifestBlob.manifest == manifest) } parsed = parse_manifest_from_bytes( Bytes.for_string_or_unicode(manifest.manifest_bytes), manifest.media_type.name) assert set(parsed.local_blob_digests) == found_blobs
def assert_gc_integrity(expect_storage_removed=True, check_oci_tags=True):
    """
    Specialized assertion for ensuring that GC cleans up all dangling storages
    and labels, invokes the callback for images removed and doesn't invoke the
    callback for images *not* removed.

    NOTE(review): call sites use this in a `with` statement — presumably it is
    wrapped with @contextmanager above this definition; confirm.
    """
    # Add a callback for when images are removed.
    removed_image_storages = []
    model.config.register_image_cleanup_callback(removed_image_storages.extend)

    # Store the number of dangling storages and labels before the test runs.
    existing_storage_count = _get_dangling_storage_count()
    existing_label_count = _get_dangling_label_count()
    existing_manifest_count = _get_dangling_manifest_count()

    # Yield to the GC test.
    yield

    # Ensure the number of dangling storages, manifests and labels has not changed.
    updated_storage_count = _get_dangling_storage_count()
    assert updated_storage_count == existing_storage_count

    updated_label_count = _get_dangling_label_count()
    assert updated_label_count == existing_label_count, _get_dangling_labels()

    updated_manifest_count = _get_dangling_manifest_count()
    assert updated_manifest_count == existing_manifest_count

    # Ensure that for each call to the image+storage cleanup callback, the image
    # and its storage is not found *anywhere* in the database.
    for removed_image_and_storage in removed_image_storages:
        with pytest.raises(Image.DoesNotExist):
            Image.get(id=removed_image_and_storage.id)

        # Ensure that image storages are only removed if not shared by any
        # Image or ManifestBlob row.
        shared = Image.select().where(
            Image.storage == removed_image_and_storage.storage_id
        ).count()
        if shared == 0:
            shared = (
                ManifestBlob.select()
                .where(ManifestBlob.blob == removed_image_and_storage.storage_id)
                .count()
            )

        if shared == 0:
            with pytest.raises(ImageStorage.DoesNotExist):
                ImageStorage.get(id=removed_image_and_storage.storage_id)

            with pytest.raises(ImageStorage.DoesNotExist):
                ImageStorage.get(uuid=removed_image_and_storage.storage.uuid)

    # Ensure all CAS storage is in the storage engine.
    preferred = storage.preferred_locations[0]
    for storage_row in ImageStorage.select():
        if storage_row.cas_path:
            storage.get_content(
                {preferred}, storage.blob_path(storage_row.content_checksum)
            )

    for blob_row in ApprBlob.select():
        storage.get_content({preferred}, storage.blob_path(blob_row.digest))

    # Ensure there are no dangling OCI tags.
    if check_oci_tags:
        oci_tags = {t.id for t in Tag.select()}
        referenced_oci_tags = {t.tag_id for t in TagToRepositoryTag.select()}
        assert not oci_tags - referenced_oci_tags

    # Ensure all tags have valid manifests.
    for manifest in {t.manifest for t in Tag.select()}:
        # Ensure that the manifest's blobs all exist.
        found_blobs = {
            b.blob.content_checksum
            for b in ManifestBlob.select().where(ManifestBlob.manifest == manifest)
        }

        parsed = parse_manifest_from_bytes(
            Bytes.for_string_or_unicode(manifest.manifest_bytes),
            manifest.media_type.name,
        )
        assert set(parsed.local_blob_digests) == found_blobs