def test_pull(local_engine_empty, pg_repo_remote, download_all):
    """Clone a repository, commit a new image upstream, then verify that a pull
    makes the new image available locally without disturbing existing data."""
    # The local engine connects to the remote engine directly, so we can use the
    # remote's actual hostname: the clone procedure reuses that connection string,
    # and an /etc/hosts entry maps the remote engine to localhost on the host.
    clone(pg_repo_remote, local_repository=PG_MNT, download_all=download_all)
    PG_MNT.images.by_hash(pg_repo_remote.head.image_hash).checkout()

    new_head = _add_image_to_repo(pg_repo_remote)

    # The clone must be unaffected by the upstream commit...
    assert PG_MNT.run_sql("SELECT * FROM fruits") == [(1, "apple"), (2, "orange")]
    # ...and must not know about the new image yet.
    with pytest.raises(ImageNotFoundError):
        PG_MNT.images.by_hash(new_head.image_hash)

    # Pull and check out the newly-arrived commit: the new row is now visible.
    PG_MNT.pull()
    pulled_head = PG_MNT.images.by_hash(new_head.image_hash)
    pulled_head.checkout()
    assert PG_MNT.run_sql("SELECT * FROM fruits") == [
        (1, "apple"),
        (2, "orange"),
        (3, "mayonnaise"),
    ]
    assert PG_MNT.head == pulled_head
def test_pull_download_error(local_engine_empty, unprivileged_pg_repo, clean_minio, interrupted):
    """If an object download is aborted or fails mid-clone, the repository
    metadata must still be registered locally so that re-running the clone
    recovers and fetches the missing objects.

    ``interrupted`` selects the failure mode of the flaky S3 handler
    (abort vs. error after the first object) — see ``_flaky_handler``.
    """
    with patch.dict(
        "splitgraph.hooks.external_objects._EXTERNAL_OBJECT_HANDLERS",
        {"S3": _flaky_handler(interrupted)},
    ):
        # Fix: the exception was bound to an unused name `e`; don't capture it.
        with pytest.raises(Exception):
            clone(unprivileged_pg_repo, local_repository=PG_MNT, download_all=True)

    # Check that the pull succeeded (repository registered locally) but only one
    # of the two objects was downloaded; both remain tracked as external.
    assert repository_exists(PG_MNT)
    assert len(PG_MNT.objects.get_all_objects()) == 2
    assert len(PG_MNT.objects.get_downloaded_objects()) == 1
    assert (
        len(PG_MNT.objects.get_external_object_locations(PG_MNT.objects.get_all_objects())) == 2
    )
    assert (
        PG_MNT.run_sql(
            "SELECT COUNT(*) FROM splitgraph_meta.object_cache_status",
            return_shape=ResultShape.ONE_ONE,
        )
        == 1
    )

    # Retrying the clone (handler no longer patched) downloads the missing object.
    clone(unprivileged_pg_repo, local_repository=PG_MNT, download_all=True)

    assert len(PG_MNT.objects.get_all_objects()) == 2
    assert len(PG_MNT.objects.get_downloaded_objects()) == 2
    assert len(list(PG_MNT.images)) == 2
    assert (
        PG_MNT.run_sql(
            "SELECT COUNT(*) FROM splitgraph_meta.object_cache_status",
            return_shape=ResultShape.ONE_ONE,
        )
        == 2
    )
def test_s3_presigned_url(local_engine_empty, unprivileged_pg_repo, clean_minio):
    """Check that S3 URL signing works both when invoked locally and via the
    stored procedure on the remote engine."""
    # Create a new object to sign URLs for.
    clone(unprivileged_pg_repo, local_repository=PG_MNT, download_all=False)
    PG_MNT.images["latest"].checkout()
    PG_MNT.run_sql("INSERT INTO fruits VALUES (3, 'mayonnaise')")
    head = PG_MNT.commit()
    object_id = head.get_table("fruits").objects[0]

    endpoint = "%s:%s" % (S3_HOST, S3_PORT)

    # Sign locally first: the tests currently have access to the S3
    # credentials on the host they're running on.
    for signer in (get_object_upload_urls, get_object_download_urls):
        urls_local = signer(endpoint, [object_id])
        assert len(urls_local) == 1
        assert len(urls_local[0]) == 3

    # Now exercise the signing stored procedure on the remote engine.
    urls = unprivileged_pg_repo.engine.run_sql(
        "SELECT * FROM splitgraph_api.get_object_upload_urls(%s, %s)",
        (endpoint, [object_id]),
        return_shape=ResultShape.ONE_ONE,
    )
    assert len(urls) == 1
    assert len(urls[0]) == 3
def test_pull_single_image(local_engine_empty, pg_repo_remote, download_all):
    """Clone/pull with ``single_image`` set: only the requested image (and its
    tags) should be transferred, and repeated clones must be idempotent."""
    # Set up a remote with two tagged images (plus the extra image added by
    # _add_image_to_repo, for three in total).
    head = pg_repo_remote.head
    head_1 = _add_image_to_repo(pg_repo_remote)
    head.tag("tag_1")
    head_1.tag("tag_2")
    pg_repo_remote.commit_engines()

    # Clone a single image first
    assert len(PG_MNT.images()) == 0
    assert len(PG_MNT.objects.get_downloaded_objects()) == 0
    assert len(pg_repo_remote.images()) == 3
    clone(
        pg_repo_remote,
        local_repository=PG_MNT,
        download_all=download_all,
        single_image=head.image_hash[:12],
    )

    # Check only one image got downloaded and check we didn't try
    # to pull tags for images that we weren't pulling.
    assert len(PG_MNT.images()) == 1
    assert PG_MNT.images()[0] == head
    assert PG_MNT.images["tag_1"] == head
    assert PG_MNT.images.by_tag("tag_2", raise_on_none=False) is None

    # Try doing the same thing again (should be a no-op).
    clone(
        pg_repo_remote,
        local_repository=PG_MNT,
        download_all=download_all,
        single_image=head.image_hash[:12],
    )
    assert len(PG_MNT.images()) == 1

    # If we're downloading objects too, check only the original objects got downloaded
    if download_all:
        assert len(PG_MNT.objects.get_downloaded_objects()) == 2

    # Pull the remainder of the repo (the second image and its tag).
    PG_MNT.pull(single_image=head_1.image_hash, download_all=download_all)
    assert len(PG_MNT.images()) == 2
    if download_all:
        assert len(PG_MNT.objects.get_downloaded_objects()) == 3
    assert PG_MNT.images["tag_2"] == head_1

    # Pull the whole repo: all three images are now present locally.
    PG_MNT.pull()
    assert len(PG_MNT.images()) == 3
def test_pulls_with_lazy_object_downloads(local_engine_empty, pg_repo_remote):
    """A lazy (``download_all=False``) clone/pull must fetch objects only when
    an image that needs them is checked out."""
    clone(pg_repo_remote, local_repository=PG_MNT, download_all=False)
    # Nothing should have been downloaded before the first checkout.
    assert not PG_MNT.objects.get_downloaded_objects()

    remote_head = pg_repo_remote.head
    PG_MNT.images.by_hash(remote_head.image_hash).checkout()
    # Checkout pulls the original fruits and vegetables tables (two objects),
    # which at this point is everything the repository knows about.
    assert len(PG_MNT.objects.get_downloaded_objects()) == 2
    assert sorted(PG_MNT.objects.get_downloaded_objects()) == sorted(
        PG_MNT.objects.get_all_objects()
    )

    # In the meantime, make two branches off of origin (a total of 3 commits).
    pg_repo_remote.run_sql("INSERT INTO fruits VALUES (3, 'mayonnaise')")
    left_image = pg_repo_remote.commit()
    remote_head.checkout()
    pg_repo_remote.run_sql("INSERT INTO fruits VALUES (3, 'mustard')")
    right_image = pg_repo_remote.commit()

    # A lazy pull registers the three versions of fruits plus the original
    # vegetables...
    PG_MNT.pull(download_all=False)
    assert len(PG_MNT.objects.get_all_objects()) == 4
    # ...but physically we still only hold the two original objects.
    assert len(PG_MNT.objects.get_downloaded_objects()) == 2

    # Each branch checkout fetches exactly the one new fruits fragment it needs:
    # the left commit depends only on the root.
    PG_MNT.images.by_hash(left_image.image_hash).checkout()
    assert len(PG_MNT.objects.get_downloaded_objects()) == 3
    PG_MNT.images.by_hash(right_image.image_hash).checkout()
    assert len(PG_MNT.objects.get_downloaded_objects()) == 4

    # Only now is every known object materialized locally.
    assert sorted(PG_MNT.objects.get_downloaded_objects()) == sorted(
        PG_MNT.objects.get_all_objects()
    )
def test_pull_tag_overwriting(local_engine_empty, pg_repo_remote):
    """Tags that already exist locally must not be moved by a clone/pull unless
    ``overwrite_tags=True`` is passed."""
    # Two remote images: head tagged tag_1, head_1 tagged tag_2 and tag_3.
    head = pg_repo_remote.head
    head_1 = _add_image_to_repo(pg_repo_remote)
    head.tag("tag_1")
    head_1.tag("tag_2")
    head_1.tag("tag_3")
    pg_repo_remote.commit_engines()

    # Clone a single image
    clone(
        pg_repo_remote,
        local_repository=PG_MNT,
        single_image=head.image_hash[:12],
    )
    assert len(PG_MNT.images()) == 1
    assert PG_MNT.images()[0] == head
    assert PG_MNT.images["tag_1"] == head
    assert PG_MNT.images.by_tag("tag_2", raise_on_none=False) is None

    # Clone again, check nothing has changed.
    clone(
        pg_repo_remote,
        local_repository=PG_MNT,
        single_image=head.image_hash[:12],
    )
    assert len(PG_MNT.images()) == 1
    assert PG_MNT.images["tag_1"] == head
    assert PG_MNT.images.by_tag("tag_2", raise_on_none=False) is None

    # Pull the remainder of the repo
    PG_MNT.pull(single_image=head_1.image_hash)
    assert len(PG_MNT.images()) == 2
    assert PG_MNT.images["tag_2"] == head_1

    # Now update the tag on the remote (tag_2 now points to head upstream).
    head.tag("tag_2")
    pg_repo_remote.commit_engines()

    # Clone head again, check tag_2 wasn't overwritten (is still pointing to head_1)
    clone(
        pg_repo_remote,
        local_repository=PG_MNT,
        single_image=head.image_hash[:12],
    )
    assert PG_MNT.images["tag_1"] == head
    assert PG_MNT.images["tag_2"] == head_1
    assert PG_MNT.images["tag_3"] == head_1

    # Clone head again, this time overwriting the tag. Note tag_3 stays put:
    # we only cloned head, which doesn't carry tag_3.
    clone(
        pg_repo_remote,
        local_repository=PG_MNT,
        single_image=head.image_hash[:12],
        overwrite_tags=True,
    )
    assert len(PG_MNT.images()) == 2
    assert PG_MNT.images["tag_1"] == head
    assert PG_MNT.images["tag_2"] == head
    assert PG_MNT.images["tag_3"] == head_1

    # Update tag_3 to point to head as well
    head.tag("tag_3")
    pg_repo_remote.commit_engines()

    # Pull repo, check tag_3 hasn't moved.
    PG_MNT.pull()
    assert PG_MNT.images["tag_1"] == head
    assert PG_MNT.images["tag_2"] == head
    assert PG_MNT.images["tag_3"] == head_1

    # Pull again overwriting all tags, check tags have moved.
    PG_MNT.pull(overwrite_tags=True)
    assert PG_MNT.images["tag_1"] == head
    assert PG_MNT.images["tag_2"] == head
    assert PG_MNT.images["tag_3"] == head
def test_push(local_engine_empty, pg_repo_remote):
    """Clone, commit locally and push back upstream, covering delta pushes,
    full-snap recommits and force-pushing overwritten objects."""
    # Clone from the remote engine and check out its head.
    clone(pg_repo_remote, local_repository=PG_MNT)
    remote_head = pg_repo_remote.head
    PG_MNT.images.by_hash(remote_head.image_hash).checkout()

    # Change our local copy, commit, then push the new image to the remote.
    PG_MNT.run_sql("INSERT INTO fruits VALUES (3, 'mayonnaise')")
    head_1 = PG_MNT.commit()
    PG_MNT.push(remote_repository=pg_repo_remote)

    # The original mountpoint got updated: new object registered remotely and
    # the pushed image checks out there with the new row.
    assert len(pg_repo_remote.objects.get_all_objects()) == 3
    pg_repo_remote.images.by_hash(head_1.image_hash).checkout()
    assert pg_repo_remote.run_sql("SELECT * FROM fruits") == [
        (1, "apple"),
        (2, "orange"),
        (3, "mayonnaise"),
    ]

    # Recommit the local image as a full snap and push it out.
    head_2 = PG_MNT.commit(snap_only=True)
    PG_MNT.push(remote_repository=pg_repo_remote)
    assert head_2.get_table("fruits").objects[0] in pg_repo_remote.objects.get_all_objects()

    # Recommit it again, changing the sort order (overwrites the same object).
    head_3 = PG_MNT.commit(snap_only=True, in_fragment_order={"fruits": ["name"]}, overwrite=True)
    assert head_3.get_table("fruits").objects == head_2.get_table("fruits").objects

    # Same query against the rewritten object, used both locally and remotely.
    fruit_id_query = SQL("SELECT fruit_id FROM {}.{}").format(
        Identifier(SPLITGRAPH_META_SCHEMA),
        Identifier(head_2.get_table("fruits").objects[0]),
    )
    assert PG_MNT.run_sql(fruit_id_query, return_shape=ResultShape.MANY_ONE) == [1, 3, 2]

    # Force push, overwriting both the object metadata and the actual object.
    PG_MNT.push(
        remote_repository=pg_repo_remote,
        single_image=head_3.image_hash,
        overwrite_objects=True,
        reupload_objects=True,
    )
    assert pg_repo_remote.run_sql(fruit_id_query, return_shape=ResultShape.MANY_ONE) == [1, 3, 2]
def test_s3_push_pull(local_engine_empty, unprivileged_pg_repo, pg_repo_remote_registry, clean_minio):
    """Test pushing/pulling when the objects are uploaded to a remote storage
    (S3/Minio) instead of to the actual remote DB."""
    # In the beginning, the registry has two objects, all remote
    objects = pg_repo_remote_registry.objects.get_all_objects()
    assert len(unprivileged_pg_repo.objects.get_external_object_locations(list(objects))) == 2
    assert len(objects) == 2

    clone(unprivileged_pg_repo, local_repository=PG_MNT, download_all=False)

    # Add a couple of commits, this time on the cloned copy (two sibling images).
    head = PG_MNT.images["latest"]
    head.checkout()
    PG_MNT.run_sql("INSERT INTO fruits VALUES (3, 'mayonnaise')")
    left = PG_MNT.commit()
    head.checkout()
    PG_MNT.run_sql("INSERT INTO fruits VALUES (3, 'mustard')")
    right = PG_MNT.commit()

    # Push to origin, but this time upload the actual objects instead.
    PG_MNT.push(remote_repository=unprivileged_pg_repo, handler="S3", handler_options={})

    # Check that the actual objects don't exist on the remote but are instead
    # registered with an URL. All the objects on pgcache were registered remotely.
    objects = pg_repo_remote_registry.objects.get_all_objects()
    local_objects = PG_MNT.objects.get_all_objects()
    assert all(o in objects for o in local_objects)

    # Two new non-local objects in the local engine, both registered as
    # non-local on the remote engine.
    ext_objects_orig = PG_MNT.objects.get_external_object_locations(list(objects))
    ext_objects_pull = unprivileged_pg_repo.objects.get_external_object_locations(list(objects))
    assert len(ext_objects_orig) == 4
    assert all(e in ext_objects_pull for e in ext_objects_orig)

    # Destroy the pulled mountpoint and recreate it again.
    assert len(PG_MNT.objects.get_downloaded_objects()) == 4
    PG_MNT.delete()
    # Make sure we don't have any leftover physical objects.
    PG_MNT.objects.cleanup()
    assert len(PG_MNT.objects.get_downloaded_objects()) == 0

    clone(unprivileged_pg_repo, local_repository=PG_MNT, download_all=False)

    # Proceed as per the lazy checkout tests to make sure we don't download
    # more than required. Make sure we still haven't downloaded anything.
    assert len(PG_MNT.objects.get_downloaded_objects()) == 0

    # Check out left commit: since it only depends on the root, we should
    # download just the new version of fruits.
    left.checkout()
    assert (len(PG_MNT.objects.get_downloaded_objects()) == 3
            )  # now have 2 versions of fruits + 1 vegetables

    right.checkout()
    assert len(PG_MNT.objects.get_downloaded_objects()) == 4

    # Only now we actually have all the objects materialized.
    assert sorted(PG_MNT.objects.get_downloaded_objects()) == sorted(
        PG_MNT.objects.get_all_objects()
    )
def test_push_upload_error(local_engine_empty, unprivileged_pg_repo, pg_repo_remote_registry, clean_minio, interrupted):
    """If the object upload fails (e.g. Minio is inaccessible or the upload was
    aborted), the whole push must fail rather than leave the registry in an
    inconsistent state, and a subsequent retry must succeed.

    ``interrupted`` selects the flaky handler's failure mode; when True, the
    first object still makes it to storage before the push is aborted.
    """
    clone(unprivileged_pg_repo, local_repository=PG_MNT, download_all=False)
    PG_MNT.images["latest"].checkout()
    PG_MNT.run_sql("INSERT INTO fruits VALUES (3, 'mayonnaise')")
    PG_MNT.run_sql("INSERT INTO vegetables VALUES (3, 'cucumber')")
    head = PG_MNT.commit()

    with patch.dict(
        "splitgraph.hooks.external_objects._EXTERNAL_OBJECT_HANDLERS",
        {"S3": _flaky_handler(incomplete=interrupted)},
    ):
        # Fix: the exception was bound to an unused name `e`; don't capture it.
        with pytest.raises(Exception):
            PG_MNT.push(remote_repository=unprivileged_pg_repo, handler="S3", handler_options={})

    # The new image must not have been registered upstream.
    assert head not in unprivileged_pg_repo.images
    # Only the two original tables from the original image exist upstream.
    assert (
        pg_repo_remote_registry.engine.run_sql(
            "SELECT COUNT(*) FROM splitgraph_meta.tables",
            return_shape=ResultShape.ONE_ONE,
        )
        == 2
    )

    # Registry had 2 objects before the upload -- if we interrupted the upload,
    # we only managed to upload the first object that was registered (even if
    # the image wasn't).
    expected_object_count = 3 if interrupted else 2
    assert len(pg_repo_remote_registry.objects.get_all_objects()) == expected_object_count

    # Two new objects not registered remotely since the upload failed.
    assert (
        local_engine_empty.run_sql(
            "SELECT COUNT(*) FROM splitgraph_meta.object_locations",
            return_shape=ResultShape.ONE_ONE,
        )
        == expected_object_count
    )
    assert (
        pg_repo_remote_registry.engine.run_sql(
            "SELECT COUNT(*) FROM splitgraph_meta.object_locations",
            return_shape=ResultShape.ONE_ONE,
        )
        == expected_object_count
    )

    # Now do the push normally and check the image exists upstream.
    PG_MNT.push(remote_repository=unprivileged_pg_repo, handler="S3", handler_options={})

    assert any(i.image_hash == head.image_hash for i in unprivileged_pg_repo.images)
    assert len(pg_repo_remote_registry.objects.get_all_objects()) == 4
    assert (
        local_engine_empty.run_sql(
            "SELECT COUNT(*) FROM splitgraph_meta.object_locations",
            return_shape=ResultShape.ONE_ONE,
        )
        == 4
    )
    assert (
        pg_repo_remote_registry.engine.run_sql(
            "SELECT COUNT(*) FROM splitgraph_meta.object_locations",
            return_shape=ResultShape.ONE_ONE,
        )
        == 4
    )