def test_push_own_delete_own(local_engine_empty, unprivileged_pg_repo):
    """Push a local commit to our own remote namespace, then delete an image and the repo."""
    # Clone the remote repository into a same-named repository on the local engine
    # and commit a change on top of its latest image.
    destination = Repository.from_template(unprivileged_pg_repo, engine=local_engine_empty)
    clone(unprivileged_pg_repo, local_repository=destination)

    latest_local = destination.images["latest"]
    latest_local.checkout()
    destination.run_sql("UPDATE fruits SET name = 'banana' WHERE fruit_id = 1")
    destination.commit()

    # Test we can push to our namespace -- can't upload the object to the
    # splitgraph_meta since we can't create tables there, hence the S3 handler.
    remote_engine = unprivileged_pg_repo.engine
    remote_destination = Repository.from_template(
        destination,
        namespace=remote_engine.conn_params["SG_NAMESPACE"],
        engine=remote_engine,
    )
    destination.upstream = remote_destination
    destination.push(handler="S3")

    # Test we can delete a single image from our own repo
    assert len(remote_destination.images()) == 3
    newest_hash = destination.images["latest"].image_hash
    remote_destination.images.delete([newest_hash])
    assert len(remote_destination.images()) == 2

    # Test we can delete our own repo once we've pushed it
    remote_destination.delete()
    assert len(remote_destination.images()) == 0
def test_pull_download_error(local_engine_empty, unprivileged_pg_repo, clean_minio, interrupted):
    """If a download is aborted or fails during a pull, we can recover and retry the pull."""

    def cache_status_rows():
        # Number of rows currently in the object cache status table.
        return PG_MNT.run_sql(
            "SELECT COUNT(*) FROM splitgraph_meta.object_cache_status",
            return_shape=ResultShape.ONE_ONE,
        )

    # First attempt: the S3 handler is swapped for a flaky one, so the clone must raise.
    flaky_handlers = {"S3": _flaky_handler(interrupted)}
    with patch.dict(
        "splitgraph.hooks.external_objects._EXTERNAL_OBJECT_HANDLERS", flaky_handlers
    ), pytest.raises(Exception):
        clone(unprivileged_pg_repo, local_repository=PG_MNT, download_all=True)

    # Check that the pull succeeded (repository registered locally) but the objects
    # are just marked as external, not downloaded
    assert repository_exists(PG_MNT)
    assert len(PG_MNT.objects.get_all_objects()) == 2
    assert len(PG_MNT.objects.get_downloaded_objects()) == 1
    external_locations = PG_MNT.objects.get_external_object_locations(
        PG_MNT.objects.get_all_objects()
    )
    assert len(external_locations) == 2
    assert cache_status_rows() == 1

    # Second attempt with the real handler restored: everything gets downloaded.
    clone(unprivileged_pg_repo, local_repository=PG_MNT, download_all=True)
    assert len(PG_MNT.objects.get_all_objects()) == 2
    assert len(PG_MNT.objects.get_downloaded_objects()) == 2
    assert len(list(PG_MNT.images)) == 2
    assert cache_status_rows() == 2
def test_pull(local_engine_empty, pg_repo_remote, download_all):
    """Clone a remote repository, add an image upstream and check pull() brings it in."""
    original_rows = [(1, "apple"), (2, "orange")]

    # Pull the schema from the remote.
    # Here, it's the pg on local_engine that connects to the remote engine, so we can use
    # the actual hostname (as opposed to the one exposed to us). However, the clone
    # procedure also uses that connection string to talk to the remote. Hence, there's
    # an /etc/hosts indirection on the host mapping the remote engine to localhost.
    clone(pg_repo_remote, local_repository=PG_MNT, download_all=download_all)
    remote_head_hash = pg_repo_remote.head.image_hash
    PG_MNT.images.by_hash(remote_head_hash).checkout()

    head_1 = _add_image_to_repo(pg_repo_remote)

    # The local checkout is unaffected by the new upstream image...
    assert PG_MNT.run_sql("SELECT * FROM fruits") == original_rows

    # ...and the new image isn't known locally until we pull.
    with pytest.raises(ImageNotFoundError):
        PG_MNT.images.by_hash(head_1.image_hash)

    PG_MNT.pull()
    head_1 = PG_MNT.images.by_hash(head_1.image_hash)

    # Check out the newly-pulled commit and verify it has the same data.
    head_1.checkout()
    assert PG_MNT.run_sql("SELECT * FROM fruits") == [
        (1, "apple"),
        (2, "orange"),
        (3, "mayonnaise"),
    ]
    assert PG_MNT.head == head_1
def clone_c(remote_repository_or_image, local_repository, remote, download_all, overwrite_object_meta, tags):
    """
    Clone a remote Splitgraph repository/image into a local one.

    The lookup path for the repository is governed by the ``SG_REPO_LOOKUP`` and ``SG_REPO_LOOKUP_OVERRIDE``
    config parameters and can be overridden by the command line ``--remote`` option.
    """
    # Imports are kept function-local, as in the original.
    from splitgraph.core.repository import Repository, clone
    from splitgraph.engine import get_engine

    remote_repository, image = remote_repository_or_image

    # With an explicit remote we can pin the engine on the repository spec directly;
    # without one, hand clone() the schema string and let it look up the engine
    # the repository lives on.
    source = (
        Repository.from_template(remote_repository, engine=get_engine(remote))
        if remote
        else remote_repository.to_schema()
    )

    clone(
        source,
        local_repository=local_repository,
        download_all=download_all,
        single_image=image,
        overwrite_objects=overwrite_object_meta,
        overwrite_tags=tags,
    )
def test_s3_presigned_url(local_engine_empty, unprivileged_pg_repo, clean_minio):
    """Test the URL signing stored procedure works on the remote machine."""

    def check_url_shape(url_rows):
        # One entry per requested object, each entry having three elements.
        assert len(url_rows) == 1
        assert len(url_rows[0]) == 3

    clone(unprivileged_pg_repo, local_repository=PG_MNT, download_all=False)
    PG_MNT.images["latest"].checkout()
    PG_MNT.run_sql("INSERT INTO fruits VALUES (3, 'mayonnaise')")
    new_image = PG_MNT.commit()
    object_id = new_image.get_table("fruits").objects[0]

    # Do a test calling the signer locally (the tests currently have access
    # to the S3 credentials on the host they're running on)
    check_url_shape(get_object_upload_urls("%s:%s" % (S3_HOST, S3_PORT), [object_id]))
    check_url_shape(get_object_download_urls("%s:%s" % (S3_HOST, S3_PORT), [object_id]))

    # Same request, but through the stored procedure on the remote engine.
    remote_urls = unprivileged_pg_repo.engine.run_sql(
        "SELECT * FROM splitgraph_api.get_object_upload_urls(%s, %s)",
        ("%s:%s" % (S3_HOST, S3_PORT), [object_id]),
        return_shape=ResultShape.ONE_ONE,
    )
    check_url_shape(remote_urls)
def _get_local_image_for_import(hash_or_tag: str, repository: Repository) -> Tuple[Image, bool]:
    """
    Resolve a repository and a hash/tag into an Image that is available on the engine,
    pulling the repository or cloning it into a temporary repository if needed.

    :param hash_or_tag: Hash/tag
    :param repository: Name of the repository (doesn't need to be local)
    :return: Image object and a boolean flag showing whether the repository should be
        deleted when the image is no longer needed.
    """
    tmp_repo = Repository(repository.namespace, repository.repository + "_tmp_clone")

    logging.info("Resolving repository %s", repository)
    source_repo = lookup_repository(repository.to_schema(), include_local=True)

    if source_repo.engine.name == "LOCAL":
        # For local repositories, first try to pull them to see if they are clones of a remote.
        if source_repo.upstream:
            source_repo.pull()
        return source_repo.images[hash_or_tag], False

    # Not local: clone (without downloading objects) into the temporary repository
    # and tell the caller it is temporary.
    clone(source_repo, local_repository=tmp_repo, download_all=False)
    return tmp_repo.images[hash_or_tag], True
def _setup_object_cache_test(pg_repo_remote, longer_chain=False):
    """
    Prepare a local clone whose objects all live in external (S3) storage.

    :param pg_repo_remote: Remote repository to clone from and push back to.
    :param longer_chain: If True, commit one extra row on top so that one
        more object is produced.
    :return: The local repository, freshly re-cloned with ``download_all=False``.
    """
    pg_repo_local = clone(pg_repo_remote)
    pg_repo_local.images["latest"].checkout()
    prepare_lq_repo(pg_repo_local, commit_after_every=False, include_pk=True)
    if longer_chain:
        pg_repo_local.run_sql("INSERT INTO FRUITS VALUES (4, 'kumquat')")
        pg_repo_local.commit()

    # Same setup as the LQ test in the beginning: we clone a repo from upstream, don't download anything, all
    # objects are on Minio.
    remote = pg_repo_local.push(handler="S3", handler_options={})
    pg_repo_local.delete()
    pg_repo_remote.objects.delete_objects(remote.objects.get_downloaded_objects())
    pg_repo_remote.commit_engines()
    pg_repo_local.objects.cleanup()
    pg_repo_local = clone(pg_repo_remote, download_all=False)

    # 6 objects in the tree (original fragment, new base fragment and a patch on top of that fragment
    # for both tables), plus one more for the extra commit in the longer chain.
    # The expected count is computed up front: written inline,
    # `len(x) == 6 if not longer_chain else 7` parses as `(len(x) == 6) if ... else 7`,
    # which asserts the always-true constant 7 when longer_chain is set.
    expected_object_count = 7 if longer_chain else 6

    assert len(pg_repo_local.objects.get_all_objects()) == expected_object_count
    assert len(pg_repo_local.objects.get_downloaded_objects()) == 0
    assert len(remote.objects.get_all_objects()) == expected_object_count
    assert len(remote.objects.get_downloaded_objects()) == 0

    # Nothing has yet been downloaded (cache entries only for externally downloaded things)
    assert len(pg_repo_local.engine.run_sql("SELECT * FROM splitgraph_meta.object_cache_status")) == 0

    return pg_repo_local
def test_push(local_engine_empty, pg_repo_remote):
    """Push new images to the remote, including a forced re-upload of an overwritten object."""

    def fruit_ids_in_object(repo, object_id):
        # Read fruit_id straight out of the object's table in the metadata schema,
        # in whatever order the engine returns the rows.
        return repo.run_sql(
            SQL("SELECT fruit_id FROM {}.{}").format(
                Identifier(SPLITGRAPH_META_SCHEMA), Identifier(object_id)
            ),
            return_shape=ResultShape.MANY_ONE,
        )

    # Clone from the remote engine like in the previous test.
    clone(pg_repo_remote, local_repository=PG_MNT)
    remote_head = pg_repo_remote.head
    PG_MNT.images.by_hash(remote_head.image_hash).checkout()

    # Then, change our copy and commit.
    PG_MNT.run_sql("INSERT INTO fruits VALUES (3, 'mayonnaise')")
    head_1 = PG_MNT.commit()

    # Now, push to remote.
    PG_MNT.push(remote_repository=pg_repo_remote)

    # See if the original mountpoint got updated.
    assert len(pg_repo_remote.objects.get_all_objects()) == 3
    pg_repo_remote.images.by_hash(head_1.image_hash).checkout()
    expected_rows = [(1, "apple"), (2, "orange"), (3, "mayonnaise")]
    assert pg_repo_remote.run_sql("SELECT * FROM fruits") == expected_rows

    # Recommit the local image as a full snap and push it out.
    head_2 = PG_MNT.commit(snap_only=True)
    PG_MNT.push(remote_repository=pg_repo_remote)
    remote_objects = pg_repo_remote.objects.get_all_objects()
    assert head_2.get_table("fruits").objects[0] in remote_objects

    # Recommit it again, changing the sort order: same object IDs, different row order.
    head_3 = PG_MNT.commit(
        snap_only=True,
        in_fragment_order={"fruits": ["name"]},
        overwrite=True,
    )
    assert head_3.get_table("fruits").objects == head_2.get_table("fruits").objects
    assert fruit_ids_in_object(PG_MNT, head_2.get_table("fruits").objects[0]) == [1, 3, 2]

    # Force push overwriting object meta and the actual object
    PG_MNT.push(
        remote_repository=pg_repo_remote,
        single_image=head_3.image_hash,
        overwrite_objects=True,
        reupload_objects=True,
    )
    assert fruit_ids_in_object(pg_repo_remote, head_2.get_table("fruits").objects[0]) == [1, 3, 2]
def readonly_pg_repo(unprivileged_remote_engine, pg_repo_remote_registry):
    """Yield a copy of the registry repository under READONLY_NAMESPACE, bound to the unprivileged engine."""
    readonly_copy = Repository.from_template(pg_repo_remote_registry, namespace=READONLY_NAMESPACE)
    clone(pg_repo_remote_registry, readonly_copy)

    # Drop the original repository and reassign its objects' namespace to the read-only one.
    pg_repo_remote_registry.delete(uncheckout=False)
    registry_engine = pg_repo_remote_registry.engine
    registry_engine.run_sql(
        "UPDATE splitgraph_meta.objects SET namespace=%s WHERE namespace=%s",
        (READONLY_NAMESPACE, REMOTE_NAMESPACE),
    )
    registry_engine.commit()

    yield Repository.from_template(readonly_copy, engine=unprivileged_remote_engine)
def test_pull_single_image(local_engine_empty, pg_repo_remote, download_all):
    """Clone/pull one image at a time and check only that image, its tags and objects arrive."""
    head = pg_repo_remote.head
    head_1 = _add_image_to_repo(pg_repo_remote)
    head.tag("tag_1")
    head_1.tag("tag_2")
    pg_repo_remote.commit_engines()

    def clone_head_only():
        # Clone just `head`, addressed by a 12-character hash prefix.
        clone(
            pg_repo_remote,
            local_repository=PG_MNT,
            download_all=download_all,
            single_image=head.image_hash[:12],
        )

    # Clone a single image first
    assert len(PG_MNT.images()) == 0
    assert len(PG_MNT.objects.get_downloaded_objects()) == 0
    assert len(pg_repo_remote.images()) == 3
    clone_head_only()

    # Check only one image got downloaded and check we didn't try
    # to pull tags for images that we weren't pulling.
    assert len(PG_MNT.images()) == 1
    assert PG_MNT.images()[0] == head
    assert PG_MNT.images["tag_1"] == head
    assert PG_MNT.images.by_tag("tag_2", raise_on_none=False) is None

    # Try doing the same thing again
    clone_head_only()
    assert len(PG_MNT.images()) == 1

    # If we're downloading objects too, check only the original objects got downloaded
    if download_all:
        assert len(PG_MNT.objects.get_downloaded_objects()) == 2

    # Pull the second image on its own
    PG_MNT.pull(single_image=head_1.image_hash, download_all=download_all)
    assert len(PG_MNT.images()) == 2
    if download_all:
        assert len(PG_MNT.objects.get_downloaded_objects()) == 3
    assert PG_MNT.images["tag_2"] == head_1

    # Pull the whole repo
    PG_MNT.pull()
    assert len(PG_MNT.images()) == 3
def _execute_from(node: Node, output: Repository) -> Tuple[Repository, Optional[ProvenanceLine]]:
    """
    Execute a FROM command: optionally switch the output repository (AS ...) and,
    if a source image is given, clone it into the output and check it out.

    :param node: Parsed command node; its "repo_source" and "repository" children are used.
    :param output: Current output repository.
    :return: The output repository (replaced if AS was used) and a provenance record,
        or None as the provenance when no source repository was given.
    :raises SplitfileError: If neither a source repository nor an AS repository is present.
    """
    nodes = extract_nodes(node, ["repo_source", "repository"])
    repo_source = get_first_or_none(nodes, "repo_source")
    output_node = get_first_or_none(nodes, "repository")

    if output_node:
        # AS (output) detected, change the current output repository to it.
        output = Repository.from_schema(output_node.match.group(0))
        logging.info("Changed output repository to %s" % str(output))

        # NB this destroys all data in the case where we ran some commands in the Splitfile and then
        # did FROM (...) without AS repository
        if repository_exists(output):
            logging.info("Clearing all output from %s" % str(output))
            output.delete()

    if not repository_exists(output):
        output.init()

    if not repo_source:
        # FROM EMPTY AS repository -- initializes an empty repository (say to create a table or import
        # the results of a previous stage in a multistage build).
        # In this case, if AS repository has been specified, it's already been initialized. If not,
        # this command literally does nothing.
        if not output_node:
            raise SplitfileError("FROM EMPTY without AS (repository) does nothing!")
        return output, None

    repository, tag_or_hash = parse_image_spec(repo_source)
    source_repo = lookup_repository(repository.to_schema(), include_local=True)

    # For local repositories, make sure to update them if they've an upstream
    if source_repo.engine.name == "LOCAL" and source_repo.upstream:
        source_repo.pull()

    # Get the target image hash from the source repo: otherwise, if the tag is, say, 'latest' and
    # the output has just had the base commit (000...) created in it, that commit will be the latest.
    clone(source_repo, local_repository=output, download_all=False)
    source_hash = source_repo.images[tag_or_hash].image_hash
    output.images.by_hash(source_hash).checkout()

    provenance: Optional[ProvenanceLine] = {
        "type": "FROM",
        "source_namespace": source_repo.namespace,
        "source": source_repo.repository,
        "source_hash": source_hash,
    }
    return output, provenance
def test_lq_qual_filtering(local_engine_empty, unprivileged_pg_repo, clean_minio, test_case):
    """Test that LQ prunes the object list based on quals.

    We can't really see that directly, so we check to see which objects it tries to download.
    """
    query, expected, object_mask = test_case

    _prepare_fully_remote_repo(local_engine_empty, unprivileged_pg_repo)
    pg_repo_local = clone(unprivileged_pg_repo, download_all=False)
    pg_repo_local.images["latest"].checkout(layered=True)
    assert len(pg_repo_local.objects.get_downloaded_objects()) == 0

    fruit_objects = pg_repo_local.head.get_table("fruits").objects
    assert len(fruit_objects) == 5
    assert fruit_objects == [
        # Initial fragment
        "of22f20503d3bf17c7449b545d68ebcee887ed70089f0342c4bff38862c0dc5",
        # INS (3, mayonnaise)
        "of0fb43e477311f82aa30055be303ff00599dfe155d737def0d00f06e07228b",
        # DEL (1, apple)
        "o23fe42d48d7545596d0fea1c48bcf7d64bde574d437c77cc5bb611e5f8849d",
        # UPS (2, guitar), replaces (2, orange)
        "o3f81f6c40ecc3366d691a2ce45f41f6f180053020607cbd0873baf0c4447dc",
        # INS (4, kumquat)
        "oc27ee277aff108525a2df043d9efdaa1c3e26a4949a6cf6b53ee0c889c8559",
    ]

    # The mask marks which of the table's objects this query is expected to touch.
    expected_objects = {obj for obj, keep in zip(fruit_objects, object_mask) if keep}

    assert pg_repo_local.run_sql(query) == expected
    downloaded = pg_repo_local.objects.get_downloaded_objects()
    assert expected_objects == set(downloaded)
def test_push_own_delete_own_different_namespaces(local_engine_empty, readonly_pg_repo):
    """Clone the read-only repo, push to our own namespace and check object namespaces.

    Same as the previous test, but the objects we push must get their namespaces
    rewritten to be the unprivileged user, not test.
    """
    remote_engine = readonly_pg_repo.engine
    own_namespace = remote_engine.conn_params["SG_NAMESPACE"]

    destination = clone(readonly_pg_repo)
    destination.images["latest"].checkout()
    destination.run_sql("UPDATE fruits SET name = 'banana' WHERE fruit_id = 1")
    destination.commit()

    remote_destination = Repository.from_template(
        readonly_pg_repo,
        namespace=own_namespace,
        engine=remote_engine,
    )
    destination.upstream = remote_destination
    destination.push(handler="S3")

    # The last object of the new image's fruits table must be registered under our namespace.
    object_id = destination.head.get_table("fruits").objects[-1]
    object_meta = remote_destination.objects.get_object_meta([object_id])[object_id]
    assert object_meta.namespace == readonly_pg_repo.engine.conn_params["SG_NAMESPACE"]

    # Test we can delete our own repo once we've pushed it
    remote_destination.delete(uncheckout=False)
    assert len(remote_destination.images()) == 0
def test_lq_remote(local_engine_empty, pg_repo_remote):
    """Test layered querying works on a cloned repo that has no cached objects.

    All objects are on the remote when the lazy checkout check starts.
    """
    # 1 patch on top of fruits, 1 patch on top of vegetables
    prepare_lq_repo(pg_repo_remote, commit_after_every=False, include_pk=True)

    local_clone = clone(pg_repo_remote, download_all=False)
    _test_lazy_lq_checkout(local_clone)
def test_push_others(local_engine_empty, readonly_pg_repo):
    """Pushing to the read-only repository must be rejected with an access error."""
    local_copy = clone(readonly_pg_repo)
    local_copy.images["latest"].checkout()
    local_copy.run_sql("UPDATE fruits SET name = 'banana' WHERE fruit_id = 1")
    local_copy.commit()

    with pytest.raises(ProgrammingError) as exc_info:
        local_copy.push(remote_repository=readonly_pg_repo, handler="S3")

    # Checked after the block exits: statements following the raising call
    # inside the `with` body would never run.
    error_text = str(exc_info.value)
    assert "You do not have access to this namespace!" in error_text
def _prepare_fully_remote_repo(local_engine_empty, pg_repo_remote_registry):
    """Build the LQ test repository, push it out via S3 and remove every local trace of it."""
    # Setup: same as external, with an extra patch on top of the fruits table.
    working_copy = clone(pg_repo_remote_registry)
    working_copy.images["latest"].checkout()
    prepare_lq_repo(working_copy, commit_after_every=True, include_pk=True)

    working_copy.run_sql("INSERT INTO fruits VALUES (4, 'kumquat')")
    working_copy.commit()

    working_copy.push(handler="S3", handler_options={})

    # Remove the local repository and clean up its objects.
    working_copy.delete()
    working_copy.objects.cleanup()
    working_copy.commit_engines()
def test_pulls_with_lazy_object_downloads(local_engine_empty, pg_repo_remote):
    """Objects are only downloaded when an image that needs them is checked out."""

    def downloaded():
        return PG_MNT.objects.get_downloaded_objects()

    clone(pg_repo_remote, local_repository=PG_MNT, download_all=False)
    # Make sure we haven't downloaded anything until checkout
    assert not downloaded()

    remote_head = pg_repo_remote.head
    PG_MNT.images.by_hash(remote_head.image_hash).checkout()
    # Original fruits and vegetables tables.
    assert len(downloaded()) == 2
    assert sorted(downloaded()) == sorted(PG_MNT.objects.get_all_objects())

    # In the meantime, make two branches off of origin (a total of 3 commits)
    pg_repo_remote.run_sql("INSERT INTO fruits VALUES (3, 'mayonnaise')")
    left = pg_repo_remote.commit()

    remote_head.checkout()
    pg_repo_remote.run_sql("INSERT INTO fruits VALUES (3, 'mustard')")
    right = pg_repo_remote.commit()

    # Pull from upstream.
    PG_MNT.pull(download_all=False)
    # Make sure we have the pointers to the three versions of the fruits table + the original vegetables
    assert len(PG_MNT.objects.get_all_objects()) == 4
    # Also make sure still only have the objects with the original fruits + vegetables tables
    assert len(downloaded()) == 2

    # Check out left commit: since it only depends on the root, we should download just the
    # new version of fruits.
    PG_MNT.images.by_hash(left.image_hash).checkout()
    # now have 2 versions of fruits + 1 vegetables
    assert len(downloaded()) == 3

    PG_MNT.images.by_hash(right.image_hash).checkout()
    # now have all 3 versions of fruits + 1 vegetables
    assert len(downloaded()) == 4
    assert sorted(downloaded()) == sorted(PG_MNT.objects.get_all_objects())
def test_lq_external(local_engine_empty, unprivileged_pg_repo, pg_repo_remote_registry, clean_minio):
    """Test layered querying works on a cloned repo that has no cached objects.

    Here all objects are on S3 or another external location.
    """
    working_copy = clone(unprivileged_pg_repo)
    working_copy.images["latest"].checkout()
    prepare_lq_repo(working_copy, commit_after_every=False, include_pk=True)

    # Setup: upstream has the same repository as in the previous test but with no cached
    # objects (all are external). In addition, we check that LQ works against an
    # unprivileged upstream (where we don't actually have admin access).
    working_copy.push(unprivileged_pg_repo, handler="S3", handler_options={})
    working_copy.delete()
    working_copy.objects.cleanup()

    # Nothing is left locally, while the registry has the objects registered.
    assert len(working_copy.objects.get_all_objects()) == 0
    assert len(working_copy.objects.get_downloaded_objects()) == 0
    assert len(pg_repo_remote_registry.objects.get_all_objects()) == 6

    # Proceed as per the previous test
    lazy_clone = clone(unprivileged_pg_repo, download_all=False)
    _test_lazy_lq_checkout(lazy_clone)
def test_import_updating_splitfile_with_uploading(local_engine_empty, remote_engine, pg_repo_remote):
    """Run a Splitfile, push the output with S3 upload, wipe it locally and restore it by cloning."""
    splitfile = load_splitfile("import_and_update.splitfile")
    execute_commands(splitfile, output=OUTPUT)
    head = OUTPUT.head

    # Two original tables + two updates
    assert len(OUTPUT.objects.get_all_objects()) == 4

    # Push with upload. Have to specify the remote repo.
    remote_output = Repository(OUTPUT.namespace, OUTPUT.repository, remote_engine)
    OUTPUT.push(remote_output, handler="S3", handler_options={})

    # Unmount everything locally and cleanup
    OUTPUT.delete()

    # OUTPUT doesn't exist but we use its ObjectManager reference to access the global object
    # manager for the engine (maybe should inject it into local_engine/remote_engine instead)
    OUTPUT.objects.cleanup()
    assert not OUTPUT.objects.get_all_objects()

    clone(OUTPUT.to_schema(), download_all=False)
    assert not OUTPUT.objects.get_downloaded_objects()

    known_objects = list(OUTPUT.objects.get_all_objects())
    # Two original tables + two updates
    assert len(known_objects) == 4

    # Only 2 objects are stored externally (the other two have been on the remote the whole time)
    external_locations = OUTPUT.objects.get_external_object_locations(known_objects)
    assert len(external_locations) == 2

    head.checkout()
    expected_rows = [(1, "apple"), (2, "orange"), (3, "mayonnaise")]
    assert OUTPUT.run_sql("SELECT fruit_id, name FROM my_fruits") == expected_rows
def test_lq_single_non_snap_object(local_engine_empty, unprivileged_pg_repo, clean_minio):
    """An LQ served by a single patch object must not leak the upserted/deleted flag column.

    The object produced by
    "DELETE FROM vegetables WHERE vegetable_id = 1;INSERT INTO vegetables VALUES (3, 'celery')"
    has a deletion and an insertion; a query that only uses that object should
    return plain table rows.
    """
    _prepare_fully_remote_repo(local_engine_empty, unprivileged_pg_repo)

    lazy_clone = clone(unprivileged_pg_repo, download_all=False)
    lazy_clone.images["latest"].checkout(layered=True)

    rows = lazy_clone.run_sql(
        "SELECT * FROM vegetables WHERE vegetable_id = 3 AND name = 'celery'"
    )
    assert rows == [(3, "celery")]

    # Only one object should have been downloaded to answer the query.
    used_objects = lazy_clone.objects.get_downloaded_objects()
    assert len(used_objects) == 1
def init_repo(self, repo_info: RepoInfo) -> Repository:
    """
    Make sure the repository described by ``repo_info`` exists locally and, when a
    remote name is given, clone that remote into it (without downloading objects,
    overwriting object metadata and tags).

    :param repo_info: Supplies the namespace, repository and remote_name used here.
    :return: The local repository.
    """
    repo = Repository(namespace=repo_info.namespace, repository=repo_info.repository)

    if not repository_exists(repo):
        self.logger.info(f"Creating repo {repo.namespace}/{repo.repository}...")
        repo.init()

    remote_name = repo_info.remote_name
    if remote_name:
        # Same namespace/repository, but bound to the remote engine.
        remote = Repository.from_template(repo, engine=get_engine(remote_name))
        clone(
            remote,
            local_repository=repo,
            download_all=False,
            overwrite_objects=True,
            overwrite_tags=True,
        )

    return repo
def read(self, location: str) -> Result:
    """
    Return a copy of this result whose value is loaded from the configured table.

    The remote repository is cloned locally (downloading objects, overwriting object
    metadata and tags, restricted to the configured tag) and the table is read with
    ``sql_to_df``. If a schema is set, the data is validated against it.

    :param location: Stored on the returned copy as its ``location``.
    :return: A copy of this result with ``location`` and ``value`` set.
    :raises SchemaValidationError: If schema validation reports errors.
    :raises Exception: Any error raised while reading is logged and re-raised.
    """
    new = self.copy()
    new.location = location

    try:
        info = new.repo_info
        repo = Repository(namespace=info.namespace, repository=info.repository)
        remote_engine = get_engine(new.repo_info.remote_name, autocommit=True)
        remote = Repository.from_template(repo, engine=remote_engine)

        cloned_repo = clone(
            remote,
            local_repository=repo,
            download_all=True,
            overwrite_objects=True,
            overwrite_tags=True,
            single_image=new.repo_info.tag,
        )
        data = sql_to_df(
            f"SELECT * FROM {new.repo_info.table}",
            repository=cloned_repo,
            use_lq=self.layer_query,
        )

        # Validation only happens when a schema has been configured.
        validation_errors = self.schema.validate(data) if self.schema is not None else None
        if validation_errors:
            raise SchemaValidationError(validation_errors)

        new.value = data
    except Exception as exc:
        self.logger.exception(
            f"Unexpected error while reading from result handler: {exc!r}"
        )
        raise

    return new
def test_bloom_reindex_push(local_engine_empty, unprivileged_pg_repo, clean_minio):
    """Reindex a table locally with a bloom index and push the new object metadata out."""
    _prepare_fully_remote_repo(local_engine_empty, unprivileged_pg_repo)
    lazy_clone = clone(unprivileged_pg_repo, download_all=False)

    # Do a reindex on the local engine and push the dataset back out.
    fruits = lazy_clone.images["latest"].get_table("fruits")

    # The repo used for LQ tests has 2 objects that overwrite data, so we ignore those.
    bloom_spec = {"bloom": {"name": {"probability": 0.01}}}
    reindexed = fruits.reindex(extra_indexes=bloom_spec, raise_on_patch_objects=False)
    lazy_clone.commit_engines()

    # Push back out overwriting object metadata
    lazy_clone.push(overwrite_objects=True, single_image="latest")

    # Check the index was written to the registry.
    remote_meta = unprivileged_pg_repo.objects.get_object_meta(reindexed)
    assert "bloom" in remote_meta[reindexed[0]].object_index
def test_push_upload_error(local_engine_empty, unprivileged_pg_repo, pg_repo_remote_registry, clean_minio, interrupted):
    """A failed upload fails the whole push, and a later push succeeds."""

    def push_via_s3():
        # A fresh handler_options dict is built on every call, as in the original.
        PG_MNT.push(remote_repository=unprivileged_pg_repo, handler="S3", handler_options={})

    def object_location_count(engine):
        return engine.run_sql(
            "SELECT COUNT(*) FROM splitgraph_meta.object_locations",
            return_shape=ResultShape.ONE_ONE,
        )

    registry_engine = pg_repo_remote_registry.engine

    clone(unprivileged_pg_repo, local_repository=PG_MNT, download_all=False)
    PG_MNT.images["latest"].checkout()
    PG_MNT.run_sql("INSERT INTO fruits VALUES (3, 'mayonnaise')")
    PG_MNT.run_sql("INSERT INTO vegetables VALUES (3, 'cucumber')")
    head = PG_MNT.commit()

    # If the upload fails for whatever reason (e.g. Minio is inaccessible or the upload was
    # aborted), the whole push fails rather than leaving the registry in an inconsistent state.
    flaky_handlers = {"S3": _flaky_handler(incomplete=interrupted)}
    with patch.dict(
        "splitgraph.hooks.external_objects._EXTERNAL_OBJECT_HANDLERS", flaky_handlers
    ), pytest.raises(Exception):
        push_via_s3()

    assert head not in unprivileged_pg_repo.images

    # Only the two original tables from the original image upstream
    table_count = registry_engine.run_sql(
        "SELECT COUNT(*) FROM splitgraph_meta.tables", return_shape=ResultShape.ONE_ONE
    )
    assert table_count == 2

    # Registry had 2 objects before the upload -- if we interrupted the upload,
    # we only managed to upload the first object that was registered (even if the image
    # wasn't).
    expected_object_count = 3 if interrupted else 2
    assert len(pg_repo_remote_registry.objects.get_all_objects()) == expected_object_count

    # Two new objects not registered remotely since the upload failed
    assert object_location_count(local_engine_empty) == expected_object_count
    assert object_location_count(registry_engine) == expected_object_count

    # Now do the push normally and check the image exists upstream.
    push_via_s3()

    assert any(i.image_hash == head.image_hash for i in unprivileged_pg_repo.images)
    assert len(pg_repo_remote_registry.objects.get_all_objects()) == 4
    assert object_location_count(local_engine_empty) == 4
    assert object_location_count(registry_engine) == 4
def test_pull_tag_overwriting(local_engine_empty, pg_repo_remote):
    """Existing local tags are only moved by clone/pull when overwrite_tags=True is passed."""
    head = pg_repo_remote.head
    head_1 = _add_image_to_repo(pg_repo_remote)
    head.tag("tag_1")
    head_1.tag("tag_2")
    head_1.tag("tag_3")
    pg_repo_remote.commit_engines()

    def clone_head(**extra):
        # Clone only `head`, addressed by a 12-character hash prefix.
        clone(
            pg_repo_remote,
            local_repository=PG_MNT,
            single_image=head.image_hash[:12],
            **extra,
        )

    def check_tags(tag_1, tag_2, tag_3):
        assert PG_MNT.images["tag_1"] == tag_1
        assert PG_MNT.images["tag_2"] == tag_2
        assert PG_MNT.images["tag_3"] == tag_3

    # Clone a single image
    clone_head()
    assert len(PG_MNT.images()) == 1
    assert PG_MNT.images()[0] == head
    assert PG_MNT.images["tag_1"] == head
    assert PG_MNT.images.by_tag("tag_2", raise_on_none=False) is None

    # Clone again, check nothing has changed.
    clone_head()
    assert len(PG_MNT.images()) == 1
    assert PG_MNT.images["tag_1"] == head
    assert PG_MNT.images.by_tag("tag_2", raise_on_none=False) is None

    # Pull the second image on its own
    PG_MNT.pull(single_image=head_1.image_hash)
    assert len(PG_MNT.images()) == 2
    assert PG_MNT.images["tag_2"] == head_1

    # Now update the tag on the remote
    head.tag("tag_2")
    pg_repo_remote.commit_engines()

    # Clone head again, check tag_2 wasn't overwritten (is still pointing to head_1)
    clone_head()
    check_tags(head, head_1, head_1)

    # Clone head again, this time overwriting the tag
    clone_head(overwrite_tags=True)
    assert len(PG_MNT.images()) == 2
    check_tags(head, head, head_1)

    # Update tag_3 to point to head as well
    head.tag("tag_3")
    pg_repo_remote.commit_engines()

    # Pull repo, check tag_3 hasn't moved.
    PG_MNT.pull()
    check_tags(head, head, head_1)

    # Pull again overwriting all tags, check tags have moved.
    PG_MNT.pull(overwrite_tags=True)
    check_tags(head, head, head)
def test_s3_push_pull(local_engine_empty, unprivileged_pg_repo, pg_repo_remote_registry, clean_minio):
    """Test pushing/pulling when objects are uploaded to remote storage instead of the remote DB."""

    def downloaded_count():
        return len(PG_MNT.objects.get_downloaded_objects())

    # In the beginning, the registry has two objects, all remote
    objects = pg_repo_remote_registry.objects.get_all_objects()
    initial_locations = unprivileged_pg_repo.objects.get_external_object_locations(list(objects))
    assert len(initial_locations) == 2
    assert len(objects) == 2

    clone(unprivileged_pg_repo, local_repository=PG_MNT, download_all=False)

    # Add a couple of commits, this time on the cloned copy: two branches off the same head.
    head = PG_MNT.images["latest"]
    head.checkout()
    PG_MNT.run_sql("INSERT INTO fruits VALUES (3, 'mayonnaise')")
    left = PG_MNT.commit()

    head.checkout()
    PG_MNT.run_sql("INSERT INTO fruits VALUES (3, 'mustard')")
    right = PG_MNT.commit()

    # Push to origin, but this time upload the actual objects instead.
    PG_MNT.push(remote_repository=unprivileged_pg_repo, handler="S3", handler_options={})

    # Check that the actual objects don't exist on the remote but are instead registered with an URL.
    # All the objects on pgcache were registered remotely
    objects = pg_repo_remote_registry.objects.get_all_objects()
    local_objects = PG_MNT.objects.get_all_objects()
    unregistered = [o for o in local_objects if o not in objects]
    assert not unregistered

    # Two new non-local objects in the local engine, both registered as non-local on the remote engine.
    ext_objects_orig = PG_MNT.objects.get_external_object_locations(list(objects))
    ext_objects_pull = unprivileged_pg_repo.objects.get_external_object_locations(list(objects))
    assert len(ext_objects_orig) == 4
    missing_upstream = [e for e in ext_objects_orig if e not in ext_objects_pull]
    assert not missing_upstream

    # Destroy the pulled mountpoint and recreate it again.
    assert downloaded_count() == 4
    PG_MNT.delete()

    # Make sure we don't have any leftover physical objects.
    PG_MNT.objects.cleanup()
    assert downloaded_count() == 0

    clone(unprivileged_pg_repo, local_repository=PG_MNT, download_all=False)

    # Proceed as per the lazy checkout tests to make sure we don't download more than required.
    # Make sure we still haven't downloaded anything.
    assert downloaded_count() == 0

    # Check out left commit: since it only depends on the root, we should download just the
    # new version of fruits.
    left.checkout()
    # now have 2 versions of fruits + 1 vegetables
    assert downloaded_count() == 3

    right.checkout()
    assert downloaded_count() == 4

    # Only now we actually have all the objects materialized.
    assert sorted(PG_MNT.objects.get_downloaded_objects()) == sorted(PG_MNT.objects.get_all_objects())
def test_pull_public(local_engine_empty, readonly_pg_repo):
    """Cloning the read-only repository completes without raising."""
    source = readonly_pg_repo
    clone(source)