def test_splitfile_end_to_end_with_uploading(
    local_engine_empty, remote_engine, pg_repo_remote_multitag, mg_repo_remote, clean_minio
):
    """End-to-end test of pushing a derived dataset with external (S3) object storage.

    Steps exercised:
      * Create a derived dataset from some tables imported from the remote engine.
      * Push it back to the remote engine, uploading all objects to S3
        (instead of storing them on the remote engine itself).
      * Delete everything from the local cache.
      * Run another splitfile that depends on the just-pushed dataset (and does
        lazy checkouts to get the required tables).

    NOTE(review): the fixture arguments (engines, remote repos, MinIO) are
    provided by the project's conftest — their setup/teardown semantics are
    assumed, not visible here.
    """
    # Do the setting up first and run the splitfile against the remote data.
    execute_commands(
        load_splitfile("import_remote_multiple.splitfile"), params={"TAG": "v1"}, output=OUTPUT
    )
    remote_output = Repository(OUTPUT.namespace, OUTPUT.repository, remote_engine)

    # Push with upload: handler="S3" sends the actual objects to MinIO/S3
    # rather than to the remote engine.
    OUTPUT.push(remote_repository=remote_output, handler="S3", handler_options={})

    # Unmount everything locally and clean up cached objects so the second
    # splitfile run below is forced to fetch from the remote/S3.
    for mountpoint, _ in get_current_repositories(local_engine_empty):
        mountpoint.delete()
    OUTPUT.objects.cleanup()

    stage_2 = R("output_stage_2")
    execute_commands(load_splitfile("import_from_preuploaded_remote.splitfile"), output=stage_2)

    # The derived table must be reconstructible purely from the pushed data.
    assert stage_2.run_sql("SELECT id, name, fruit, vegetable FROM diet") == [
        (2, "James", "orange", "carrot")
    ]
def test_import_updating_splitfile_with_uploading(local_engine_empty, remote_engine, pg_repo_remote):
    """Test that a push with S3 upload followed by a lazy clone restores the dataset.

    Runs a splitfile that imports and then updates tables (producing base
    objects plus delta/update objects), pushes everything with the S3 handler,
    wipes local state, then clones back without downloading objects and checks
    that a checkout still materializes the expected data.
    """
    execute_commands(load_splitfile("import_and_update.splitfile"), output=OUTPUT)
    head = OUTPUT.head

    # Two original tables + two updates
    assert len(OUTPUT.objects.get_all_objects()) == 4

    # Push with upload. Have to specify the remote repo explicitly.
    remote_output = Repository(OUTPUT.namespace, OUTPUT.repository, remote_engine)
    OUTPUT.push(remote_output, handler="S3", handler_options={})

    # Unmount everything locally and clean up.
    OUTPUT.delete()
    # OUTPUT doesn't exist any more, but we use its ObjectManager reference to
    # access the global object manager for the engine (maybe should inject it
    # into local_engine/remote_engine instead).
    OUTPUT.objects.cleanup()
    assert not OUTPUT.objects.get_all_objects()

    # download_all=False: fetch only metadata, leaving objects on the remote/S3.
    clone(OUTPUT.to_schema(), download_all=False)

    # Metadata is back but no object payloads have been downloaded yet.
    assert not OUTPUT.objects.get_downloaded_objects()
    existing_objects = list(OUTPUT.objects.get_all_objects())
    # Two original tables + two updates
    assert len(existing_objects) == 4
    # Only 2 objects are stored externally (the other two have been on the
    # remote the whole time).
    assert len(OUTPUT.objects.get_external_object_locations(existing_objects)) == 2

    # Checkout triggers the lazy download of the required objects.
    head.checkout()
    assert OUTPUT.run_sql("SELECT fruit_id, name FROM my_fruits") == [
        (1, "apple"),
        (2, "orange"),
        (3, "mayonnaise"),
    ]