def test_disjoint_table_lq_one_singleton(pg_repo_local):
    # Test querying tables that have multiple single chunks that don't overlap each other.
    # Those must be queried directly without being applied to a staging area.
    prepare_lq_repo(pg_repo_local, commit_after_every=True, include_pk=True)
    fruits = pg_repo_local.images["latest"].get_table("fruits")

    # Quick sanity checks/assertions to show which chunks in the table overlap which.
    assert _get_chunk_groups(fruits) == [
        [
            # Group 1: original two rows (PKs 1 and 2)...
            ("of22f20503d3bf17c7449b545d68ebcee887ed70089f0342c4bff38862c0dc5", (1,), (2,)),
            # ...then deletion of 'apple' (PK 1)
            ("o23fe42d48d7545596d0fea1c48bcf7d64bde574d437c77cc5bb611e5f8849d", (1,), (1,)),
            # ...then update PK 2 to 'guitar'
            ("o3f81f6c40ecc3366d691a2ce45f41f6f180053020607cbd0873baf0c4447dc", (2,), (2,)),
        ],
        # Group 2: even though this insertion happened first, it's separated out
        # as it can be applied independently.
        [("of0fb43e477311f82aa30055be303ff00599dfe155d737def0d00f06e07228b", (3,), (3,))],
    ]

    # Run a query that only touches the chunk with pk=3: since we skip over the chunks in the
    # first group, we aren't supposed to call apply_fragments and should query the of0f... chunk
    # directly.
    with mock.patch.object(
        PostgresEngine, "apply_fragments", wraps=pg_repo_local.engine.apply_fragments
    ) as apply_fragments:
        assert list(
            fruits.query(columns=["fruit_id", "name"], quals=[[("fruit_id", "=", "3")]])
        ) == [{"fruit_id": 3, "name": "mayonnaise"}]
        assert apply_fragments.call_count == 0

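# For illustration only: the grouping that _get_chunk_groups asserts above can be thought of as
# clustering fragments whose PK ranges overlap. The sketch below is NOT the actual Splitgraph
# implementation, just a minimal standalone version of the idea (assuming fragments arrive as
# (object_id, min_pk, max_pk) tuples in creation order; the case where a later fragment bridges
# two previously disjoint groups is ignored for brevity).
def _illustrative_chunk_grouping(fragments):
    groups = []
    for object_id, min_pk, max_pk in fragments:
        for group in groups:
            # Ranges overlap if neither lies strictly to one side of the other.
            if any(min_pk <= g_max and max_pk >= g_min for _, g_min, g_max in group):
                group.append((object_id, min_pk, max_pk))
                break
        else:
            # No overlap with any existing group: this fragment can be queried independently.
            groups.append([(object_id, min_pk, max_pk)])
    return groups
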
def _setup_object_cache_test(pg_repo_remote, longer_chain=False):
    pg_repo_local = clone(pg_repo_remote)
    pg_repo_local.images["latest"].checkout()
    prepare_lq_repo(pg_repo_local, commit_after_every=False, include_pk=True)
    if longer_chain:
        pg_repo_local.run_sql("INSERT INTO fruits VALUES (4, 'kumquat')")
        pg_repo_local.commit()

    # Same setup as the LQ test in the beginning: we clone a repo from upstream, don't download
    # anything, all objects are on Minio.
    remote = pg_repo_local.push(handler="S3", handler_options={})
    pg_repo_local.delete()
    pg_repo_remote.objects.delete_objects(remote.objects.get_downloaded_objects())
    pg_repo_remote.commit_engines()
    pg_repo_local.objects.cleanup()
    pg_repo_local = clone(pg_repo_remote, download_all=False)

    # 6 objects in the tree (original fragment, new base fragment and a patch on top of that
    # fragment for both tables); 7 if we added the extra kumquat patch on top of fruits.
    expected_objects = 7 if longer_chain else 6
    assert len(pg_repo_local.objects.get_all_objects()) == expected_objects
    assert len(pg_repo_local.objects.get_downloaded_objects()) == 0
    assert len(remote.objects.get_all_objects()) == expected_objects
    assert len(remote.objects.get_downloaded_objects()) == 0

    # Nothing has yet been downloaded (cache entries only for externally downloaded things)
    assert (
        len(pg_repo_local.engine.run_sql("SELECT * FROM splitgraph_meta.object_cache_status")) == 0
    )

    return pg_repo_local

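# A sketch of how a test might use the helper above (assumption: a lazily cloned repository
# fetches objects on demand when queried through a layered checkout, as in _test_lazy_lq_checkout;
# the function below is illustrative and is not one of the actual object cache tests).
def _example_object_cache_usage(pg_repo_remote):
    pg_repo_local = _setup_object_cache_test(pg_repo_remote)
    pg_repo_local.images["latest"].checkout(layered=True)

    # Querying through the layered schema is expected to download just the objects required to
    # answer the query, registering them in splitgraph_meta.object_cache_status.
    pg_repo_local.run_sql("SELECT * FROM fruits ORDER BY fruit_id")
    assert len(pg_repo_local.objects.get_downloaded_objects()) > 0
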
def test_lq_remote(local_engine_empty, pg_repo_remote):
    # Test layered querying works when we initialize it on a cloned repo that doesn't have any
    # cached objects (all are on the remote).

    # 1 patch on top of fruits, 1 patch on top of vegetables
    prepare_lq_repo(pg_repo_remote, commit_after_every=False, include_pk=True)
    pg_repo_local = clone(pg_repo_remote, download_all=False)
    _test_lazy_lq_checkout(pg_repo_local)

def test_splitfile_inline_sql(readonly_pg_repo, pg_repo_local):
    # Test SQL commands accessing repos directly -- join a remote repo with some local data.
    prepare_lq_repo(pg_repo_local, commit_after_every=False, include_pk=True)
    pg_repo_local.head.tag("v2")

    execute_commands(
        load_splitfile("inline_sql.splitfile"),
        output=OUTPUT,
    )

    new_head = OUTPUT.head
    new_head.checkout()
    assert new_head.get_tables() == ["balanced_diet"]
    assert OUTPUT.run_sql("SELECT * FROM balanced_diet") == [
        (1, "apple", None, "potato"),
        (2, "orange", datetime.datetime(2019, 1, 1, 12, 0), "carrot"),
    ]

    local_repo_head = pg_repo_local.head.image_hash
    other_repo_head = readonly_pg_repo.images["latest"].image_hash

    assert new_head.provenance_data == [
        {
            "sources": [
                {
                    "source": "pg_mount",
                    "source_hash": other_repo_head,
                    "source_namespace": "otheruser",
                },
                {
                    "source": "pg_mount",
                    "source_hash": local_repo_head,
                    "source_namespace": "test",
                },
            ],
            "sql": (
                "CREATE TABLE balanced_diet\n"
                " AS SELECT fruits.fruit_id AS id\n"
                " , fruits.name AS fruit\n"
                " , my_fruits.timestamp AS timestamp\n"
                " , vegetables.name AS vegetable\n"
                " FROM "
                '"otheruser/pg_mount:{0}".fruits '
                "AS fruits\n"
                " INNER JOIN "
                '"otheruser/pg_mount:{0}".vegetables '
                "AS vegetables ON fruits.fruit_id = vegetable_id\n"
                " LEFT JOIN "
                '"test/pg_mount:{1}".fruits '
                "AS my_fruits ON my_fruits.fruit_id = fruits.fruit_id;\n"
                "\n"
                "ALTER TABLE balanced_diet ADD PRIMARY KEY (id)"
            ).format(other_repo_head, local_repo_head),
            "type": "SQL",
        },
    ]

def test_layered_querying_against_single_fragment(pg_repo_local):
    # Test the case where the query is satisfied by a single fragment.
    prepare_lq_repo(pg_repo_local, snap_only=True, commit_after_every=False, include_pk=True)
    new_head = pg_repo_local.head
    new_head.checkout(layered=True)

    assert pg_repo_local.run_sql(
        "SELECT * FROM fruits WHERE name IN ('guitar', 'mayonnaise') ORDER BY fruit_id"
    ) == [(2, "guitar", 1, _DT), (3, "mayonnaise", 1, _DT)]

def test_disjoint_table_lq_two_singletons_one_overwritten(pg_repo_local):
    # Add another two rows to the table with PKs 4 and 5
    prepare_lq_repo(pg_repo_local, commit_after_every=True, include_pk=True)
    pg_repo_local.run_sql("INSERT INTO fruits VALUES (4, 'fruit_4'), (5, 'fruit_5')")
    pg_repo_local.commit()
    pg_repo_local.run_sql("UPDATE fruits SET name = 'fruit_5_updated' WHERE fruit_id = 5")
    fruits = pg_repo_local.commit().get_table("fruits")

    assert _get_chunk_groups(fruits) == [
        [
            ("of22f20503d3bf17c7449b545d68ebcee887ed70089f0342c4bff38862c0dc5", (1,), (2,)),
            ("o23fe42d48d7545596d0fea1c48bcf7d64bde574d437c77cc5bb611e5f8849d", (1,), (1,)),
            ("o3f81f6c40ecc3366d691a2ce45f41f6f180053020607cbd0873baf0c4447dc", (2,), (2,)),
        ],
        [("of0fb43e477311f82aa30055be303ff00599dfe155d737def0d00f06e07228b", (3,), (3,))],
        # The pk=5 update has to be added to the last chunk group, making it a non-singleton
        [
            ("oaa6d009e485bfa91aec4ab6b0ed1ebcd67055f6a3420d29f26446b034f41cc", (4,), (5,)),
            ("o15a420721b04e9749761b5368628cb15593cb8cfdcc547107b98eddda5031d", (5,), (5,)),
        ],
    ]

    with mock.patch.object(
        PostgresEngine, "apply_fragments", wraps=pg_repo_local.engine.apply_fragments
    ) as apply_fragments:
        with mock.patch(
            "splitgraph.core.table._generate_select_query", side_effect=_generate_select_query
        ) as _gsc:
            assert list(
                fruits.query(columns=["fruit_id", "name"], quals=[[("fruit_id", ">=", "3")]])
            ) == [
                {"fruit_id": 3, "name": "mayonnaise"},
                {"fruit_id": 4, "name": "fruit_4"},
                {"fruit_id": 5, "name": "fruit_5_updated"},
            ]

    # This time we had to apply the fragments in the final group (since there were two of them)
    _assert_fragments_applied(_gsc, apply_fragments, pg_repo_local)

def test_disjoint_table_lq_two_singletons_one_overwritten_indirect(pg_repo_local):
    # Now test scanning the dataset with two singletons and one non-singleton group
    # by consuming queries one-by-one.
    prepare_lq_repo(pg_repo_local, commit_after_every=True, include_pk=True)
    pg_repo_local.run_sql("INSERT INTO fruits VALUES (4, 'fruit_4'), (5, 'fruit_5')")
    fruits = pg_repo_local.commit().get_table("fruits")

    tables, callback, _ = fruits.query_indirect(columns=["fruit_id", "name"], quals=None)

    # At this point, we've "claimed" all objects but haven't done anything with them.
    # We're not really testing object claiming here since the objects were created locally
    # (see test_object_cache_deferred in test_object_cache.py for a test of claims/releases).
    assert len(callback) == 1

    # First, we emit the cstore tables that can be queried directly.
    assert (
        next(tables) == b'"splitgraph_meta".'
        b'"of0fb43e477311f82aa30055be303ff00599dfe155d737def0d00f06e07228b"'
    )
    assert len(callback) == 1

    # There's another singleton we can query directly.
    assert (
        next(tables) == b'"splitgraph_meta".'
        b'"oaa6d009e485bfa91aec4ab6b0ed1ebcd67055f6a3420d29f26446b034f41cc"'
    )
    assert len(callback) == 1

    # We have two fragments left to scan, but they overlap each other, so they have to be
    # materialized.
    with mock.patch.object(
        PostgresEngine, "apply_fragments", wraps=pg_repo_local.engine.apply_fragments
    ) as apply_fragments:
        next(tables)

    assert apply_fragments.call_count == 1
    args, _ = apply_fragments.call_args_list[0]
    tmp_table = args[2]

    # Because of this, our callback list now includes deleting the temporary table.
    assert len(callback) == 2

    # We've now exhausted the list of queries...
    with pytest.raises(StopIteration):
        next(tables)

    # ...but haven't called the callback yet.
    assert pg_repo_local.engine.table_exists(SPLITGRAPH_META_SCHEMA, tmp_table)

    # Call the callback now, deleting the temporary table.
    callback()
    assert not pg_repo_local.engine.table_exists(SPLITGRAPH_META_SCHEMA, tmp_table)

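# The flow exercised above, written out as a standalone consumer sketch. This illustrates how an
# FDW-like caller would drive query_indirect for a full scan (quals=None); it is not the actual
# Multicorn integration, and the helper name is hypothetical.
def _illustrative_indirect_scan(engine, table, columns):
    tables, callback, _ = table.query_indirect(columns=columns, quals=None)
    try:
        for cstore_table in tables:
            # Each yielded item is a fully qualified table name (as bytes) that can be SELECTed
            # from directly; overlapping fragments have already been materialized into a
            # temporary table by the time their entry is yielded.
            for row in engine.run_sql(_generate_select_query(engine, cstore_table, columns)):
                yield row
    finally:
        # Release object claims and drop any temporary tables created during materialization.
        callback()
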
def _prepare_fully_remote_repo(local_engine_empty, pg_repo_remote_registry):
    # Setup: same as external, with an extra patch on top of the fruits table.
    pg_repo_local = clone(pg_repo_remote_registry)
    pg_repo_local.images["latest"].checkout()
    prepare_lq_repo(pg_repo_local, commit_after_every=True, include_pk=True)
    pg_repo_local.run_sql("INSERT INTO fruits VALUES (4, 'kumquat')")
    pg_repo_local.commit()
    pg_repo_local.push(handler="S3", handler_options={})
    pg_repo_local.delete()
    pg_repo_local.objects.cleanup()
    pg_repo_local.commit_engines()

def test_layered_querying_type_conversion(pg_repo_local):
    # For the bigint type, Multicorn for some reason converts qual values to strings. Test that
    # we can handle that.
    prepare_lq_repo(pg_repo_local, commit_after_every=False, include_pk=True)
    pg_repo_local.run_sql("ALTER TABLE fruits ALTER COLUMN fruit_id TYPE bigint")
    pg_repo_local.commit()
    pg_repo_local.run_sql("INSERT INTO fruits VALUES (4, 'kumquat', 42, '2018-01-02T03:04:05')")
    new_head = pg_repo_local.commit()
    new_head.checkout(layered=True)

    # Make sure ANY works on integers (not converted to strings)
    assert pg_repo_local.run_sql(
        "SELECT * FROM fruits WHERE fruit_id IN (3, 4) ORDER BY fruit_id"
    ) == [(3, "mayonnaise", 1, _DT), (4, "kumquat", 42, dt(2018, 1, 2, 3, 4, 5))]

def test_disjoint_table_lq_two_singletons(pg_repo_local):
    # Add another two rows to the table with PKs 4 and 5
    prepare_lq_repo(pg_repo_local, commit_after_every=True, include_pk=True)
    pg_repo_local.run_sql("INSERT INTO fruits VALUES (4, 'fruit_4'), (5, 'fruit_5')")
    fruits = pg_repo_local.commit().get_table("fruits")

    # The new fragment lands in a separate group
    assert _get_chunk_groups(fruits) == [
        [
            ("of22f20503d3bf17c7449b545d68ebcee887ed70089f0342c4bff38862c0dc5", (1,), (2,)),
            ("o23fe42d48d7545596d0fea1c48bcf7d64bde574d437c77cc5bb611e5f8849d", (1,), (1,)),
            ("o3f81f6c40ecc3366d691a2ce45f41f6f180053020607cbd0873baf0c4447dc", (2,), (2,)),
        ],
        [("of0fb43e477311f82aa30055be303ff00599dfe155d737def0d00f06e07228b", (3,), (3,))],
        [("oaa6d009e485bfa91aec4ab6b0ed1ebcd67055f6a3420d29f26446b034f41cc", (4,), (5,))],
    ]

    # Query hitting PKs 3, 4 and 5: they hit single chunks that don't depend on anything,
    # so we still shouldn't be applying fragments.
    with mock.patch.object(
        PostgresEngine, "apply_fragments", wraps=pg_repo_local.engine.apply_fragments
    ) as apply_fragments:
        with mock.patch(
            "splitgraph.core.table._generate_select_query", side_effect=_generate_select_query
        ) as _gsc:
            assert list(
                fruits.query(columns=["fruit_id", "name"], quals=[[("fruit_id", ">=", "3")]])
            ) == [
                {"fruit_id": 3, "name": "mayonnaise"},
                {"fruit_id": 4, "name": "fruit_4"},
                {"fruit_id": 5, "name": "fruit_5"},
            ]

    assert apply_fragments.call_count == 0

    # Check that we generated two SELECT queries
    assert _gsc.mock_calls == [
        call(
            mock.ANY,
            b'"splitgraph_meta"."of0fb43e477311f82aa30055be303ff00599dfe155d737def0d00f06e07228b"',
            ["fruit_id", "name"],
            mock.ANY,
            ("3",),
        ),
        call(
            mock.ANY,
            b'"splitgraph_meta"."oaa6d009e485bfa91aec4ab6b0ed1ebcd67055f6a3420d29f26446b034f41cc"',
            ["fruit_id", "name"],
            mock.ANY,
            ("3",),
        ),
    ]

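# For reference, and as an assumption based on how quals are used throughout these tests rather
# than a definitive statement of the API: the `quals` argument is a list of OR-groups that are
# ANDed together, each group being a list of (column, operator, value) tuples. The evaluator
# below is a minimal illustrative version of that shape over a plain row dict; names here are
# hypothetical and this is not the Multicorn/Splitgraph implementation.
_QUAL_OPS = {
    "=": lambda a, b: a == b,
    ">": lambda a, b: a > b,
    ">=": lambda a, b: a >= b,
    "<": lambda a, b: a < b,
    "<=": lambda a, b: a <= b,
}


def _illustrative_qual_match(row, quals):
    if not quals:
        return True
    # Every OR-group must be satisfied by at least one of its (column, op, value) quals.
    # Multicorn can hand us values as strings (see test_layered_querying_type_conversion),
    # so coerce the value to the row value's type before comparing.
    return all(
        any(_QUAL_OPS[op](row[column], type(row[column])(value)) for column, op, value in group)
        for group in quals
    )
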
def test_disjoint_table_lq_indirect(pg_repo_local):
    # Test querying tables indirectly (returning a list of tables to query rather than the
    # query results).
    prepare_lq_repo(pg_repo_local, commit_after_every=True, include_pk=True)
    fruits = pg_repo_local.images["latest"].get_table("fruits")

    result, callback, _ = fruits.query_indirect(
        columns=["fruit_id", "name"], quals=[[("fruit_id", "=", "3")]]
    )

    # Check that we get a list of chunks to query and a callback to release the results
    assert list(result) == [
        b'"splitgraph_meta"."of0fb43e477311f82aa30055be303ff00599dfe155d737def0d00f06e07228b"'
    ]
    assert len(callback) == 1

def test_disjoint_table_lq_temp_table_deletion_doesnt_lock_up(pg_repo_local):
    # When Multicorn reads from the temporary table, it does that in the context of the
    # transaction that it's been called from. It can hence hold a read lock on the temporary
    # table that it won't release until the scan is over -- but the scan won't be over until
    # we've managed to delete the temporary table, leading to a deadlock. Check that deleting
    # the table in a separate thread works.
    prepare_lq_repo(pg_repo_local, commit_after_every=True, include_pk=True)
    pg_repo_local.run_sql("INSERT INTO fruits VALUES (4, 'fruit_4'), (5, 'fruit_5')")
    fruits = pg_repo_local.commit().get_table("fruits")

    tables, callback, plan = fruits.query_indirect(columns=["fruit_id", "name"], quals=None)

    # Force a materialization and get the query that makes us read from the temporary table.
    last_table = list(tables)[-1]
    pg_repo_local.commit_engines()

    # Simulate Multicorn reading from the table by doing it from a different engine.
    conn_params = _prepare_engine_config(CONFIG)
    engine = PostgresEngine(conn_params=conn_params, name="test_engine")
    query = _generate_select_query(engine, last_table, ["fruit_id", "name"])
    engine.run_sql(query)
    logging.info("Acquired read lock")

    # At this point, we're holding a lock on the table and the callback will lock up unless it's
    # run in a separate thread.
    callback(from_fdw=True)

    # Wait for the thread to actually start (a bit gauche but otherwise makes the test flaky(ier))
    time.sleep(1)

    # We can still read from the table (callback spawned a thread which is blocked trying to
    # delete it).
    engine.run_sql(query)

    # Now drop the lock.
    logging.info("Dropping the lock")
    engine.rollback()

    # If this test assertion fails, make sure we're not left holding the lock.
    engine.autocommit = True
    with pytest.raises(ObjectNotFoundError):
        engine.run_sql(query)

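# The behaviour that callback(from_fdw=True) relies on, as a standalone sketch: the DROP has to
# run on its own connection and thread so that the reader holding the lock is never waited on by
# its own transaction. This illustrates the pattern only; it is not the actual callback
# implementation, and the helper name is hypothetical.
def _illustrative_background_drop(conn_params, schema, table):
    import threading

    def _target():
        # A separate engine means a separate Postgres connection: the DROP blocks until the
        # reader releases its lock, while the reader is free to finish its scan in the meantime.
        engine = PostgresEngine(conn_params=conn_params, name="drop_thread")
        engine.autocommit = True
        engine.run_sql('DROP TABLE IF EXISTS "{0}"."{1}"'.format(schema, table))

    threading.Thread(target=_target, daemon=True).start()
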
def test_lq_external(
    local_engine_empty, unprivileged_pg_repo, pg_repo_remote_registry, clean_minio
):
    # Test layered querying works when we initialize it on a cloned repo that doesn't have any
    # cached objects (all are on S3 or another external location).
    pg_repo_local = clone(unprivileged_pg_repo)
    pg_repo_local.images["latest"].checkout()
    prepare_lq_repo(pg_repo_local, commit_after_every=False, include_pk=True)

    # Setup: upstream has the same repository as in the previous test but with no cached objects
    # (all are external). In addition, we check that LQ works against an unprivileged upstream
    # (where we don't actually have admin access).
    pg_repo_local.push(unprivileged_pg_repo, handler="S3", handler_options={})
    pg_repo_local.delete()
    pg_repo_local.objects.cleanup()

    assert len(pg_repo_local.objects.get_all_objects()) == 0
    assert len(pg_repo_local.objects.get_downloaded_objects()) == 0
    assert len(pg_repo_remote_registry.objects.get_all_objects()) == 6

    # Proceed as per the previous test
    pg_repo_local = clone(unprivileged_pg_repo, download_all=False)
    _test_lazy_lq_checkout(pg_repo_local)

def test_disjoint_table_lq_two_singletons_one_overwritten(pg_repo_local):
    # Add another two rows to the table with PKs 4 and 5
    prepare_lq_repo(pg_repo_local, commit_after_every=True, include_pk=True)
    pg_repo_local.run_sql("INSERT INTO fruits VALUES (4, 'fruit_4'), (5, 'fruit_5')")
    pg_repo_local.commit()
    pg_repo_local.run_sql("UPDATE fruits SET name = 'fruit_5_updated' WHERE fruit_id = 5")
    fruits = pg_repo_local.commit().get_table("fruits")

    assert _get_chunk_groups(fruits) == [
        [
            ("of22f20503d3bf17c7449b545d68ebcee887ed70089f0342c4bff38862c0dc5", (1,), (2,)),
            ("o23fe42d48d7545596d0fea1c48bcf7d64bde574d437c77cc5bb611e5f8849d", (1,), (1,)),
            ("o3f81f6c40ecc3366d691a2ce45f41f6f180053020607cbd0873baf0c4447dc", (2,), (2,)),
        ],
        [("of0fb43e477311f82aa30055be303ff00599dfe155d737def0d00f06e07228b", (3,), (3,))],
        # The pk=5 update has to be added to the last chunk group, making it a non-singleton
        [
            ("oaa6d009e485bfa91aec4ab6b0ed1ebcd67055f6a3420d29f26446b034f41cc", (4,), (5,)),
            ("o15a420721b04e9749761b5368628cb15593cb8cfdcc547107b98eddda5031d", (5,), (5,)),
        ],
    ]

    with mock.patch.object(
        PostgresEngine, "apply_fragments", wraps=pg_repo_local.engine.apply_fragments
    ) as apply_fragments:
        with mock.patch(
            "splitgraph.core.table._generate_select_query", side_effect=_generate_select_query
        ) as _gsc:
            assert list(
                fruits.query(columns=["fruit_id", "name"], quals=[[("fruit_id", ">=", "3")]])
            ) == [
                {"fruit_id": 3, "name": "mayonnaise"},
                {"fruit_id": 4, "name": "fruit_4"},
                {"fruit_id": 5, "name": "fruit_5_updated"},
            ]

    # This time we had to apply the fragments in the final group (since there were two of them)
    apply_fragments.assert_called_once_with(
        [
            (
                "splitgraph_meta",
                "oaa6d009e485bfa91aec4ab6b0ed1ebcd67055f6a3420d29f26446b034f41cc",
            ),
            (
                "splitgraph_meta",
                "o15a420721b04e9749761b5368628cb15593cb8cfdcc547107b98eddda5031d",
            ),
        ],
        SPLITGRAPH_META_SCHEMA,
        mock.ANY,
        extra_qual_args=("3",),
        extra_quals=mock.ANY,
        schema_spec=mock.ANY,
    )

    # Two calls to _generate_select_query -- one to directly query the pk=3 chunk...
    assert _gsc.call_args_list == [
        mock.call(
            pg_repo_local.engine,
            b'"splitgraph_meta".'
            b'"of0fb43e477311f82aa30055be303ff00599dfe155d737def0d00f06e07228b"',
            ["fruit_id", "name"],
            mock.ANY,
            ("3",),
        ),
        # ...and one to query the applied fragments in the second group.
        mock.call(pg_repo_local.engine, mock.ANY, ["fruit_id", "name"], mock.ANY, ("3",)),
    ]

    # Check the temporary table has been deleted since we've exhausted the query
    args, _ = apply_fragments.call_args_list[0]
    tmp_table = args[2]
    assert not pg_repo_local.engine.table_exists(SPLITGRAPH_META_SCHEMA, tmp_table)

    # Now query PKs 3 and 4. Even though the chunk containing PKs 4 and 5 was updated
    # (by changing PK 5), the qual filter should drop the update, as it's not pertinent
    # to the query. Hence, we should end up not needing fragment application.
    with mock.patch.object(
        PostgresEngine, "apply_fragments", wraps=pg_repo_local.engine.apply_fragments
    ) as apply_fragments:
        with mock.patch(
            "splitgraph.core.table._generate_select_query", side_effect=_generate_select_query
        ) as _gsc:
            assert list(
                fruits.query(
                    columns=["fruit_id", "name"],
                    quals=[[("fruit_id", "=", "3"), ("fruit_id", "=", "4")]],
                )
            ) == [{"fruit_id": 3, "name": "mayonnaise"}, {"fruit_id": 4, "name": "fruit_4"}]

    # No fragment application
    assert apply_fragments.call_count == 0

    # Two calls to _generate_select_query, directly selecting rows from the two chunks
    assert _gsc.mock_calls == [
        call(
            mock.ANY,
            b'"splitgraph_meta".'
            b'"of0fb43e477311f82aa30055be303ff00599dfe155d737def0d00f06e07228b"',
            ["fruit_id", "name"],
            mock.ANY,
            ("3", "4"),
        ),
        call(
            mock.ANY,
            b'"splitgraph_meta".'
            b'"oaa6d009e485bfa91aec4ab6b0ed1ebcd67055f6a3420d29f26446b034f41cc"',
            ["fruit_id", "name"],
            mock.ANY,
            ("3", "4"),
        ),
    ]

def test_provenance_inline_sql(readonly_pg_repo, pg_repo_local):
    prepare_lq_repo(pg_repo_local, commit_after_every=False, include_pk=True)
    pg_repo_local.head.tag("v2")

    execute_commands(
        load_splitfile("inline_sql.splitfile"),
        output=OUTPUT,
    )
    new_head = OUTPUT.head

    remote_input = readonly_pg_repo.images["latest"]
    local_input = pg_repo_local.images["latest"]

    assert set(new_head.provenance()) == {
        (readonly_pg_repo, remote_input.image_hash),
        (pg_repo_local, local_input.image_hash),
    }

    assert remote_input.provenance(reverse=True, engine=OUTPUT.engine) == [
        (OUTPUT, OUTPUT.head.image_hash)
    ]
    assert local_input.provenance(reverse=True, engine=OUTPUT.engine) == [
        (OUTPUT, OUTPUT.head.image_hash)
    ]

    expected_sql = (
        "SQL {{CREATE TABLE balanced_diet\n"
        " AS SELECT fruits.fruit_id AS id\n"
        " , fruits.name AS fruit\n"
        " , my_fruits.timestamp AS timestamp\n"
        " , vegetables.name AS vegetable\n"
        " FROM "
        '"otheruser/pg_mount:{0}".fruits AS '
        "fruits\n"
        " INNER JOIN "
        '"otheruser/pg_mount:{0}".vegetables '
        "AS vegetables ON fruits.fruit_id = vegetable_id\n"
        " LEFT JOIN "
        '"test/pg_mount:{1}".fruits AS '
        "my_fruits ON my_fruits.fruit_id = fruits.fruit_id;\n"
        "\n"
        "ALTER TABLE balanced_diet ADD PRIMARY KEY (id)}}"
    ).format(remote_input.image_hash, local_input.image_hash)

    assert new_head.to_splitfile() == [expected_sql]

    assert new_head.to_splitfile(
        source_replacement={pg_repo_local: "new_local_tag", readonly_pg_repo: "new_remote_tag"}
    ) == [
        expected_sql.replace(remote_input.image_hash, "new_remote_tag").replace(
            local_input.image_hash, "new_local_tag"
        )
    ]

    assert len(OUTPUT.images()) == 2

    # Try rerunning the Splitfile against the same original data (check caching)
    rebuild_image(
        OUTPUT.head,
        source_replacement={pg_repo_local: "latest", readonly_pg_repo: "latest"},
    )
    assert len(OUTPUT.images()) == 2

    # Change pg_repo_local and rerun the Splitfile against it.
    pg_repo_local.run_sql(
        "UPDATE fruits SET timestamp = '2020-01-01 12:00:00' WHERE fruit_id = 2"
    )
    new_head = pg_repo_local.commit()
    rebuild_image(
        OUTPUT.head,
        source_replacement={pg_repo_local: new_head.image_hash, readonly_pg_repo: "latest"},
    )
    assert len(OUTPUT.images()) == 3
    assert OUTPUT.run_sql("SELECT * FROM balanced_diet") == [
        (1, "apple", None, "potato"),
        (2, "orange", datetime.datetime(2020, 1, 1, 12, 0), "carrot"),
    ]