Пример #1
0
def test_disjoint_table_lq_one_singleton(pg_repo_local):
    # Test querying tables that have multiple single chunks that don't overlap each other.
    # Those must be queried directly without being applied to a staging area.

    prepare_lq_repo(pg_repo_local, commit_after_every=True, include_pk=True)
    fruits = pg_repo_local.images["latest"].get_table("fruits")

    # Quick sanity checks/assertions to show which chunks in the table overlap which.
    assert _get_chunk_groups(fruits) == [
        [
            # Group 1: original two rows (PKs 1 and 2)...
            ("of22f20503d3bf17c7449b545d68ebcee887ed70089f0342c4bff38862c0dc5", (1,), (2,)),
            # ...then deletion of 'apple' (PK 1)
            ("o23fe42d48d7545596d0fea1c48bcf7d64bde574d437c77cc5bb611e5f8849d", (1,), (1,)),
            # ...then update PK 2 to 'guitar'
            ("o3f81f6c40ecc3366d691a2ce45f41f6f180053020607cbd0873baf0c4447dc", (2,), (2,)),
        ],
        # Group 2: even though this insertion happened first, it's separated out
        # as it can be applied independently.
        [("of0fb43e477311f82aa30055be303ff00599dfe155d737def0d00f06e07228b", (3,), (3,))],
    ]

    # Run query that only touches the chunk with pk=3: since we skip over the chunks in the first group,
    # we aren't supposed to call apply_fragments and just query the oa32... chunk directly.
    with mock.patch.object(
        PostgresEngine, "apply_fragments", wraps=pg_repo_local.engine.apply_fragments
    ) as apply_fragments:
        assert list(
            fruits.query(columns=["fruit_id", "name"], quals=[[("fruit_id", "=", "3")]])
        ) == [{"fruit_id": 3, "name": "mayonnaise"}]
        assert apply_fragments.call_count == 0
Пример #2
0
def _setup_object_cache_test(pg_repo_remote, longer_chain=False):
    pg_repo_local = clone(pg_repo_remote)
    pg_repo_local.images["latest"].checkout()
    prepare_lq_repo(pg_repo_local, commit_after_every=False, include_pk=True)
    if longer_chain:
        pg_repo_local.run_sql("INSERT INTO FRUITS VALUES (4, 'kumquat')")
        pg_repo_local.commit()

    # Same setup as the LQ test in the beginning: we clone a repo from upstream, don't download anything, all
    # objects are on Minio.
    remote = pg_repo_local.push(handler="S3", handler_options={})
    pg_repo_local.delete()
    pg_repo_remote.objects.delete_objects(
        remote.objects.get_downloaded_objects())
    pg_repo_remote.commit_engines()
    pg_repo_local.objects.cleanup()
    pg_repo_local = clone(pg_repo_remote, download_all=False)

    # 6 objects in the tree (original fragment, new base fragment and a patch on top of that fragment
    # for both tables)
    assert len(pg_repo_local.objects.get_all_objects()
               ) == 6 if not longer_chain else 7
    assert len(pg_repo_local.objects.get_downloaded_objects()) == 0
    assert len(
        remote.objects.get_all_objects()) == 6 if not longer_chain else 7
    assert len(remote.objects.get_downloaded_objects()) == 0

    # Nothing has yet been downloaded (cache entries only for externally downloaded things)
    assert (len(
        pg_repo_local.engine.run_sql(
            "SELECT * FROM splitgraph_meta.object_cache_status")) == 0)

    return pg_repo_local
Пример #3
0
def test_lq_remote(local_engine_empty, pg_repo_remote):
    # Test layered querying works when we initialize it on a cloned repo that doesn't have any
    # cached objects (all are on the remote).

    # 1 patch on top of fruits, 1 patch on top of vegetables
    prepare_lq_repo(pg_repo_remote, commit_after_every=False, include_pk=True)
    pg_repo_local = clone(pg_repo_remote, download_all=False)
    _test_lazy_lq_checkout(pg_repo_local)
Пример #4
0
def test_splitfile_inline_sql(readonly_pg_repo, pg_repo_local):
    # Test SQL commands accessing repos directly -- join a remote repo with
    # some local data.

    prepare_lq_repo(pg_repo_local, commit_after_every=False, include_pk=True)
    pg_repo_local.head.tag("v2")

    execute_commands(
        load_splitfile("inline_sql.splitfile"),
        output=OUTPUT,
    )

    new_head = OUTPUT.head
    new_head.checkout()
    assert new_head.get_tables() == ["balanced_diet"]
    assert OUTPUT.run_sql("SELECT * FROM balanced_diet") == [
        (1, "apple", None, "potato"),
        (2, "orange", datetime.datetime(2019, 1, 1, 12, 0), "carrot"),
    ]

    local_repo_head = pg_repo_local.head.image_hash
    other_repo_head = readonly_pg_repo.images["latest"].image_hash

    assert new_head.provenance_data == [
        {
            "sources": [
                {
                    "source": "pg_mount",
                    "source_hash": other_repo_head,
                    "source_namespace": "otheruser",
                },
                {
                    "source": "pg_mount",
                    "source_hash": local_repo_head,
                    "source_namespace": "test"
                },
            ],
            "sql": ("CREATE TABLE balanced_diet\n"
                    "  AS SELECT fruits.fruit_id AS id\n"
                    "          , fruits.name AS fruit\n"
                    "          , my_fruits.timestamp AS timestamp\n"
                    "          , vegetables.name AS vegetable\n"
                    "     FROM "
                    '"otheruser/pg_mount:{0}".fruits '
                    "AS fruits\n"
                    "          INNER JOIN "
                    '"otheruser/pg_mount:{0}".vegetables '
                    "AS vegetables ON fruits.fruit_id = vegetable_id\n"
                    "          LEFT JOIN "
                    '"test/pg_mount:{1}".fruits '
                    "AS my_fruits ON my_fruits.fruit_id = fruits.fruit_id;\n"
                    "\n"
                    "ALTER TABLE balanced_diet ADD PRIMARY KEY (id)").format(
                        other_repo_head, local_repo_head),
            "type":
            "SQL",
        },
    ]
Пример #5
0
def test_layered_querying_against_single_fragment(pg_repo_local):
    # Test the case where the query is satisfied by a single fragment.
    prepare_lq_repo(pg_repo_local, snap_only=True, commit_after_every=False, include_pk=True)
    new_head = pg_repo_local.head
    new_head.checkout(layered=True)

    assert pg_repo_local.run_sql(
        "SELECT * FROM fruits WHERE name IN ('guitar', 'mayonnaise') ORDER BY fruit_id"
    ) == [(2, "guitar", 1, _DT), (3, "mayonnaise", 1, _DT)]
Пример #6
0
def test_disjoint_table_lq_two_singletons_one_overwritten(pg_repo_local):
    # Add another two rows to the table with PKs 4 and 5
    prepare_lq_repo(pg_repo_local, commit_after_every=True, include_pk=True)
    pg_repo_local.run_sql(
        "INSERT INTO fruits VALUES (4, 'fruit_4'), (5, 'fruit_5')")
    pg_repo_local.commit()
    pg_repo_local.run_sql(
        "UPDATE fruits SET name = 'fruit_5_updated' WHERE fruit_id = 5")
    fruits = pg_repo_local.commit().get_table("fruits")

    assert _get_chunk_groups(fruits) == [
        [
            ("of22f20503d3bf17c7449b545d68ebcee887ed70089f0342c4bff38862c0dc5",
             (1, ), (2, )),
            ("o23fe42d48d7545596d0fea1c48bcf7d64bde574d437c77cc5bb611e5f8849d",
             (1, ), (1, )),
            ("o3f81f6c40ecc3366d691a2ce45f41f6f180053020607cbd0873baf0c4447dc",
             (2, ), (2, )),
        ],
        [("of0fb43e477311f82aa30055be303ff00599dfe155d737def0d00f06e07228b",
          (3, ), (3, ))],
        # The pk=5 update has to be added to the last chunk group, making it a non-singleton
        [
            ("oaa6d009e485bfa91aec4ab6b0ed1ebcd67055f6a3420d29f26446b034f41cc",
             (4, ), (5, )),
            ("o15a420721b04e9749761b5368628cb15593cb8cfdcc547107b98eddda5031d",
             (5, ), (5, )),
        ],
    ]

    with mock.patch.object(
            PostgresEngine,
            "apply_fragments",
            wraps=pg_repo_local.engine.apply_fragments) as apply_fragments:
        with mock.patch("splitgraph.core.table._generate_select_query",
                        side_effect=_generate_select_query) as _gsc:
            assert list(
                fruits.query(columns=["fruit_id", "name"],
                             quals=[[("fruit_id", ">=", "3")]])) == [
                                 {
                                     "fruit_id": 3,
                                     "name": "mayonnaise"
                                 },
                                 {
                                     "fruit_id": 4,
                                     "name": "fruit_4"
                                 },
                                 {
                                     "fruit_id": 5,
                                     "name": "fruit_5_updated"
                                 },
                             ]

            # This time we had to apply the fragments in the final group (since there were two of them)
            _assert_fragments_applied(_gsc, apply_fragments, pg_repo_local)
Пример #7
0
def test_disjoint_table_lq_two_singletons_one_overwritten_indirect(
        pg_repo_local):
    # Now test scanning the dataset with two singletons and one non-singleton group
    # by consuming queries one-by-one.

    prepare_lq_repo(pg_repo_local, commit_after_every=True, include_pk=True)
    pg_repo_local.run_sql(
        "INSERT INTO fruits VALUES (4, 'fruit_4'), (5, 'fruit_5')")
    fruits = pg_repo_local.commit().get_table("fruits")

    tables, callback, _ = fruits.query_indirect(columns=["fruit_id", "name"],
                                                quals=None)

    # At this point, we've "claimed" all objects but haven't done anything with them.
    # We're not really testing object claiming here since the objects were created locally
    # (see test_object_cache_deferred in test_object_cache.py for a test for claims/releases)
    assert len(callback) == 1

    # First, we emit cstore tables
    assert (
        next(tables) == b'"splitgraph_meta".'
        b'"of0fb43e477311f82aa30055be303ff00599dfe155d737def0d00f06e07228b"')
    assert len(callback) == 1

    # There's another singleton we can query directly.
    assert (
        next(tables) == b'"splitgraph_meta".'
        b'"oaa6d009e485bfa91aec4ab6b0ed1ebcd67055f6a3420d29f26446b034f41cc"')
    assert len(callback) == 1

    # We have two fragments left to scan but they overlap each other, so they have to be materialized.
    with mock.patch.object(
            PostgresEngine,
            "apply_fragments",
            wraps=pg_repo_local.engine.apply_fragments) as apply_fragments:
        next(tables)
        assert apply_fragments.call_count == 1
        args, _ = apply_fragments.call_args_list[0]
        tmp_table = args[2]

    # Because of this, our callback list now includes deleting the temporary table
    assert len(callback) == 2

    # We've now exhausted the list of queries
    with pytest.raises(StopIteration):
        next(tables)

    # ...but haven't called the callback yet
    assert pg_repo_local.engine.table_exists(SPLITGRAPH_META_SCHEMA, tmp_table)

    # Call the callback now, deleting the temporary table.
    callback()
    assert not pg_repo_local.engine.table_exists(SPLITGRAPH_META_SCHEMA,
                                                 tmp_table)
Пример #8
0
def _prepare_fully_remote_repo(local_engine_empty, pg_repo_remote_registry):
    # Setup: same as external, with an extra patch on top of the fruits table.
    pg_repo_local = clone(pg_repo_remote_registry)
    pg_repo_local.images["latest"].checkout()
    prepare_lq_repo(pg_repo_local, commit_after_every=True, include_pk=True)
    pg_repo_local.run_sql("INSERT INTO fruits VALUES (4, 'kumquat')")
    pg_repo_local.commit()
    pg_repo_local.push(handler="S3", handler_options={})
    pg_repo_local.delete()
    pg_repo_local.objects.cleanup()
    pg_repo_local.commit_engines()
Пример #9
0
def test_layered_querying_type_conversion(pg_repo_local):
    # For type bigint, Multicorn for some reason converts quals to be strings. Test we can handle that.
    prepare_lq_repo(pg_repo_local, commit_after_every=False, include_pk=True)
    pg_repo_local.run_sql("ALTER TABLE fruits ALTER COLUMN fruit_id TYPE bigint")
    pg_repo_local.commit()
    pg_repo_local.run_sql("INSERT INTO fruits VALUES (4, 'kumquat', 42, '2018-01-02T03:04:05')")
    new_head = pg_repo_local.commit()
    new_head.checkout(layered=True)

    # Make sure ANY works on integers (not converted to strings)
    assert pg_repo_local.run_sql(
        "SELECT * FROM fruits WHERE fruit_id IN (3, 4) ORDER BY fruit_id"
    ) == [(3, "mayonnaise", 1, _DT), (4, "kumquat", 42, dt(2018, 1, 2, 3, 4, 5))]
Пример #10
0
def test_disjoint_table_lq_two_singletons(pg_repo_local):
    # Add another two rows to the table with PKs 4 and 5
    prepare_lq_repo(pg_repo_local, commit_after_every=True, include_pk=True)
    pg_repo_local.run_sql("INSERT INTO fruits VALUES (4, 'fruit_4'), (5, 'fruit_5')")
    fruits = pg_repo_local.commit().get_table("fruits")
    # The new fragment lands in a separate group
    assert _get_chunk_groups(fruits) == [
        [
            ("of22f20503d3bf17c7449b545d68ebcee887ed70089f0342c4bff38862c0dc5", (1,), (2,)),
            ("o23fe42d48d7545596d0fea1c48bcf7d64bde574d437c77cc5bb611e5f8849d", (1,), (1,)),
            ("o3f81f6c40ecc3366d691a2ce45f41f6f180053020607cbd0873baf0c4447dc", (2,), (2,)),
        ],
        [("of0fb43e477311f82aa30055be303ff00599dfe155d737def0d00f06e07228b", (3,), (3,))],
        [("oaa6d009e485bfa91aec4ab6b0ed1ebcd67055f6a3420d29f26446b034f41cc", (4,), (5,))],
    ]

    # Query hitting PKs 3, 4 and 5: they hit single chunks that don't depend on anything,
    # so we still shouldn't be applying fragments.
    with mock.patch.object(
        PostgresEngine, "apply_fragments", wraps=pg_repo_local.engine.apply_fragments
    ) as apply_fragments:

        with mock.patch(
            "splitgraph.core.table._generate_select_query", side_effect=_generate_select_query
        ) as _gsc:
            assert list(
                fruits.query(columns=["fruit_id", "name"], quals=[[("fruit_id", ">=", "3")]])
            ) == [
                {"fruit_id": 3, "name": "mayonnaise"},
                {"fruit_id": 4, "name": "fruit_4"},
                {"fruit_id": 5, "name": "fruit_5"},
            ]
            assert apply_fragments.call_count == 0

            # Check that we generated two SELECT queries
            assert _gsc.mock_calls == [
                call(
                    mock.ANY,
                    b'"splitgraph_meta"."of0fb43e477311f82aa30055be303ff00599dfe155d737def0d00f06e07228b"',
                    ["fruit_id", "name"],
                    mock.ANY,
                    ("3",),
                ),
                call(
                    mock.ANY,
                    b'"splitgraph_meta"."oaa6d009e485bfa91aec4ab6b0ed1ebcd67055f6a3420d29f26446b034f41cc"',
                    ["fruit_id", "name"],
                    mock.ANY,
                    ("3",),
                ),
            ]
Пример #11
0
def test_disjoint_table_lq_indirect(pg_repo_local):
    # Test querying tables indirectly (returning some tables)
    prepare_lq_repo(pg_repo_local, commit_after_every=True, include_pk=True)
    fruits = pg_repo_local.images["latest"].get_table("fruits")

    result, callback, _ = fruits.query_indirect(columns=["fruit_id", "name"],
                                                quals=[[("fruit_id", "=", "3")]
                                                       ])

    # Check that we get a list of chunks to query and a callback to release the results
    assert list(result) == [
        b'"splitgraph_meta"."of0fb43e477311f82aa30055be303ff00599dfe155d737def0d00f06e07228b"'
    ]
    assert len(callback) == 1
Пример #12
0
def test_disjoint_table_lq_temp_table_deletion_doesnt_lock_up(pg_repo_local):
    # When Multicorn reads from the temporary table, it does that in the context of the
    # transaction that it's been called from. It hence can hold a read lock on the
    # temporary table that it won't release until the scan is over but the scan won't
    # be over until we've managed to delete the temporary table, leading to a deadlock.
    # Check deleting the table in a separate thread works.

    prepare_lq_repo(pg_repo_local, commit_after_every=True, include_pk=True)
    pg_repo_local.run_sql(
        "INSERT INTO fruits VALUES (4, 'fruit_4'), (5, 'fruit_5')")
    fruits = pg_repo_local.commit().get_table("fruits")

    tables, callback, plan = fruits.query_indirect(
        columns=["fruit_id", "name"], quals=None)

    # Force a materialization and get the query that makes us read from the temporary table.
    last_table = list(tables)[-1]
    pg_repo_local.commit_engines()

    # Simulate Multicorn reading from the table by doing it from a different engine.
    conn_params = _prepare_engine_config(CONFIG)
    engine = PostgresEngine(conn_params=conn_params, name="test_engine")
    query = _generate_select_query(engine, last_table, ["fruit_id", "name"])
    engine.run_sql(query)
    logging.info("Acquired read lock")

    # At this point, we're holding a lock on the table and callback will lock up, unless it's run
    # in a separate thread.
    callback(from_fdw=True)

    # Wait for the thread to actually start (a bit gauche but otherwise makes the test flaky(ier))
    time.sleep(1)

    # We still can read from the table (callback spawned a thread which is locked trying to delete it).
    engine.run_sql(query)

    # Now drop the lock.
    logging.info("Dropping the lock")
    engine.rollback()

    # If this test assertion fails, make sure we're not left holding the lock.
    engine.autocommit = True
    with pytest.raises(ObjectNotFoundError):
        engine.run_sql(query)
Пример #13
0
def test_lq_external(local_engine_empty, unprivileged_pg_repo,
                     pg_repo_remote_registry, clean_minio):
    # Test layered querying works when we initialize it on a cloned repo that doesn't have any
    # cached objects (all are on S3 or other external location).

    pg_repo_local = clone(unprivileged_pg_repo)
    pg_repo_local.images["latest"].checkout()
    prepare_lq_repo(pg_repo_local, commit_after_every=False, include_pk=True)

    # Setup: upstream has the same repository as in the previous test but with no cached objects (all are external).
    # In addition, we check that LQ works against an unprivileged upstream (where we don't actually have
    # admin access).
    pg_repo_local.push(unprivileged_pg_repo, handler="S3", handler_options={})
    pg_repo_local.delete()
    pg_repo_local.objects.cleanup()

    assert len(pg_repo_local.objects.get_all_objects()) == 0
    assert len(pg_repo_local.objects.get_downloaded_objects()) == 0
    assert len(pg_repo_remote_registry.objects.get_all_objects()) == 6

    # Proceed as per the previous test
    pg_repo_local = clone(unprivileged_pg_repo, download_all=False)
    _test_lazy_lq_checkout(pg_repo_local)
Пример #14
0
def test_disjoint_table_lq_two_singletons_one_overwritten(pg_repo_local):
    # Add another two rows to the table with PKs 4 and 5
    prepare_lq_repo(pg_repo_local, commit_after_every=True, include_pk=True)
    pg_repo_local.run_sql("INSERT INTO fruits VALUES (4, 'fruit_4'), (5, 'fruit_5')")
    pg_repo_local.commit()
    pg_repo_local.run_sql("UPDATE fruits SET name = 'fruit_5_updated' WHERE fruit_id = 5")
    fruits = pg_repo_local.commit().get_table("fruits")

    assert _get_chunk_groups(fruits) == [
        [
            ("of22f20503d3bf17c7449b545d68ebcee887ed70089f0342c4bff38862c0dc5", (1,), (2,)),
            ("o23fe42d48d7545596d0fea1c48bcf7d64bde574d437c77cc5bb611e5f8849d", (1,), (1,)),
            ("o3f81f6c40ecc3366d691a2ce45f41f6f180053020607cbd0873baf0c4447dc", (2,), (2,)),
        ],
        [("of0fb43e477311f82aa30055be303ff00599dfe155d737def0d00f06e07228b", (3,), (3,))],
        # The pk=5 update has to be added to the last chunk group, making it a non-singleton
        [
            ("oaa6d009e485bfa91aec4ab6b0ed1ebcd67055f6a3420d29f26446b034f41cc", (4,), (5,)),
            ("o15a420721b04e9749761b5368628cb15593cb8cfdcc547107b98eddda5031d", (5,), (5,)),
        ],
    ]

    with mock.patch.object(
        PostgresEngine, "apply_fragments", wraps=pg_repo_local.engine.apply_fragments
    ) as apply_fragments:
        with mock.patch(
            "splitgraph.core.table._generate_select_query", side_effect=_generate_select_query
        ) as _gsc:
            assert list(
                fruits.query(columns=["fruit_id", "name"], quals=[[("fruit_id", ">=", "3")]])
            ) == [
                {"fruit_id": 3, "name": "mayonnaise"},
                {"fruit_id": 4, "name": "fruit_4"},
                {"fruit_id": 5, "name": "fruit_5_updated"},
            ]

            # This time we had to apply the fragments in the final group (since there were two of them)
            apply_fragments.assert_called_once_with(
                [
                    (
                        "splitgraph_meta",
                        "oaa6d009e485bfa91aec4ab6b0ed1ebcd67055f6a3420d29f26446b034f41cc",
                    ),
                    (
                        "splitgraph_meta",
                        "o15a420721b04e9749761b5368628cb15593cb8cfdcc547107b98eddda5031d",
                    ),
                ],
                SPLITGRAPH_META_SCHEMA,
                mock.ANY,
                extra_qual_args=("3",),
                extra_quals=mock.ANY,
                schema_spec=mock.ANY,
            )

            # Two calls to _generate_select_queries -- one to directly query the pk=3 chunk...
            assert _gsc.call_args_list == [
                mock.call(
                    pg_repo_local.engine,
                    b'"splitgraph_meta".'
                    b'"of0fb43e477311f82aa30055be303ff00599dfe155d737def0d00f06e07228b"',
                    ["fruit_id", "name"],
                    mock.ANY,
                    ("3",),
                ),
                # ...and one to query the applied fragments in the second group.
                mock.call(pg_repo_local.engine, mock.ANY, ["fruit_id", "name"], mock.ANY, ("3",),),
            ]

            # Check the temporary table has been deleted since we've exhausted the query
            args, _ = apply_fragments.call_args_list[0]
            tmp_table = args[2]
            assert not pg_repo_local.engine.table_exists(SPLITGRAPH_META_SCHEMA, tmp_table)

    # Now query PKs 3 and 4. Even though the chunk containing PKs 4 and 5 was updated
    # (by changing PK 5), the qual filter should drop the update, as it's not pertinent
    # to the query. Hence, we should end up not needing fragment application.
    with mock.patch.object(
        PostgresEngine, "apply_fragments", wraps=pg_repo_local.engine.apply_fragments
    ) as apply_fragments:
        with mock.patch(
            "splitgraph.core.table._generate_select_query", side_effect=_generate_select_query
        ) as _gsc:
            assert list(
                fruits.query(
                    columns=["fruit_id", "name"],
                    quals=[[("fruit_id", "=", "3"), ("fruit_id", "=", "4")]],
                )
            ) == [{"fruit_id": 3, "name": "mayonnaise"}, {"fruit_id": 4, "name": "fruit_4"}]

            # No fragment application
            assert apply_fragments.call_count == 0

            # Single call to _generate_select_queries directly selecting rows from the two chunks
            assert _gsc.mock_calls == [
                call(
                    mock.ANY,
                    b'"splitgraph_meta".'
                    b'"of0fb43e477311f82aa30055be303ff00599dfe155d737def0d00f06e07228b"',
                    ["fruit_id", "name"],
                    mock.ANY,
                    ("3", "4"),
                ),
                call(
                    mock.ANY,
                    b'"splitgraph_meta".'
                    b'"oaa6d009e485bfa91aec4ab6b0ed1ebcd67055f6a3420d29f26446b034f41cc"',
                    ["fruit_id", "name"],
                    mock.ANY,
                    ("3", "4"),
                ),
            ]
Пример #15
0
def test_provenance_inline_sql(readonly_pg_repo, pg_repo_local):
    prepare_lq_repo(pg_repo_local, commit_after_every=False, include_pk=True)
    pg_repo_local.head.tag("v2")

    execute_commands(
        load_splitfile("inline_sql.splitfile"),
        output=OUTPUT,
    )

    new_head = OUTPUT.head

    remote_input = readonly_pg_repo.images["latest"]
    local_input = pg_repo_local.images["latest"]

    assert set(new_head.provenance()) == {
        (
            readonly_pg_repo,
            remote_input.image_hash,
        ),
        (pg_repo_local, local_input.image_hash),
    }

    assert remote_input.provenance(reverse=True, engine=OUTPUT.engine) == [
        (OUTPUT, OUTPUT.head.image_hash)
    ]

    assert local_input.provenance(reverse=True, engine=OUTPUT.engine) == [
        (OUTPUT, OUTPUT.head.image_hash)
    ]
    expected_sql = ("SQL {{CREATE TABLE balanced_diet\n"
                    "  AS SELECT fruits.fruit_id AS id\n"
                    "          , fruits.name AS fruit\n"
                    "          , my_fruits.timestamp AS timestamp\n"
                    "          , vegetables.name AS vegetable\n"
                    "     FROM "
                    '"otheruser/pg_mount:{0}".fruits AS '
                    "fruits\n"
                    "          INNER JOIN "
                    '"otheruser/pg_mount:{0}".vegetables '
                    "AS vegetables ON fruits.fruit_id = vegetable_id\n"
                    "          LEFT JOIN "
                    '"test/pg_mount:{1}".fruits AS '
                    "my_fruits ON my_fruits.fruit_id = fruits.fruit_id;\n"
                    "\n"
                    "ALTER TABLE balanced_diet ADD PRIMARY KEY (id)}}").format(
                        remote_input.image_hash, local_input.image_hash)

    assert new_head.to_splitfile() == [expected_sql]

    assert new_head.to_splitfile(source_replacement={
        pg_repo_local: "new_local_tag",
        readonly_pg_repo: "new_remote_tag"
    }) == [
        expected_sql.replace(remote_input.image_hash,
                             "new_remote_tag").replace(local_input.image_hash,
                                                       "new_local_tag")
    ]

    assert len(OUTPUT.images()) == 2

    # Try rerunning the Splitfile against the same original data (check caching)
    rebuild_image(
        OUTPUT.head,
        source_replacement={
            pg_repo_local: "latest",
            readonly_pg_repo: "latest"
        },
    )

    assert len(OUTPUT.images()) == 2

    # Change pg_repo_local and rerun the Splitfile against it.
    pg_repo_local.run_sql(
        "UPDATE fruits SET timestamp = '2020-01-01 12:00:00' WHERE fruit_id = 2"
    )
    new_head = pg_repo_local.commit()

    rebuild_image(
        OUTPUT.head,
        source_replacement={
            pg_repo_local: new_head.image_hash,
            readonly_pg_repo: "latest"
        },
    )

    assert len(OUTPUT.images()) == 3
    assert OUTPUT.run_sql("SELECT * FROM balanced_diet") == [
        (1, "apple", None, "potato"),
        (2, "orange", datetime.datetime(2020, 1, 1, 12, 0), "carrot"),
    ]