Пример #1
0
def test_import_splitfile_reuses_hash(local_engine_empty):
    # Create two repositories and run the same Splitfile that loads some data from a mounted database.
    # Check that the same contents result in the same hash and no extra objects being created
    output_2 = Repository.from_schema("output_2")

    execute_commands(load_splitfile("import_from_mounted_db.splitfile"),
                     output=OUTPUT)
    execute_commands(load_splitfile("import_from_mounted_db.splitfile"),
                     output=output_2)

    head = OUTPUT.head
    assert head.get_table("my_fruits").objects == [
        "o71ba35a5bbf8ac7779d8fe32226aaacc298773e154a4f84e9aabf829238fb1"
    ]
    assert head.get_table("o_vegetables").objects == [
        "o70e726f4bf18547242722600c4723dceaaede27db8fa5e9e6d7ec39187dd86"
    ]
    assert head.get_table("vegetables").objects == [
        "ob474d04a80c611fc043e8303517ac168444dc7518af60e4ccc56b3b0986470"
    ]
    assert head.get_table("all_fruits").objects == [
        "o0e742bd2ea4927f5193a2c68f8d4c51ea018b1ef3e3005a50727147d2cf57b"
    ]

    head_2 = output_2.head
    assert head_2.get_table("my_fruits").objects == head.get_table(
        "my_fruits").objects
    assert head_2.get_table("o_vegetables").objects == head.get_table(
        "o_vegetables").objects
    assert head_2.get_table("vegetables").objects == head.get_table(
        "vegetables").objects
    assert head_2.get_table("all_fruits").objects == head.get_table(
        "all_fruits").objects
Пример #2
0
def build_c(splitfile, args, output_repository):
    """
    Build Splitgraph images.

    This executes a Splitfile, building a new image or checking it out from cache if the same
    image had already been built.

    Examples:

    ``sgr build my.splitfile``

    Executes ``my.splitfile`` and writes its output into a new repository with the same name
    as the Splitfile (my) unless the name is specified in the Splitfile.

    ``sgr build my.splitfile -o mynew/repo``

    Executes ``my.splitfile`` and writes its output into ``mynew/repo``.

    ``sgr build my_other.splitfile -o mynew/otherrepo --args PARAM1 VAL1 --args PARAM2 VAL2``

    Executes ``my_other.splitfile`` with parameters ``PARAM1`` and ``PARAM2`` set to
    ``VAL1`` and  ``VAL2``, respectively.
    """
    from splitgraph.splitfile import execute_commands
    from splitgraph.core.repository import Repository

    args = {k: v for k, v in args}
    click.echo("Executing Splitfile %s with arguments %r" %
               (splitfile.name, args))

    if output_repository is None:
        file_name = os.path.splitext(os.path.basename(splitfile.name))[0]
        output_repository = Repository.from_schema(file_name)

    execute_commands(splitfile.read(), args, output=output_repository)
Пример #3
0
def test_rerun_with_from_import(local_engine_empty, pg_repo_remote_multitag):
    execute_commands(load_splitfile("import_remote_multiple.splitfile"),
                     params={"TAG": "v1"},
                     output=OUTPUT)

    output_v1 = OUTPUT.head
    # Do a logical rebase of the newly created image on the V2 of the remote repository

    rebuild_image(output_v1, {pg_repo_remote_multitag: "v2"})
    output_v2 = OUTPUT.head

    # Do some checks on the structure of the final output repo. In particular, make sure that the two derived versions
    # still exist and depend only on the respective tags of the source.
    v1 = pg_repo_remote_multitag.images["v1"]
    v2 = pg_repo_remote_multitag.images["v2"]
    assert output_v1.provenance() == [(pg_repo_remote_multitag, v1.image_hash)]
    assert output_v2.provenance() == [(pg_repo_remote_multitag, v2.image_hash)]

    ov1_log = output_v1.get_log()
    ov2_log = output_v2.get_log()

    # ov1_log: CREATE TABLE commit, then IMPORT from v1, then the 00000 commit
    # ov2_log: CREATE TABLE commit, then FROM v1, then the 00000.. commit
    assert ov1_log[2:] == ov2_log[2:]
    assert len(ov1_log) == 3
Пример #4
0
def test_rerun_with_new_version(local_engine_empty, pg_repo_remote_multitag):
    # Test running commands that base new datasets on a remote repository.
    execute_commands(load_splitfile("from_remote.splitfile"),
                     params={"TAG": "v1"},
                     output=OUTPUT)

    output_v1 = OUTPUT.head
    # Do a logical rebase of the newly created image on the V2 of the remote repository

    rebuild_image(output_v1, {pg_repo_remote_multitag: "v2"})
    output_v2 = OUTPUT.head
    assert local_engine_empty.run_sql("SELECT * FROM output.join_table") == [
        (2, "orange", "carrot")
    ]

    # Do some checks on the structure of the final output repo. In particular, make sure that the two derived versions
    # still exist and depend only on the respective tags of the source.
    v1 = pg_repo_remote_multitag.images["v1"]
    v2 = pg_repo_remote_multitag.images["v2"]
    assert output_v1.provenance() == [(pg_repo_remote_multitag, v1.image_hash)]
    assert output_v2.provenance() == [(pg_repo_remote_multitag, v2.image_hash)]

    ov1_log = output_v1.get_log()
    ov2_log = output_v2.get_log()

    # ov1_log: CREATE TABLE commit, then FROM v1
    # ov2_log: CREATE TABLE commit, then FROM v2 which is based on FROM v1 (since we cloned both from test/pg_mount),
    # then as previously.
    assert ov1_log[1:] == ov2_log[2:]
Пример #5
0
def test_rerun_multiline_sql_roundtripping(pg_repo_local):
    # Test that with a multiline SQL sgr rebuild doesn't create a new image
    # when rebuilding the same one.
    execute_commands(load_splitfile("multiline_sql.splitfile"), output=OUTPUT)

    head = OUTPUT.head
    expected_sql = "SQL {INSERT INTO fruits \n" "VALUES (3, 'banana')\n" "     , (4, 'pineapple')}"

    assert head.to_splitfile()[1] == expected_sql

    rebuild_image(head, {})
    head_v2 = OUTPUT.head
    assert head_v2.to_splitfile()[1] == expected_sql
    assert head_v2 == head
Пример #6
0
def test_provenance_with_from(local_engine_empty, pg_repo_remote_multitag):
    execute_commands(load_splitfile("from_remote.splitfile"),
                     params={"TAG": "v1"},
                     output=OUTPUT)
    dependencies = OUTPUT.head.provenance()

    assert dependencies == [(pg_repo_remote_multitag,
                             pg_repo_remote_multitag.images["v1"].image_hash)]

    source = pg_repo_remote_multitag.images["v1"]
    assert source.provenance(reverse=True, engine=local_engine_empty) == [
        (OUTPUT, OUTPUT.head.image_hash)
    ]

    assert source.provenance() == []
Пример #7
0
def test_provenance(local_engine_empty, pg_repo_remote_multitag):
    execute_commands(load_splitfile("import_remote_multiple.splitfile"),
                     params={"TAG": "v1"},
                     output=OUTPUT)
    dependencies = OUTPUT.head.provenance()

    assert dependencies == [(pg_repo_remote_multitag,
                             pg_repo_remote_multitag.images["v1"].image_hash)]

    # Check reverse provenance. Since the repository lives on the remote engine, we need to
    # search for dependents on the local engine instead.
    source = pg_repo_remote_multitag.images["v1"]
    assert source.provenance(reverse=True, engine=local_engine_empty) == [
        (OUTPUT, OUTPUT.head.image_hash)
    ]

    assert source.provenance() == []
Пример #8
0
def test_splitfile_recreate(local_engine_empty, pg_repo_remote_multitag):
    execute_commands(
        load_splitfile("import_with_custom_query_and_sql.splitfile"),
        params={"TAG": "v1"},
        output=OUTPUT,
    )
    recreated_commands = OUTPUT.head.to_splitfile()
    assert recreated_commands == [
        "FROM test/pg_mount:%s IMPORT {SELECT *\n" %
        pg_repo_remote_multitag.images["v1"].image_hash + """FROM fruits
WHERE name = 'orange'} AS my_fruits, {SELECT *
FROM vegetables
WHERE name LIKE '%o'} AS o_vegetables, vegetables AS vegetables, fruits AS all_fruits""",
        """SQL {CREATE TABLE test_table
  AS SELECT *
     FROM all_fruits}""",
    ]
Пример #9
0
def test_splitfile_recreate_custom_from(local_engine_empty,
                                        pg_repo_remote_multitag):
    execute_commands(load_splitfile("from_remote.splitfile"),
                     params={"TAG": "v1"},
                     output=OUTPUT)
    recreated_commands = OUTPUT.head.to_splitfile()

    assert recreated_commands == [
        "FROM test/pg_mount:%s" %
        pg_repo_remote_multitag.images["v1"].image_hash,
        # Test provenance is recorded using the reformatted SQL
        """SQL {CREATE TABLE join_table
  AS SELECT fruit_id AS id
          , fruits.name AS fruit
          , vegetables.name AS vegetable
     FROM fruits
          INNER JOIN vegetables ON fruit_id = vegetable_id}""",
    ]
Пример #10
0
def test_splitfile_incomplete_provenance(local_engine_empty,
                                         pg_repo_remote_multitag):
    # This splitfile has a MOUNT as its first command. Check we emit partial splitfiles
    # instead of errors (we can't reproduce MOUNT commands).
    execute_commands(
        load_splitfile("import_from_mounted_db_with_sql.splitfile"),
        params={"TAG": "v1"},
        output=OUTPUT,
    )
    head_img = OUTPUT.head
    image_with_mount = head_img.get_log()[-2]
    recreated_commands = head_img.to_splitfile(ignore_irreproducible=True)

    assert recreated_commands == [
        "# Irreproducible Splitfile command of type MOUNT",
        "SQL {CREATE TABLE new_table\n"
        "  AS SELECT *\n"
        "     FROM all_fruits}",
    ]
Пример #11
0
def test_dummy_command(pg_repo_local):
    # Basic test to make sure the config gets wired to the splitfile executor and the arguments
    # are passed to it correctly.
    execute_commands(load_splitfile("custom_command_dummy.splitfile"),
                     output=OUTPUT)
    log = OUTPUT.head.get_log()

    assert (
        len(log) == 3
    )  # Base 000.., import from test/pg_mount, DUMMY run that created a dupe image
    assert log[0].get_tables() == log[1].get_tables()
    assert log[0].comment == 'DUMMY arg1 --arg2 "argument three"'

    # Run the command again -- since it returns a random hash every time, it should add yet another image to the base.
    execute_commands(load_splitfile("custom_command_dummy.splitfile"),
                     output=OUTPUT)
    new_log = OUTPUT.head.get_log()

    # Two common images -- 0000... and the import
    assert new_log[2] == log[2]
    assert new_log[1] == log[1]

    # However, the DUMMY command created a new image with a random hash
    assert new_log[0] != log[0]
Пример #12
0
def test_calc_hash_short_circuit(pg_repo_local):
    # Test that if the hash returned by calc_hash is unchanged, we don't run execute() again
    execute_commands(load_splitfile("custom_command_calc_hash.splitfile"),
                     output=OUTPUT)

    # Run 1: table gets dropped (since the image doesn't exist)
    log = OUTPUT.head.get_log()
    assert len(
        log) == 3  # Base 000.., import from test/pg_mount, drop table fruits
    assert log[0].get_tables() == []
    # Hash: combination of the previous image hash and the command context (unchanged)
    assert log[0].image_hash == _combine_hashes(
        [log[1].image_hash, "deadbeef" * 8])

    # Run 2: same command context hash, same original image -- no effect
    with patch(
            "test.splitgraph.splitfile.test_custom_commands.CalcHashTestCommand.execute"
    ) as cmd:
        execute_commands(load_splitfile("custom_command_calc_hash.splitfile"),
                         output=OUTPUT)
        new_log = OUTPUT.head.get_log()
        assert new_log == log
        assert cmd.call_count == 0

    # Run 3: alter test_pg_mount (same command context hash but different image)
    pg_repo_local.run_sql(
        """UPDATE fruits SET name = 'banana' where fruit_id = 1""")
    pg_repo_local.commit()
    with patch(
            "test.splitgraph.splitfile.test_custom_commands.CalcHashTestCommand.execute"
    ) as cmd:
        execute_commands(load_splitfile("custom_command_calc_hash.splitfile"),
                         output=OUTPUT)
        log_3 = OUTPUT.head.get_log()

        assert cmd.call_count == 1
        assert len(log_3) == 3

        # Since we patched the execute() out, it won't have run the DROP TABLE command so we don't check for that.
        # However, the sg_meta is still altered.
        assert log_3[0].image_hash == _combine_hashes(
            [log_3[1].image_hash, "deadbeef" * 8])
        # Import from test/pg_mount changed (since the actual repo changed)
        assert log_3[1].image_hash != log[1].image_hash
        # Base layer (00000...) unchanged
        assert log_3[2].image_hash == log[2].image_hash
Пример #13
0
def test_custom_command_errors(pg_repo_local):
    # Test we raise for undefined commands
    with pytest.raises(SplitfileError) as e:
        execute_commands(
            load_splitfile("custom_command_dummy.splitfile").replace(
                "DUMMY", "NOP"),
            output=OUTPUT)
    assert "Custom command NOP not found in the config!" in str(e.value)

    # Test we raise for commands that can't be imported
    with pytest.raises(SplitfileError) as e:
        execute_commands(
            load_splitfile("custom_command_dummy.splitfile").replace(
                "DUMMY", "BROKEN1"),
            output=OUTPUT,
        )
    assert "Error loading custom command BROKEN1" in str(e.value)
    with pytest.raises(SplitfileError) as e:
        execute_commands(
            load_splitfile("custom_command_dummy.splitfile").replace(
                "DUMMY", "BROKEN2"),
            output=OUTPUT,
        )
    assert "Error loading custom command BROKEN2" in str(e.value)
Пример #14
0
def test_provenance_inline_sql(readonly_pg_repo, pg_repo_local):
    prepare_lq_repo(pg_repo_local, commit_after_every=False, include_pk=True)
    pg_repo_local.head.tag("v2")

    execute_commands(
        load_splitfile("inline_sql.splitfile"),
        output=OUTPUT,
    )

    new_head = OUTPUT.head

    remote_input = readonly_pg_repo.images["latest"]
    local_input = pg_repo_local.images["latest"]

    assert set(new_head.provenance()) == {
        (
            readonly_pg_repo,
            remote_input.image_hash,
        ),
        (pg_repo_local, local_input.image_hash),
    }

    assert remote_input.provenance(reverse=True, engine=OUTPUT.engine) == [
        (OUTPUT, OUTPUT.head.image_hash)
    ]

    assert local_input.provenance(reverse=True, engine=OUTPUT.engine) == [
        (OUTPUT, OUTPUT.head.image_hash)
    ]
    expected_sql = ("SQL {{CREATE TABLE balanced_diet\n"
                    "  AS SELECT fruits.fruit_id AS id\n"
                    "          , fruits.name AS fruit\n"
                    "          , my_fruits.timestamp AS timestamp\n"
                    "          , vegetables.name AS vegetable\n"
                    "     FROM "
                    '"otheruser/pg_mount:{0}".fruits AS '
                    "fruits\n"
                    "          INNER JOIN "
                    '"otheruser/pg_mount:{0}".vegetables '
                    "AS vegetables ON fruits.fruit_id = vegetable_id\n"
                    "          LEFT JOIN "
                    '"test/pg_mount:{1}".fruits AS '
                    "my_fruits ON my_fruits.fruit_id = fruits.fruit_id;\n"
                    "\n"
                    "ALTER TABLE balanced_diet ADD PRIMARY KEY (id)}}").format(
                        remote_input.image_hash, local_input.image_hash)

    assert new_head.to_splitfile() == [expected_sql]

    assert new_head.to_splitfile(source_replacement={
        pg_repo_local: "new_local_tag",
        readonly_pg_repo: "new_remote_tag"
    }) == [
        expected_sql.replace(remote_input.image_hash,
                             "new_remote_tag").replace(local_input.image_hash,
                                                       "new_local_tag")
    ]

    assert len(OUTPUT.images()) == 2

    # Try rerunning the Splitfile against the same original data (check caching)
    rebuild_image(
        OUTPUT.head,
        source_replacement={
            pg_repo_local: "latest",
            readonly_pg_repo: "latest"
        },
    )

    assert len(OUTPUT.images()) == 2

    # Change pg_repo_local and rerun the Splitfile against it.
    pg_repo_local.run_sql(
        "UPDATE fruits SET timestamp = '2020-01-01 12:00:00' WHERE fruit_id = 2"
    )
    new_head = pg_repo_local.commit()

    rebuild_image(
        OUTPUT.head,
        source_replacement={
            pg_repo_local: new_head.image_hash,
            readonly_pg_repo: "latest"
        },
    )

    assert len(OUTPUT.images()) == 3
    assert OUTPUT.run_sql("SELECT * FROM balanced_diet") == [
        (1, "apple", None, "potato"),
        (2, "orange", datetime.datetime(2020, 1, 1, 12, 0), "carrot"),
    ]