Example #1
def test_splitfile_end_to_end_with_uploading(local_engine_empty, remote_engine,
                                             pg_repo_remote_multitag,
                                             mg_repo_remote, clean_minio):
    # An end-to-end test:
    #   * Create a derived dataset from some tables imported from the remote engine
    #   * Push it back to the remote engine, uploading all objects to S3 (instead of the remote engine itself)
    #   * Delete everything from pgcache
    #   * Run another splitfile that depends on the just-pushed dataset (and does lazy checkouts to
    #     get the required tables).

    # Do the same setup first and run the splitfile against the remote data.
    execute_commands(load_splitfile("import_remote_multiple.splitfile"),
                     params={"TAG": "v1"},
                     output=OUTPUT)

    remote_output = Repository(OUTPUT.namespace, OUTPUT.repository,
                               remote_engine)

    # Push with upload
    OUTPUT.push(remote_repository=remote_output,
                handler="S3",
                handler_options={})
    # Unmount everything locally and cleanup
    for mountpoint, _ in get_current_repositories(local_engine_empty):
        mountpoint.delete()
    OUTPUT.objects.cleanup()

    stage_2 = R("output_stage_2")
    execute_commands(
        load_splitfile("import_from_preuploaded_remote.splitfile"),
        output=stage_2)

    assert stage_2.run_sql("SELECT id, name, fruit, vegetable FROM diet") == [
        (2, "James", "orange", "carrot")
    ]
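
Example #1 (and several later examples) executes import_remote_multiple.splitfile, whose contents aren't shown here. A plausible sketch, reconstructed from the tables the tests assert on (my_fruits, vegetables, and a join_table with id/fruit/vegetable columns), using Splitgraph's FROM ... IMPORT and SQL commands with ${TAG} substitution; the repository name is an assumption:

# import_remote_multiple.splitfile -- illustrative reconstruction, not the actual file
FROM test/pg_mount:${TAG} IMPORT fruits AS my_fruits, vegetables AS vegetables

SQL CREATE TABLE join_table AS \
    SELECT my_fruits.fruit_id AS id, my_fruits.name AS fruit, vegetables.name AS vegetable \
    FROM my_fruits JOIN vegetables ON my_fruits.fruit_id = vegetables.vegetable_id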
Example #2
def test_from_remote(local_engine_empty, pg_repo_remote_multitag):
    # Test running commands that base new datasets on a remote repository.
    execute_commands(load_splitfile("from_remote.splitfile"),
                     params={"TAG": "v1"},
                     output=OUTPUT)

    new_head = OUTPUT.head
    parent = OUTPUT.images.by_hash(new_head.parent_id)
    # Go back to the parent: the two source tables should exist there
    parent.checkout()
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "vegetables")
    assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "join_table")

    new_head.checkout()
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "vegetables")
    assert OUTPUT.run_sql("SELECT * FROM join_table") == [
        (1, "apple", "potato"),
        (2, "orange", "carrot"),
    ]

    # Now run the same splitfile but from v2 of the remote (where row 1 has been removed
    # from the fruits table). First, remove the output mountpoint (otherwise the executor
    # tries to fetch the nonexistent commit 0000 from it).
    OUTPUT.delete()
    execute_commands(load_splitfile("from_remote.splitfile"),
                     params={"TAG": "v2"},
                     output=OUTPUT)

    assert OUTPUT.run_sql("SELECT * FROM join_table") == [(2, "orange",
                                                           "carrot")]
Example #3
    def run(self, upstream_repos: Optional[Dict[str, str]] = None, splitfile_commands: Optional[str] = None, output: Optional[Workspace] = None, **kwargs: Any):
        """Execute Splitfile commands against an output repository.

        Args:
            upstream_repos: Mapping of parameter names to upstream repository URIs;
                each URI is resolved and made available to the Splitfile as a
                formatting parameter.
            splitfile_commands: Source text of the Splitfile to execute.
            output: Workspace describing the output repository (expects a
                'repo_uri' key).

        Returns:
            None
        """
        repo_infos = {name: parse_repo(uri) for name, uri in upstream_repos.items()}
        v1_sgr_repo_uris = {name: repo_info.v1_sgr_uri() for name, repo_info in repo_infos.items()}

        formatting_kwargs = {
            **v1_sgr_repo_uris,
            **kwargs,
            **prefect.context.get("parameters", {}).copy(),
            **prefect.context,
        }

        repo_info = parse_repo(output["repo_uri"])
        repo = Repository(namespace=repo_info.namespace, repository=repo_info.repository)

        execute_commands(
            splitfile_commands,
            params=formatting_kwargs,
            output=repo,
            # output_base=output['image_hash'],
        )
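
For context, this run() reads prefect.context, so it presumably belongs to a Prefect (1.x) task. A minimal invocation sketch; the task class name SplitfileTask and all repository URIs are purely illustrative, and output only needs dict-style access with a 'repo_uri' key:

from prefect import Flow

with Flow("run-splitfile") as flow:
    # Calling the task instance inside the flow context registers a task run.
    SplitfileTask()(
        upstream_repos={"source": "namespace/source_repo"},
        splitfile_commands="FROM ${source} IMPORT fruits AS fruits",
        output={"repo_uri": "namespace/output_repo"},
    )

flow.run()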
Example #4
def test_splitfile_inline_sql(readonly_pg_repo, pg_repo_local):
    # Test SQL commands accessing repos directly -- join a remote repo with
    # some local data.

    prepare_lq_repo(pg_repo_local, commit_after_every=False, include_pk=True)
    pg_repo_local.head.tag("v2")

    execute_commands(
        load_splitfile("inline_sql.splitfile"),
        output=OUTPUT,
    )

    new_head = OUTPUT.head
    new_head.checkout()
    assert new_head.get_tables() == ["balanced_diet"]
    assert OUTPUT.run_sql("SELECT * FROM balanced_diet") == [
        (1, "apple", None, "potato"),
        (2, "orange", datetime.datetime(2019, 1, 1, 12, 0), "carrot"),
    ]

    local_repo_head = pg_repo_local.head.image_hash
    other_repo_head = readonly_pg_repo.images["latest"].image_hash

    assert new_head.provenance_data == [
        {
            "sources": [
                {
                    "source": "pg_mount",
                    "source_hash": other_repo_head,
                    "source_namespace": "otheruser",
                },
                {
                    "source": "pg_mount",
                    "source_hash": local_repo_head,
                    "source_namespace": "test"
                },
            ],
            "sql": ("CREATE TABLE balanced_diet\n"
                    "  AS SELECT fruits.fruit_id AS id\n"
                    "          , fruits.name AS fruit\n"
                    "          , my_fruits.timestamp AS timestamp\n"
                    "          , vegetables.name AS vegetable\n"
                    "     FROM "
                    '"otheruser/pg_mount:{0}".fruits '
                    "AS fruits\n"
                    "          INNER JOIN "
                    '"otheruser/pg_mount:{0}".vegetables '
                    "AS vegetables ON fruits.fruit_id = vegetable_id\n"
                    "          LEFT JOIN "
                    '"test/pg_mount:{1}".fruits '
                    "AS my_fruits ON my_fruits.fruit_id = fruits.fruit_id;\n"
                    "\n"
                    "ALTER TABLE balanced_diet ADD PRIMARY KEY (id)").format(
                        other_repo_head, local_repo_head),
            "type":
            "SQL",
        },
    ]
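
The provenance assertion pins down the splitfile's single SQL command: the hashes in the assertion stand in for whatever tags the original file used. A plausible reconstruction of inline_sql.splitfile (the 'latest' and 'v2' tags are assumptions based on the fixtures in this test):

SQL {
    CREATE TABLE balanced_diet
      AS SELECT fruits.fruit_id AS id
              , fruits.name AS fruit
              , my_fruits.timestamp AS timestamp
              , vegetables.name AS vegetable
         FROM "otheruser/pg_mount:latest".fruits AS fruits
              INNER JOIN "otheruser/pg_mount:latest".vegetables AS vegetables
                      ON fruits.fruit_id = vegetable_id
              LEFT JOIN "test/pg_mount:v2".fruits AS my_fruits
                      ON my_fruits.fruit_id = fruits.fruit_id;

    ALTER TABLE balanced_diet ADD PRIMARY KEY (id)
}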
Example #5
def test_splitfile_remote_hash(local_engine_empty, pg_repo_remote):
    head = pg_repo_remote.head.image_hash
    execute_commands(load_splitfile("import_remote_multiple.splitfile"),
                     params={"TAG": head[:10]},
                     output=OUTPUT)
    assert OUTPUT.run_sql(
        "SELECT id, fruit, vegetable FROM output.join_table") == [
            (1, "apple", "potato"),
            (2, "orange", "carrot"),
        ]
Example #6
def test_local_import_splitfile(pg_repo_local):
    execute_commands(load_splitfile("import_local.splitfile"), output=OUTPUT)
    head = OUTPUT.head
    old_head = head.parent_id

    OUTPUT.images.by_hash(old_head).checkout()
    assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "my_fruits")
    assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")

    head.checkout()
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "my_fruits")
    assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")
Example #7
def test_drawing(pg_repo_local):
    # Doesn't really check anything, mostly used to make sure the tree drawing code doesn't throw.
    execute_commands(load_splitfile("import_local.splitfile"), output=OUTPUT)

    # Make another branch to check multi-branch repositories can render.
    pg_repo_local.images()[1].checkout()
    pg_repo_local.run_sql("INSERT INTO fruits VALUES (3, 'kiwi')")
    pg_repo_local.commit()

    rebuild_image(OUTPUT.head, {pg_repo_local: pg_repo_local.head.image_hash})

    render_tree(OUTPUT)
Example #8
def test_update_without_import_splitfile(pg_repo_local):
    # Test that the correct commits are produced by executing a splitfile (both against
    # newly created and already existing tables on an existing mountpoint).
    execute_commands(load_splitfile("update_without_import.splitfile"),
                     output=OUTPUT)
    log = OUTPUT.head.get_log()

    log[1].checkout()
    assert OUTPUT.run_sql("SELECT * FROM my_fruits") == []

    log[0].checkout()
    assert OUTPUT.run_sql("SELECT * FROM my_fruits") == [(1, "pineapple")]
Example #9
def test_splitfile_sql_failure(local_engine_empty, pg_repo_remote_multitag):
    assert len(OUTPUT.images()) == 0
    assert _get_table_count(OUTPUT) == 0

    with pytest.raises(psycopg2.errors.UndefinedTable) as e:
        execute_commands(load_splitfile("import_remote_broken_stage_2.splitfile"), output=OUTPUT)
    assert 'relation "nonexistent_fruits_table" does not exist' in str(e.value)

    # Check the execution created the first dummy (000...) image and the second image
    # with IMPORT results
    assert len(OUTPUT.images()) == 2
    assert _get_table_count(OUTPUT) == 2
    assert sorted(OUTPUT.images["latest"].get_tables()) == ["my_fruits", "vegetables"]
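
These image/table-count assertions rely on a _get_table_count helper that isn't shown. A minimal sketch consistent with how the tests use it, assuming it counts the table entries registered for the repository in Splitgraph's splitgraph_meta schema (the real helper may differ):

def _get_table_count(repo):
    # Count table entries recorded for this repository across all of its images.
    return repo.engine.run_sql(
        "SELECT COUNT(*) FROM splitgraph_meta.tables"
        " WHERE namespace = %s AND repository = %s",
        (repo.namespace, repo.repository),
    )[0][0]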
Example #10
def test_basic_splitfile(pg_repo_local):
    execute_commands(load_splitfile("create_table.splitfile"), output=OUTPUT)
    log = list(reversed(OUTPUT.head.get_log()))

    log[1].checkout()
    assert OUTPUT.run_sql("SELECT * FROM my_fruits") == []

    log[2].checkout()
    assert OUTPUT.run_sql("SELECT * FROM my_fruits") == [(1, "pineapple")]

    log[3].checkout()
    assert OUTPUT.run_sql("SELECT * FROM my_fruits") == [(1, "pineapple"),
                                                         (2, "banana")]
Example #11
def test_from_remote_hash(local_engine_empty, pg_repo_remote):
    head = pg_repo_remote.head.image_hash
    # Test running commands that base new datasets on a remote repository.
    execute_commands(load_splitfile("from_remote.splitfile"),
                     params={"TAG": head[:10]},
                     output=OUTPUT)

    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "vegetables")
    assert OUTPUT.run_sql("SELECT * FROM join_table") == [
        (1, "apple", "potato"),
        (2, "orange", "carrot"),
    ]
Example #12
def test_splitfile_with_external_sql(readonly_pg_repo):
    # Tests run from the repository root, so we pass the path to the SQL file
    # into the splitfile manually.
    execute_commands(
        load_splitfile("external_sql.splitfile"),
        params={"EXTERNAL_SQL_FILE": RESOURCES + "external_sql.sql"},
        output=OUTPUT,
    )

    assert OUTPUT.run_sql("SELECT id, fruit, vegetable FROM join_table") == [
        (1, "apple", "potato"),
        (2, "orange", "carrot"),
    ]
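
Assuming external_sql.splitfile uses Splitgraph's SQL FILE command to pull in the parameterised file, it might look like the following (the import line is an assumption; readonly_pg_repo is otheruser/pg_mount in these fixtures):

FROM otheruser/pg_mount IMPORT fruits AS fruits, vegetables AS vegetables
SQL FILE ${EXTERNAL_SQL_FILE}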
Example #13
def test_splitfile_cached(pg_repo_local):
    # Check that no new commits/snaps are created if we rerun the same splitfile
    execute_commands(
        load_splitfile("import_local_multiple_with_queries.splitfile"),
        output=OUTPUT)
    images = OUTPUT.images()
    assert len(images) == 4

    execute_commands(
        load_splitfile("import_local_multiple_with_queries.splitfile"),
        output=OUTPUT)
    new_images = OUTPUT.images()
    assert new_images == images
Example #14
def test_from_multistage(local_engine_empty, pg_repo_remote_multitag):
    stage_2 = R("output_stage_2")

    # Produces two repositories: output and output_stage_2
    execute_commands(load_splitfile("from_remote_multistage.splitfile"),
                     params={"TAG": "v1"})

    # Check the final output ('output_stage_2'): it should only have a single fragment
    # (join_table from the first stage, OUTPUT).
    assert stage_2.run_sql("SELECT * FROM balanced_diet") == [
        (1, "apple", "potato"),
        (2, "orange", "carrot"),
    ]
    # Check the commit is based on the original empty image.
    assert stage_2.head.parent_id == "0" * 64
    assert stage_2.head.get_tables() == ["balanced_diet"]
Example #15
def test_import_all(local_engine_empty):
    execute_commands(load_splitfile("import_all_from_mounted.splitfile"),
                     output=OUTPUT)

    head = OUTPUT.head
    old_head = OUTPUT.images.by_hash(head.parent_id)

    old_head.checkout()
    tables = ["vegetables", "fruits"]
    contents = [[(1, "potato"), (2, "carrot")], [(1, "apple"), (2, "orange")]]
    for t in tables:
        assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), t)

    head.checkout()
    for t, c in zip(tables, contents):
        assert OUTPUT.run_sql("SELECT * FROM %s" % t) == c
Example #16
def test_splitfile_object_download_failure(local_engine_empty, pg_repo_remote_multitag):
    # Simulate an object download failure (happening inside the engine during IMPORT
    # execution) and check that it propagates to the caller without leaving the engine
    # in an inconsistent state.

    object_id = pg_repo_remote_multitag.images["v1"].get_table("fruits").objects[0]
    assert object_id == "o0e742bd2ea4927f5193a2c68f8d4c51ea018b1ef3e3005a50727147d2cf57b"
    tmp_object_id = "o" + "0" * 62

    pg_repo_remote_multitag.engine.run_sql(
        SQL("ALTER TABLE splitgraph_meta.{} RENAME TO {}").format(
            Identifier(object_id), Identifier(tmp_object_id)
        )
    )

    assert len(OUTPUT.images()) == 0
    assert _get_table_count(OUTPUT) == 0

    with pytest.raises(ObjectCacheError) as e:
        execute_commands(
            load_splitfile("import_remote_multiple.splitfile"), params={"TAG": "v1"}, output=OUTPUT
        )
    assert "Missing 1 object (%s)" % object_id in str(e.value)

    # Check the execution didn't create the image
    assert len(OUTPUT.images()) == 0
    assert _get_table_count(OUTPUT) == 0

    # Rename the object back and retry the Splitfile
    pg_repo_remote_multitag.engine.run_sql(
        SQL("ALTER TABLE splitgraph_meta.{} RENAME TO {}").format(
            Identifier(tmp_object_id), Identifier(object_id)
        )
    )

    execute_commands(
        load_splitfile("import_remote_multiple.splitfile"), params={"TAG": "v1"}, output=OUTPUT
    )
    OUTPUT.head.checkout()
    assert OUTPUT.run_sql("SELECT id, fruit, vegetable FROM join_table") == [
        (1, "apple", "potato"),
        (2, "orange", "carrot"),
    ]

    assert len(OUTPUT.images()) == 3
    # 2 tables in the first non-empty image, 3 tables in the second image
    # (previous 2 + joined data).
    assert _get_table_count(OUTPUT) == 5
Example #17
def test_from_local(pg_repo_local):
    execute_commands(load_splitfile("from_local.splitfile"), output=OUTPUT)

    new_head = OUTPUT.head
    # Go back to the parent: the two source tables should exist there
    OUTPUT.images.by_hash(new_head.parent_id).checkout()
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "vegetables")
    assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "join_table")

    new_head.checkout()
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "vegetables")
    assert OUTPUT.run_sql("SELECT * FROM join_table") == [
        (1, "apple", "potato"),
        (2, "orange", "carrot"),
    ]
Example #18
def test_splitfile_remote(local_engine_empty, pg_repo_remote_multitag):
    # We use the v1 tag when importing from the remote, so fruit_id = 1 still exists there.
    execute_commands(load_splitfile("import_remote_multiple.splitfile"),
                     params={"TAG": "v1"},
                     output=OUTPUT)
    assert OUTPUT.run_sql("SELECT id, fruit, vegetable FROM join_table") == [
        (1, "apple", "potato"),
        (2, "orange", "carrot"),
    ]

    # Now run the commands against v2 and make sure the fruit_id = 1 has disappeared from the output.
    execute_commands(load_splitfile("import_remote_multiple.splitfile"),
                     params={"TAG": "v2"},
                     output=OUTPUT)
    assert OUTPUT.run_sql("SELECT id, fruit, vegetable FROM join_table") == [
        (2, "orange", "carrot")
    ]
Example #19
def test_advanced_splitfile(pg_repo_local):
    execute_commands(
        load_splitfile("import_local_multiple_with_queries.splitfile"),
        output=OUTPUT)

    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "my_fruits")
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "vegetables")
    assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "join_table")

    head = OUTPUT.head
    old_head = head.parent_id
    OUTPUT.images.by_hash(old_head).checkout()
    assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "join_table")
    head.checkout()
    assert OUTPUT.run_sql("SELECT id, fruit, vegetable FROM join_table") == [
        (2, "orange", "carrot")
    ]
    assert OUTPUT.run_sql("SELECT * FROM my_fruits") == [(2, "orange")]
Example #20
def test_splitfile_schema_changes(pg_repo_local, mg_repo_local):
    execute_commands(load_splitfile("schema_changes.splitfile"), output=OUTPUT)
    old_output_head = OUTPUT.head

    # Then, alter the dataset and rerun the splitfile.
    pg_repo_local.run_sql("INSERT INTO fruits VALUES (12, 'mayonnaise')")
    pg_repo_local.commit()
    execute_commands(load_splitfile("schema_changes.splitfile"), output=OUTPUT)
    new_output_head = OUTPUT.head

    old_output_head.checkout()
    assert OUTPUT.run_sql("SELECT * FROM spirit_fruits") == [("James",
                                                              "orange", 12)]

    new_output_head.checkout()
    # Mayonnaise joined with Alex, ID 12 + 10 = 22.
    assert OUTPUT.run_sql("SELECT * FROM spirit_fruits") == [
        ("James", "orange", 12),
        ("Alex", "mayonnaise", 22),
    ]
Example #21
def test_import_with_custom_query(pg_repo_local):
    # Test that importing with a custom query creates a new object
    pg_repo_local.run_sql("INSERT INTO fruits VALUES (3, 'mayonnaise');"
                          "INSERT INTO vegetables VALUES (3, 'oregano')")
    pg_repo_local.commit()

    all_current_objects = pg_repo_local.objects.get_all_objects()

    execute_commands(load_splitfile("import_with_custom_query.splitfile"),
                     output=OUTPUT)
    head = OUTPUT.head
    old_head = OUTPUT.images.by_hash(head.parent_id)

    # The first two tables are imported as new objects since they use a custom query;
    # the other two point to the old pg_repo_local objects.
    tables = ["my_fruits", "o_vegetables", "vegetables", "all_fruits"]
    contents = [
        [(2, "orange")],
        [(1, "potato"), (3, "oregano")],
        [(1, "potato"), (2, "carrot"), (3, "oregano")],
        [(1, "apple"), (2, "orange"), (3, "mayonnaise")],
    ]

    old_head.checkout()
    engine = OUTPUT.engine
    for t in tables:
        assert not engine.table_exists(OUTPUT.to_schema(), t)

    head.checkout()
    for t, c in zip(tables, contents):
        assert sorted(OUTPUT.run_sql("SELECT * FROM %s" % t)) == sorted(c)

    for t in tables:
        objects = head.get_table(t).objects
        if t in ["my_fruits", "o_vegetables"]:
            assert all(o not in all_current_objects for o in objects)
        else:
            assert all(o in all_current_objects for o in objects)
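
import_with_custom_query.splitfile isn't shown, but the asserted contents above pin down what each import must select. A plausible sketch using Splitgraph's query-based IMPORT syntax (the WHERE clauses are assumptions chosen to reproduce those contents):

FROM test/pg_mount IMPORT {SELECT fruit_id, name FROM fruits WHERE fruit_id = 2} AS my_fruits, \
    {SELECT vegetable_id, name FROM vegetables WHERE vegetable_id != 2} AS o_vegetables, \
    vegetables AS vegetables, \
    fruits AS all_fruits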
Example #22
def test_import_updating_splitfile_with_uploading(local_engine_empty,
                                                  remote_engine,
                                                  pg_repo_remote):
    execute_commands(load_splitfile("import_and_update.splitfile"),
                     output=OUTPUT)
    head = OUTPUT.head

    # Two original tables + two updates
    assert len(OUTPUT.objects.get_all_objects()) == 4

    # Push with upload. Have to specify the remote repo.
    remote_output = Repository(OUTPUT.namespace, OUTPUT.repository,
                               remote_engine)
    OUTPUT.push(remote_output, handler="S3", handler_options={})
    # Unmount everything locally and cleanup
    OUTPUT.delete()

    # OUTPUT doesn't exist but we use its ObjectManager reference to access the global object
    # manager for the engine (maybe should inject it into local_engine/remote_engine instead)
    OUTPUT.objects.cleanup()
    assert not OUTPUT.objects.get_all_objects()

    clone(OUTPUT.to_schema(), download_all=False)

    assert not OUTPUT.objects.get_downloaded_objects()
    existing_objects = list(OUTPUT.objects.get_all_objects())
    assert len(existing_objects) == 4  # Two original tables + two updates
    # Only 2 objects are stored externally (the other two have been on the remote the whole time)
    assert len(
        OUTPUT.objects.get_external_object_locations(existing_objects)) == 2

    head.checkout()
    assert OUTPUT.run_sql("SELECT fruit_id, name FROM my_fruits") == [
        (1, "apple"),
        (2, "orange"),
        (3, "mayonnaise"),
    ]