Example #1
def test_base_fragment_reused_chunking(local_engine_empty):
    # Check that if we split a table into chunks and some chunks are the same, they get assigned to the same objects.
    OUTPUT.init()
    base = OUTPUT.head
    _make_test_table(OUTPUT)
    OUTPUT.commit(chunk_size=5)
    table_1 = OUTPUT.head.get_table("test")
    # Table 1 produced 3 objects
    assert len(OUTPUT.objects.get_all_objects()) == 3

    # All chunks are the same
    base.checkout()
    _make_test_table(OUTPUT)
    OUTPUT.commit(chunk_size=5)
    table_2 = OUTPUT.head.get_table("test")
    assert len(OUTPUT.objects.get_all_objects()) == 3
    assert table_1.objects == table_2.objects

    # Modify a row in the middle chunk so that it's different. The change gets conflated into the new
    # fragment rather than being recorded as a separate update.
    base.checkout()
    _make_test_table(OUTPUT)
    OUTPUT.run_sql(
        "UPDATE test SET value_1 = 'UPDATED', value_2 = 42 WHERE key = 7")
    OUTPUT.commit(chunk_size=5)
    table_3 = OUTPUT.head.get_table("test")
    assert len(OUTPUT.objects.get_all_objects()) == 4
    # Table 3 reused the first and the last object but created a new one for the middle fragment.
    assert len(table_3.objects) == 3
    assert table_3.objects[0] == table_1.objects[0]
    assert table_3.objects[1] != table_1.objects[1]
    assert table_3.objects[2] == table_1.objects[2]
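The reuse in this test relies on fragments being content-addressed: a chunk with identical contents always maps to the same object ID, so re-committing unchanged data creates no new objects. Splitgraph's actual hashing lives in the FragmentManager; the snippet below is only a sketch of the content-addressing idea, using a hypothetical chunk_id() helper.

import hashlib

def chunk_id(rows):
    # Hypothetical content-addressed ID: identical chunks map to identical IDs.
    digest = hashlib.sha256(repr(sorted(rows)).encode()).hexdigest()
    return "o" + digest[:62]

middle_chunk = [(6, "f", 10), (7, "g", 12), (8, "h", 14), (9, "i", 16), (10, "j", 18)]

# Re-chunking identical data yields the same ID, so the object is reused...
assert chunk_id(middle_chunk) == chunk_id(list(middle_chunk))

# ...while changing a single row produces a brand new object ID.
updated_chunk = [r if r[0] != 7 else (7, "UPDATED", 42) for r in middle_chunk]
assert chunk_id(updated_chunk) != chunk_id(middle_chunk)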
Example #2
def _setup_dataset():
    OUTPUT.init()
    OUTPUT.run_sql("""CREATE TABLE test (id integer, name varchar);
        INSERT INTO test VALUES (1, 'test')""")
    OUTPUT.commit()
    OUTPUT.run_sql("INSERT INTO test VALUES (2, 'test2')")
    return OUTPUT.commit()
Example #3
def test_from_remote(local_engine_empty, pg_repo_remote_multitag):
    # Test running commands that base new datasets on a remote repository.
    execute_commands(load_splitfile("from_remote.splitfile"),
                     params={"TAG": "v1"},
                     output=OUTPUT)

    new_head = OUTPUT.head
    parent = OUTPUT.images.by_hash(new_head.parent_id)
    # Go back to the parent: the two source tables should exist there
    parent.checkout()
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "vegetables")
    assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "join_table")

    new_head.checkout()
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "vegetables")
    assert OUTPUT.run_sql("SELECT * FROM join_table") == [
        (1, "apple", "potato"),
        (2, "orange", "carrot"),
    ]

    # Now run the same splitfile against v2 of the remote (where row 1 has been removed from the fruits table).
    # First, delete the output repository (otherwise the executor tries to fetch the nonexistent commit 0000
    # from it).
    OUTPUT.delete()
    execute_commands(load_splitfile("from_remote.splitfile"),
                     params={"TAG": "v2"},
                     output=OUTPUT)

    assert OUTPUT.run_sql("SELECT * FROM join_table") == [(2, "orange",
                                                           "carrot")]
Example #4
def test_import_bare(pg_repo_local):
    # Check import without checking anything out, just by manipulating metadata and running layered
    # queries (LQs) against source images.

    # Create a new schema and import 'fruits'
    OUTPUT.init()
    # Make sure the existing table is preserved.
    OUTPUT.run_sql("CREATE TABLE sentinel (key INTEGER)")
    OUTPUT.commit()
    pg_repo_local.uncheckout()
    OUTPUT.uncheckout()

    OUTPUT.import_tables(
        tables=["imported_fruits"],
        source_repository=pg_repo_local,
        image_hash=pg_repo_local.images["latest"].image_hash,
        source_tables=["SELECT * FROM fruits WHERE fruit_id = 1"],
        parent_hash=OUTPUT.images["latest"].image_hash,
        do_checkout=False,
        table_queries=[True],
    )

    assert OUTPUT.head is None
    assert pg_repo_local.head is None

    assert sorted(OUTPUT.images["latest"].get_tables()) == [
        "imported_fruits", "sentinel"
    ]
    assert list(OUTPUT.images["latest"].get_table("imported_fruits").query(
        columns=["name"], quals=[])) == [{
            "name": "apple"
        }]
Example #5
def test_import_and_update(local_engine_empty, unprivileged_pg_repo):
    OUTPUT.init()
    head = OUTPUT.head
    remote_head = unprivileged_pg_repo.images["latest"]
    # Import the 'fruits' table from the origin.
    import_table_from_remote(unprivileged_pg_repo, ["fruits"],
                             remote_head.image_hash,
                             OUTPUT,
                             target_tables=[])
    new_head = OUTPUT.head

    OUTPUT.run_sql("INSERT INTO fruits VALUES (3, 'mayonnaise')")
    new_head_2 = OUTPUT.commit()

    head.checkout()
    assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")

    new_head.checkout()
    assert OUTPUT.run_sql("SELECT * FROM fruits") == [(1, "apple"),
                                                      (2, "orange")]

    new_head_2.checkout()
    assert OUTPUT.run_sql("SELECT * FROM fruits") == [
        (1, "apple"),
        (2, "orange"),
        (3, "mayonnaise"),
    ]
Example #6
def test_bloom_index_post_factum(local_engine_empty):
    OUTPUT.init()
    OUTPUT.run_sql(
        "CREATE TABLE test (key INTEGER PRIMARY KEY, value_1 TIMESTAMP)")
    for i in range(50):
        OUTPUT.run_sql("INSERT INTO test VALUES (%s, %s)",
                       (i + 1, dt(2015, 1, 1) + timedelta(days=i)))
    head = OUTPUT.commit()

    # Reindex the table after committing it and check that object metadata gets overwritten.
    head.get_table("test").reindex(
        extra_indexes={"bloom": {
            "value_1": {
                "probability": 0.01
            }
        }})

    obj = head.get_table("test").objects[0]
    object_index = OUTPUT.objects.get_object_meta([obj])[obj].object_index
    assert "bloom" in object_index

    # Run a snippet from the previous test to check the bloom metadata is valid.
    objects = head.get_table("test").objects
    for i in range(0, 50, 5):
        assert (filter_bloom_index(OUTPUT.engine, objects, [[
            ("value_1", "=", dt(2015, 1, 1) + timedelta(days=i))
        ]]) == objects)
Example #7
def test_bloom_index_datetime(local_engine_empty):
    OUTPUT.init()
    OUTPUT.run_sql(
        "CREATE TABLE test (key INTEGER PRIMARY KEY, value_1 TIMESTAMP)")
    for i in range(50):
        OUTPUT.run_sql("INSERT INTO test VALUES (%s, %s)",
                       (i + 1, dt(2015, 1, 1) + timedelta(days=i)))
    head = OUTPUT.commit(
        extra_indexes={"test": {
            "bloom": {
                "value_1": {
                    "probability": 0.01
                }
            }
        }})
    objects = head.get_table("test").objects

    # Datetimes are supported in the bloom index if they're passed in as actual datetime
    # objects (which Multicorn does).

    # Spot check some dates that exist...
    for i in range(0, 50, 5):
        assert (filter_bloom_index(OUTPUT.engine, objects, [[
            ("value_1", "=", dt(2015, 1, 1) + timedelta(days=i))
        ]]) == objects)

    # ...and some that don't.
    assert (filter_bloom_index(
        OUTPUT.engine, objects,
        [[("value_1", "=", dt(2015, 1, 1) + timedelta(days=55))]]) == [])

    # They also work if passed in as ISO strings with space as a separator.
    assert (filter_bloom_index(
        OUTPUT.engine, objects,
        [[("value_1", "=", "2015-01-01 00:00:00")]]) == objects)
Example #8
def test_update_without_import_splitfile(pg_repo_local):
    # Test that correct commits are produced by executing a splitfile (both against newly created and
    # already existing tables on an existing mountpoint).
    execute_commands(load_splitfile("update_without_import.splitfile"),
                     output=OUTPUT)
    log = OUTPUT.head.get_log()

    log[1].checkout()
    assert OUTPUT.run_sql("SELECT * FROM my_fruits") == []

    log[0].checkout()
    assert OUTPUT.run_sql("SELECT * FROM my_fruits") == [(1, "pineapple")]
Example #9
def test_basic_splitfile(pg_repo_local):
    execute_commands(load_splitfile("create_table.splitfile"), output=OUTPUT)
    log = list(reversed(OUTPUT.head.get_log()))

    log[1].checkout()
    assert OUTPUT.run_sql("SELECT * FROM my_fruits") == []

    log[2].checkout()
    assert OUTPUT.run_sql("SELECT * FROM my_fruits") == [(1, "pineapple")]

    log[3].checkout()
    assert OUTPUT.run_sql("SELECT * FROM my_fruits") == [(1, "pineapple"),
                                                         (2, "banana")]
Example #10
def test_rollback_on_error(local_engine_empty):
    # For commands like commit/checkout, we don't commit/roll back in the library
    # itself and expect the caller to manage transactions. In the CLI, we need to
    # make sure that erroneous transactions (e.g. interrupted SG commits)
    # are rolled back correctly instead of being committed.
    runner = CliRunner()

    OUTPUT.init()
    OUTPUT.run_sql(
        "CREATE TABLE test (key INTEGER PRIMARY KEY, value_1 VARCHAR, value_2 INTEGER)"
    )
    for i in range(11):
        OUTPUT.run_sql("INSERT INTO test VALUES (%s, %s, %s)",
                       (i + 1, chr(ord("a") + i), i * 2))
    head = OUTPUT.commit(chunk_size=5,
                         in_fragment_order={"test": ["key", "value_1"]})
    assert len(OUTPUT.images()) == 2
    assert len(OUTPUT.objects.get_all_objects()) == 3

    _alter_diff_splitting_dataset()
    OUTPUT.commit_engines()

    # Simulate the commit getting interrupted: the first object goes through and gets
    # recorded, then a BaseException (standing in for e.g. a KeyboardInterrupt) is raised.
    called_once = False

    def interrupted_register(*args, **kwargs):
        nonlocal called_once
        if called_once:
            raise BaseException("something went wrong")
        else:
            called_once = True
            return FragmentManager._register_object(*args, **kwargs)

    with patch(
            "splitgraph.core.fragment_manager.FragmentManager._register_object",
            side_effect=interrupted_register,
    ) as ro:
        with pytest.raises(BaseException):
            runner.invoke(cli, ["commit", OUTPUT.to_schema()])

    # Check that no image/object metadata was written
    assert len(OUTPUT.images()) == 2
    assert len(OUTPUT.objects.get_all_objects()) == 3

    assert ro.call_count == 2

    # Check that the data in the audit trigger wasn't deleted
    assert len(
        OUTPUT.engine.get_pending_changes(OUTPUT.to_schema(),
                                          table="test")) == 6
Example #11
def test_import_preserves_pending_changes(pg_repo_local):
    OUTPUT.init()
    OUTPUT.run_sql("""CREATE TABLE test (id integer, name varchar);
            INSERT INTO test VALUES (1, 'test')""")
    head = OUTPUT.commit()
    OUTPUT.run_sql("INSERT INTO test VALUES (2, 'test2')")
    changes = get_engine().get_pending_changes(OUTPUT.to_schema(), "test")

    OUTPUT.import_tables(tables=["imported_fruits"],
                         source_repository=pg_repo_local,
                         source_tables=["fruits"])

    assert OUTPUT.head.parent_id == head.image_hash
    assert changes == OUTPUT.engine.get_pending_changes(
        OUTPUT.to_schema(), "test")
Example #12
def test_import_from_remote(local_engine_empty, unprivileged_pg_repo):
    # Start with a clean repo -- add a table to output to see if it's preserved.
    head = _setup_dataset()

    local_objects = OUTPUT.objects

    assert len(local_objects.get_downloaded_objects()) == 2
    assert len(local_objects.get_all_objects()) == 2
    assert local_engine_empty.get_all_tables(OUTPUT.to_schema()) == ["test"]

    # Import the 'fruits' table from the origin.
    remote_head = unprivileged_pg_repo.images["latest"]
    import_table_from_remote(unprivileged_pg_repo, ["fruits"],
                             remote_head.image_hash,
                             OUTPUT,
                             target_tables=[])
    new_head = OUTPUT.head

    # Check that the table now exists in the output, is committed and there's no trace of the cloned repo.
    # Also clean up the unused objects to make sure that the newly cloned table is still recorded.
    assert sorted(local_engine_empty.get_all_tables(
        OUTPUT.to_schema())) == ["fruits", "test"]
    local_objects.cleanup()
    assert len(get_current_repositories(local_engine_empty)) == 1
    head.checkout()
    assert local_engine_empty.table_exists(OUTPUT.to_schema(), "test")
    assert not local_engine_empty.table_exists(OUTPUT.to_schema(), "fruits")

    new_head.checkout()
    assert local_engine_empty.table_exists(OUTPUT.to_schema(), "test")
    assert local_engine_empty.table_exists(OUTPUT.to_schema(), "fruits")

    assert OUTPUT.run_sql("SELECT * FROM fruits") == [(1, "apple"),
                                                      (2, "orange")]
Example #13
def test_splitfile_remote(local_engine_empty, pg_repo_remote_multitag):
    # We use the v1 tag when importing from the remote, so fruit_id = 1 still exists there.
    execute_commands(load_splitfile("import_remote_multiple.splitfile"),
                     params={"TAG": "v1"},
                     output=OUTPUT)
    assert OUTPUT.run_sql("SELECT id, fruit, vegetable FROM join_table") == [
        (1, "apple", "potato"),
        (2, "orange", "carrot"),
    ]

    # Now run the commands against v2 and make sure the fruit_id = 1 has disappeared from the output.
    execute_commands(load_splitfile("import_remote_multiple.splitfile"),
                     params={"TAG": "v2"},
                     output=OUTPUT)
    assert OUTPUT.run_sql("SELECT id, fruit, vegetable FROM join_table") == [
        (2, "orange", "carrot")
    ]
Example #14
def test_splitfile_inline_sql(readonly_pg_repo, pg_repo_local):
    # Test SQL commands accessing repos directly -- join a remote repo with
    # some local data.

    prepare_lq_repo(pg_repo_local, commit_after_every=False, include_pk=True)
    pg_repo_local.head.tag("v2")

    execute_commands(
        load_splitfile("inline_sql.splitfile"),
        output=OUTPUT,
    )

    new_head = OUTPUT.head
    new_head.checkout()
    assert new_head.get_tables() == ["balanced_diet"]
    assert OUTPUT.run_sql("SELECT * FROM balanced_diet") == [
        (1, "apple", None, "potato"),
        (2, "orange", datetime.datetime(2019, 1, 1, 12, 0), "carrot"),
    ]

    local_repo_head = pg_repo_local.head.image_hash
    other_repo_head = readonly_pg_repo.images["latest"].image_hash

    assert new_head.provenance_data == [
        {
            "sources": [
                {
                    "source": "pg_mount",
                    "source_hash": other_repo_head,
                    "source_namespace": "otheruser",
                },
                {
                    "source": "pg_mount",
                    "source_hash": local_repo_head,
                    "source_namespace": "test"
                },
            ],
            "sql": ("CREATE TABLE balanced_diet\n"
                    "  AS SELECT fruits.fruit_id AS id\n"
                    "          , fruits.name AS fruit\n"
                    "          , my_fruits.timestamp AS timestamp\n"
                    "          , vegetables.name AS vegetable\n"
                    "     FROM "
                    '"otheruser/pg_mount:{0}".fruits '
                    "AS fruits\n"
                    "          INNER JOIN "
                    '"otheruser/pg_mount:{0}".vegetables '
                    "AS vegetables ON fruits.fruit_id = vegetable_id\n"
                    "          LEFT JOIN "
                    '"test/pg_mount:{1}".fruits '
                    "AS my_fruits ON my_fruits.fruit_id = fruits.fruit_id;\n"
                    "\n"
                    "ALTER TABLE balanced_diet ADD PRIMARY KEY (id)").format(
                        other_repo_head, local_repo_head),
            "type":
            "SQL",
        },
    ]
Example #15
def test_layered_querying_json_arrays(local_engine_empty):
    OUTPUT.init()
    OUTPUT.run_sql(
        "CREATE TABLE test (key INTEGER PRIMARY KEY, "
        "value JSONB, arr_value INTEGER[], arr_2d_value TEXT[][])"
    )
    OUTPUT.run_sql(
        "INSERT INTO test VALUES (1, %s, %s, %s)",
        (json.dumps({"a": 1, "b": 2.5}), [1, 2, 3], [["one", "two"], ["three", "four"]]),
    )
    OUTPUT.commit()
    OUTPUT.run_sql(
        "INSERT INTO test VALUES (2, %s, %s, %s)",
        (
            json.dumps({"a": "one", "b": "two point five"}),
            [4, 5, 6],
            [["five", "six"], ["seven", "eight"]],
        ),
    )
    head = OUTPUT.commit()
    OUTPUT.uncheckout()
    head.checkout(layered=True)

    assert OUTPUT.run_sql("SELECT * FROM test ORDER BY key") == [
        (1, {"a": 1, "b": 2.5}, [1, 2, 3], [["one", "two"], ["three", "four"]]),
        (2, {"a": "one", "b": "two point five"}, [4, 5, 6], [["five", "six"], ["seven", "eight"]]),
    ]
Example #16
def test_advanced_splitfile(pg_repo_local):
    execute_commands(
        load_splitfile("import_local_multiple_with_queries.splitfile"),
        output=OUTPUT)

    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "my_fruits")
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "vegetables")
    assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "join_table")

    head = OUTPUT.head
    old_head = head.parent_id
    OUTPUT.images.by_hash(old_head).checkout()
    assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "join_table")
    head.checkout()
    assert OUTPUT.run_sql("SELECT id, fruit, vegetable FROM join_table") == [
        (2, "orange", "carrot")
    ]
    assert OUTPUT.run_sql("SELECT * FROM my_fruits") == [(2, "orange")]
Example #17
def test_splitfile_remote_hash(local_engine_empty, pg_repo_remote):
    head = pg_repo_remote.head.image_hash
    execute_commands(load_splitfile("import_remote_multiple.splitfile"),
                     params={"TAG": head[:10]},
                     output=OUTPUT)
    assert OUTPUT.run_sql(
        "SELECT id, fruit, vegetable FROM output.join_table") == [
            (1, "apple", "potato"),
            (2, "orange", "carrot"),
        ]
Example #18
def test_splitfile(local_engine_empty, pg_repo_remote):
    runner = CliRunner()

    result = runner.invoke(
        build_c,
        [
            RESOURCES + "import_remote_multiple.splitfile",
            "-a",
            "TAG",
            "latest",
            "-o",
            "output",
        ],
    )
    assert result.exit_code == 0
    assert OUTPUT.run_sql("SELECT id, fruit, vegetable FROM join_table") == [
        (1, "apple", "potato"),
        (2, "orange", "carrot"),
    ]

    # Test the sgr provenance command. First, just list the dependencies of the new image.
    result = runner.invoke(provenance_c, ["output:latest"])
    assert "test/pg_mount:%s" % pg_repo_remote.images[
        "latest"].image_hash in result.output

    # Second, output the full splitfile (-f)
    result = runner.invoke(provenance_c, ["output:latest", "-f"])
    assert ("FROM test/pg_mount:%s IMPORT" %
            pg_repo_remote.images["latest"].image_hash in result.output)
    assert "SQL {CREATE TABLE join_table" in result.output

    # Test reverse dependencies
    # We're looking for test/pg_mount on the local engine, where it doesn't exist -- this should fail.
    result = runner.invoke(
        dependents_c,
        [
            "test/pg_mount:%s" % pg_repo_remote.images["latest"].image_hash,
        ],
    )
    assert result.exit_code == 1

    # Now look at test/pg_mount on the remote and look for dependents on the local engine.
    result = runner.invoke(
        dependents_c,
        [
            "test/pg_mount:%s" % pg_repo_remote.images["latest"].image_hash,
            "--dependents-on",
            "LOCAL",
            "--source-on",
            pg_repo_remote.engine.name,
        ],
    )
    assert result.exit_code == 0
    assert "is depended on by" in result.output
    assert "%s:%s" % (OUTPUT, OUTPUT.head.image_hash) in result.output
Example #19
def test_splitfile_schema_changes(pg_repo_local, mg_repo_local):
    execute_commands(load_splitfile("schema_changes.splitfile"), output=OUTPUT)
    old_output_head = OUTPUT.head

    # Then, alter the dataset and rerun the splitfile.
    pg_repo_local.run_sql("INSERT INTO fruits VALUES (12, 'mayonnaise')")
    pg_repo_local.commit()
    execute_commands(load_splitfile("schema_changes.splitfile"), output=OUTPUT)
    new_output_head = OUTPUT.head

    old_output_head.checkout()
    assert OUTPUT.run_sql("SELECT * FROM spirit_fruits") == [("James",
                                                              "orange", 12)]

    new_output_head.checkout()
    # Mayonnaise joined with Alex, ID 12 + 10 = 22.
    assert OUTPUT.run_sql("SELECT * FROM spirit_fruits") == [
        ("James", "orange", 12),
        ("Alex", "mayonnaise", 22),
    ]
Example #20
def test_splitfile_with_external_sql(readonly_pg_repo):

    # Tests run from the repository root, so we pass the path to the SQL file into the splitfile manually.
    execute_commands(
        load_splitfile("external_sql.splitfile"),
        params={"EXTERNAL_SQL_FILE": RESOURCES + "external_sql.sql"},
        output=OUTPUT,
    )

    assert OUTPUT.run_sql("SELECT id, fruit, vegetable FROM join_table") == [
        (1, "apple", "potato"),
        (2, "orange", "carrot"),
    ]
Example #21
def test_from_remote_hash(local_engine_empty, pg_repo_remote):
    head = pg_repo_remote.head.image_hash
    # Test running commands that base new datasets on a remote repository.
    execute_commands(load_splitfile("from_remote.splitfile"),
                     params={"TAG": head[:10]},
                     output=OUTPUT)

    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "vegetables")
    assert OUTPUT.run_sql("SELECT * FROM join_table") == [
        (1, "apple", "potato"),
        (2, "orange", "carrot"),
    ]
Example #22
def test_import_basic(pg_repo_local):
    # Create a new schema and import 'fruits' from the mounted PG table.
    OUTPUT.init()
    head = OUTPUT.head

    OUTPUT.import_tables(tables=["imported_fruits"],
                         source_repository=pg_repo_local,
                         source_tables=["fruits"])

    assert OUTPUT.run_sql("SELECT * FROM imported_fruits"
                          ) == pg_repo_local.run_sql("SELECT * FROM fruits")
    new_head = OUTPUT.head

    assert new_head != head
    assert new_head.parent_id == head.image_hash
Example #23
def test_import_multiple_tables(pg_repo_local):
    OUTPUT.init()
    head = OUTPUT.head
    OUTPUT.import_tables(tables=[],
                         source_repository=pg_repo_local,
                         source_tables=[])

    for table_name in ["fruits", "vegetables"]:
        assert OUTPUT.run_sql("SELECT * FROM %s" %
                              table_name) == pg_repo_local.run_sql(
                                  "SELECT * FROM %s" % table_name)

    new_head = OUTPUT.head
    assert new_head != head
    assert new_head.parent_id == head.image_hash
Example #24
def test_range_index_ordering_collation(local_engine_empty):
    # Test that the range index computes min/max values of text columns using the "C" collation
    # (sorting by the byte values of characters) rather than a locale-aware collation (e.g. in en-US
    # "a" comes before "B" even though "B" has a smaller ASCII code).

    OUTPUT.init()
    OUTPUT.run_sql(
        "CREATE TABLE test (key_1 INTEGER, key_2 VARCHAR,"
        " value_1 VARCHAR, value_2 INTEGER, PRIMARY KEY (key_1, key_2))")

    OUTPUT.engine.run_sql_batch(
        "INSERT INTO test VALUES (%s, %s, %s, %s)",
        [
            (1, "ONE", "apple", 4),
            (1, "one", "ORANGE", 3),
            (2, "two", "banana", 2),
            (2, "TWO", "CUCUMBER", 1),
        ],
        schema=OUTPUT.to_schema(),
    )

    head = OUTPUT.commit()
    object_id = head.get_table("test").objects[0]

    assert OUTPUT.objects.get_object_meta([object_id
                                           ])[object_id].object_index == {
                                               "range": {
                                                   "$pk": [[1, "ONE"],
                                                           [2, "two"]],
                                                   "key_1": [1, 2],
                                                   "key_2": ["ONE", "two"],
                                                   "value_1":
                                                   ["CUCUMBER", "banana"],
                                                   "value_2": [1, 4],
                                               }
                                           }
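The min/max pairs above follow byte order, not locale order. Plain Python string comparison is by code point, which for ASCII matches the "C" collation, so the difference is easy to see (the locale sort below is only an illustration and assumes en_US.UTF-8 is installed):

import locale

values = ["apple", "ORANGE", "banana", "CUCUMBER"]

# "C" collation / code-point order: all uppercase letters sort before lowercase ones,
# which is why the value_1 range above comes out as ["CUCUMBER", "banana"].
assert sorted(values) == ["CUCUMBER", "ORANGE", "apple", "banana"]
assert (min(values), max(values)) == ("CUCUMBER", "banana")

# A locale-aware (en-US) sort interleaves cases instead:
locale.setlocale(locale.LC_COLLATE, "en_US.UTF-8")  # assumes this locale is available
print(sorted(values, key=locale.strxfrm))  # typically ['apple', 'banana', 'CUCUMBER', 'ORANGE']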
Example #25
def test_import_all(local_engine_empty):
    execute_commands(load_splitfile("import_all_from_mounted.splitfile"),
                     output=OUTPUT)

    head = OUTPUT.head
    old_head = OUTPUT.images.by_hash(head.parent_id)

    old_head.checkout()
    tables = ["vegetables", "fruits"]
    contents = [[(1, "potato"), (2, "carrot")], [(1, "apple"), (2, "orange")]]
    for t in tables:
        assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), t)

    head.checkout()
    for t, c in zip(tables, contents):
        assert OUTPUT.run_sql("SELECT * FROM %s" % t) == c
Example #26
def test_splitfile_object_download_failure(local_engine_empty, pg_repo_remote_multitag):
    # Simulate an object download failure (happening inside the engine during IMPORT execution)
    # and check that it propagates to the caller without leaving the engine in an inconsistent state.

    object_id = pg_repo_remote_multitag.images["v1"].get_table("fruits").objects[0]
    assert object_id == "o0e742bd2ea4927f5193a2c68f8d4c51ea018b1ef3e3005a50727147d2cf57b"
    tmp_object_id = "o" + "0" * 62

    pg_repo_remote_multitag.engine.run_sql(
        SQL("ALTER TABLE splitgraph_meta.{} RENAME TO {}").format(
            Identifier(object_id), Identifier(tmp_object_id)
        )
    )

    assert len(OUTPUT.images()) == 0
    assert _get_table_count(OUTPUT) == 0

    with pytest.raises(ObjectCacheError) as e:
        execute_commands(
            load_splitfile("import_remote_multiple.splitfile"), params={"TAG": "v1"}, output=OUTPUT
        )
    assert "Missing 1 object (%s)" % object_id in str(e.value)

    # Check the execution didn't create the image
    assert len(OUTPUT.images()) == 0
    assert _get_table_count(OUTPUT) == 0

    # Rename the object back and retry the Splitfile
    pg_repo_remote_multitag.engine.run_sql(
        SQL("ALTER TABLE splitgraph_meta.{} RENAME TO {}").format(
            Identifier(tmp_object_id), Identifier(object_id)
        )
    )

    execute_commands(
        load_splitfile("import_remote_multiple.splitfile"), params={"TAG": "v1"}, output=OUTPUT
    )
    OUTPUT.head.checkout()
    assert OUTPUT.run_sql("SELECT id, fruit, vegetable FROM join_table") == [
        (1, "apple", "potato"),
        (2, "orange", "carrot"),
    ]

    assert len(OUTPUT.images()) == 3
    # 2 tables in the first non-empty image, 3 tables in the second image
    # (previous 2 + joined data).
    assert _get_table_count(OUTPUT) == 5
Example #27
def test_from_local(pg_repo_local):
    execute_commands(load_splitfile("from_local.splitfile"), output=OUTPUT)

    new_head = OUTPUT.head
    # Go back to the parent: the two source tables should exist there
    OUTPUT.images.by_hash(new_head.parent_id).checkout()
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "vegetables")
    assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "join_table")

    new_head.checkout()
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "vegetables")
    assert OUTPUT.run_sql("SELECT * FROM join_table") == [
        (1, "apple", "potato"),
        (2, "orange", "carrot"),
    ]
Example #28
def test_import_with_custom_query(pg_repo_local):
    # Test that importing with a custom query creates a new object
    pg_repo_local.run_sql("INSERT INTO fruits VALUES (3, 'mayonnaise');"
                          "INSERT INTO vegetables VALUES (3, 'oregano')")
    pg_repo_local.commit()

    all_current_objects = pg_repo_local.objects.get_all_objects()

    execute_commands(load_splitfile("import_with_custom_query.splitfile"),
                     output=OUTPUT)
    head = OUTPUT.head
    old_head = OUTPUT.images.by_hash(head.parent_id)

    # The first two tables are imported as new objects since they were created with a custom query;
    # the other two point to the existing pg_repo_local objects.
    tables = ["my_fruits", "o_vegetables", "vegetables", "all_fruits"]
    contents = [
        [(2, "orange")],
        [(1, "potato"), (3, "oregano")],
        [(1, "potato"), (2, "carrot"), (3, "oregano")],
        [(1, "apple"), (2, "orange"), (3, "mayonnaise")],
    ]

    old_head.checkout()
    engine = OUTPUT.engine
    for t in tables:
        assert not engine.table_exists(OUTPUT.to_schema(), t)

    head.checkout()
    for t, c in zip(tables, contents):
        assert sorted(OUTPUT.run_sql("SELECT * FROM %s" % t)) == sorted(c)

    for t in tables:
        objects = head.get_table(t).objects
        if t in ["my_fruits", "o_vegetables"]:
            assert all(o not in all_current_objects for o in objects)
        else:
            assert all(o in all_current_objects for o in objects)
Example #29
def test_import_updating_splitfile_with_uploading(local_engine_empty,
                                                  remote_engine,
                                                  pg_repo_remote):
    execute_commands(load_splitfile("import_and_update.splitfile"),
                     output=OUTPUT)
    head = OUTPUT.head

    assert len(OUTPUT.objects.get_all_objects()
               ) == 4  # Two original tables + two updates

    # Push with upload. Have to specify the remote repo.
    remote_output = Repository(OUTPUT.namespace, OUTPUT.repository,
                               remote_engine)
    OUTPUT.push(remote_output, handler="S3", handler_options={})
    # Unmount everything locally and cleanup
    OUTPUT.delete()

    # OUTPUT doesn't exist but we use its ObjectManager reference to access the global object
    # manager for the engine (maybe should inject it into local_engine/remote_engine instead)
    OUTPUT.objects.cleanup()
    assert not OUTPUT.objects.get_all_objects()

    clone(OUTPUT.to_schema(), download_all=False)

    assert not OUTPUT.objects.get_downloaded_objects()
    existing_objects = list(OUTPUT.objects.get_all_objects())
    assert len(existing_objects) == 4  # Two original tables + two updates
    # Only 2 objects are stored externally (the other two have been on the remote the whole time)
    assert len(
        OUTPUT.objects.get_external_object_locations(existing_objects)) == 2

    head.checkout()
    assert OUTPUT.run_sql("SELECT fruit_id, name FROM my_fruits") == [
        (1, "apple"),
        (2, "orange"),
        (3, "mayonnaise"),
    ]
Example #30
def test_bloom_reindex_changed_table(local_engine_empty):
    OUTPUT.init()
    OUTPUT.run_sql(
        "CREATE TABLE test (key INTEGER PRIMARY KEY, value_1 VARCHAR, value_2 INTEGER)"
    )
    for i in range(26):
        OUTPUT.run_sql("INSERT INTO test VALUES (%s, %s, %s)",
                       (i + 1, chr(ord("a") + i), i * 2))
    OUTPUT.commit(chunk_size=13)
    OUTPUT.run_sql("DELETE FROM test WHERE key = 5")  # ('e', 8)

    head = OUTPUT.commit()
    objects = head.get_table("test").objects
    assert len(objects) == 3  # original 2 fragments and one overwrite

    index_spec = {
        "bloom": {
            "value_1": {
                "probability": 0.01
            },
            "value_2": {
                "probability": 0.01
            }
        }
    }
    # Since the patch object deletes the old value, we don't immediately know what it was and so
    # can't reindex that object.
    with pytest.raises(ObjectIndexingError) as e:
        head.get_table("test").reindex(extra_indexes=index_spec)
    assert "1 object" in str(e.value)

    reindexed = head.get_table("test").reindex(extra_indexes=index_spec,
                                               raise_on_patch_objects=False)
    assert objects[0] in reindexed
    assert objects[1] in reindexed
    assert objects[2] not in reindexed