Example #1
def test_from_remote(local_engine_empty, pg_repo_remote_multitag):
    # Test running commands that base new datasets on a remote repository.
    execute_commands(load_splitfile("from_remote.splitfile"),
                     params={"TAG": "v1"},
                     output=OUTPUT)

    new_head = OUTPUT.head
    parent = OUTPUT.images.by_hash(new_head.parent_id)
    # Go back to the parent: the two source tables should exist there
    parent.checkout()
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "vegetables")
    assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "join_table")

    new_head.checkout()
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "vegetables")
    assert OUTPUT.run_sql("SELECT * FROM join_table") == [
        (1, "apple", "potato"),
        (2, "orange", "carrot"),
    ]

    # Now run the same splitfile but against v2 of the remote (where row 1 has been
    # removed from the fruits table). First, remove the output mountpoint (otherwise
    # the executor tries to fetch the nonexistent commit 0000 from it).
    OUTPUT.delete()
    execute_commands(load_splitfile("from_remote.splitfile"),
                     params={"TAG": "v2"},
                     output=OUTPUT)

    assert OUTPUT.run_sql("SELECT * FROM join_table") == [(2, "orange",
                                                           "carrot")]
Example #2
def test_local_import_splitfile(pg_repo_local):
    execute_commands(load_splitfile("import_local.splitfile"), output=OUTPUT)
    head = OUTPUT.head
    old_head = head.parent_id

    OUTPUT.images.by_hash(old_head).checkout()
    assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "my_fruits")
    assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")

    head.checkout()
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "my_fruits")
    assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")
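
Likewise hypothetical: a minimal "import_local.splitfile" consistent with these
assertions (the source table is imported under a new name, so only "my_fruits" exists
in the output) could be a single line:

    FROM test/pg_mount IMPORT fruits AS my_fruits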
Example #3
def test_from_remote_hash(local_engine_empty, pg_repo_remote):
    head = pg_repo_remote.head.image_hash
    # Test running commands that base new datasets on a remote repository.
    execute_commands(load_splitfile("from_remote.splitfile"),
                     params={"TAG": head[:10]},
                     output=OUTPUT)

    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "vegetables")
    assert OUTPUT.run_sql("SELECT * FROM join_table") == [
        (1, "apple", "potato"),
        (2, "orange", "carrot"),
    ]
Example #4
def test_rollback_on_error(local_engine_empty):
    # For commands like commit/checkout, the library itself doesn't commit or roll back
    # transactions and expects the caller to manage them. In the CLI, we need to make
    # sure that erroneous transactions (e.g. interrupted SG commits) are rolled back
    # correctly instead of being committed.
    runner = CliRunner()

    OUTPUT.init()
    OUTPUT.run_sql(
        "CREATE TABLE test (key INTEGER PRIMARY KEY, value_1 VARCHAR, value_2 INTEGER)"
    )
    for i in range(11):
        OUTPUT.run_sql("INSERT INTO test VALUES (%s, %s, %s)",
                       (i + 1, chr(ord("a") + i), i * 2))
    head = OUTPUT.commit(chunk_size=5,
                         in_fragment_order={"test": ["key", "value_1"]})
    assert len(OUTPUT.images()) == 2
    assert len(OUTPUT.objects.get_all_objects()) == 3

    _alter_diff_splitting_dataset()
    OUTPUT.commit_engines()

    # Simulate the commit getting interrupted: the first object goes through and is
    # recorded, then a BaseException (standing in for a KeyboardInterrupt) is raised.
    called_once = False

    def interrupted_register(*args, **kwargs):
        nonlocal called_once
        if called_once:
            raise BaseException("something went wrong")
        else:
            called_once = True
            return FragmentManager._register_object(*args, **kwargs)

    with patch(
            "splitgraph.core.fragment_manager.FragmentManager._register_object",
            side_effect=interrupted_register,
    ) as ro:
        with pytest.raises(BaseException):
            runner.invoke(cli, ["commit", OUTPUT.to_schema()])

    # Check that no image/object metadata was written
    assert len(OUTPUT.images()) == 2
    assert len(OUTPUT.objects.get_all_objects()) == 3

    assert ro.call_count == 2

    # Check that the data in the audit trigger wasn't deleted
    assert len(
        OUTPUT.engine.get_pending_changes(OUTPUT.to_schema(),
                                          table="test")) == 6
Example #5
def test_import_preserves_pending_changes(pg_repo_local):
    OUTPUT.init()
    OUTPUT.run_sql("""CREATE TABLE test (id integer, name varchar);
            INSERT INTO test VALUES (1, 'test')""")
    head = OUTPUT.commit()
    OUTPUT.run_sql("INSERT INTO test VALUES (2, 'test2')")
    changes = get_engine().get_pending_changes(OUTPUT.to_schema(), "test")

    OUTPUT.import_tables(tables=["imported_fruits"],
                         source_repository=pg_repo_local,
                         source_tables=["fruits"])

    assert OUTPUT.head.parent_id == head.image_hash
    assert changes == OUTPUT.engine.get_pending_changes(
        OUTPUT.to_schema(), "test")
Example #6
def test_import_preserves_existing_tables(pg_repo_local):
    # Create a new schema and import 'fruits' from the mounted PG table.
    head = _setup_dataset()
    OUTPUT.import_tables(tables=["imported_fruits"],
                         source_repository=pg_repo_local,
                         source_tables=["fruits"])
    new_head = OUTPUT.head

    head.checkout()
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "test")
    assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(),
                                          "imported_fruits")

    new_head.checkout()
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "test")
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "imported_fruits")
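
The _setup_dataset() helper (also used in Example #10 below) isn't shown on this page.
A hypothetical reconstruction consistent with Example #10's assertions (a single
"test" table spread over two objects, i.e. two commits):

def _setup_dataset():
    OUTPUT.init()
    OUTPUT.run_sql("""CREATE TABLE test (id integer, name varchar);
        INSERT INTO test VALUES (1, 'test')""")
    OUTPUT.commit()
    OUTPUT.run_sql("INSERT INTO test VALUES (2, 'test2')")
    return OUTPUT.commit()  # base fragment + one diff fragment = two objects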
Example #7
def test_import_and_update(local_engine_empty, unprivileged_pg_repo):
    OUTPUT.init()
    head = OUTPUT.head
    remote_head = unprivileged_pg_repo.images["latest"]
    # Import the 'fruits' table from the origin.
    import_table_from_remote(unprivileged_pg_repo, ["fruits"],
                             remote_head.image_hash,
                             OUTPUT,
                             target_tables=[])
    new_head = OUTPUT.head

    OUTPUT.run_sql("INSERT INTO fruits VALUES (3, 'mayonnaise')")
    new_head_2 = OUTPUT.commit()

    head.checkout()
    assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")

    new_head.checkout()
    assert OUTPUT.run_sql("SELECT * FROM fruits") == [(1, "apple"),
                                                      (2, "orange")]

    new_head_2.checkout()
    assert OUTPUT.run_sql("SELECT * FROM fruits") == [
        (1, "apple"),
        (2, "orange"),
        (3, "mayonnaise"),
    ]
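
A hedged note on the call shape: with target_tables=[], the imported table apparently
keeps its source name (the test queries "fruits" directly afterwards). To rename on
import, one would presumably pass a matching list instead, e.g.:

import_table_from_remote(unprivileged_pg_repo, ["fruits"],
                         remote_head.image_hash,
                         OUTPUT,
                         target_tables=["my_fruits"])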
Example #8
def test_from_local(pg_repo_local):
    execute_commands(load_splitfile("from_local.splitfile"), output=OUTPUT)

    new_head = OUTPUT.head
    # Go back to the parent: the two source tables should exist there
    OUTPUT.images.by_hash(new_head.parent_id).checkout()
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "vegetables")
    assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "join_table")

    new_head.checkout()
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "vegetables")
    assert OUTPUT.run_sql("SELECT * FROM join_table") == [
        (1, "apple", "potato"),
        (2, "orange", "carrot"),
    ]
Example #9
def test_advanced_splitfile(pg_repo_local):
    execute_commands(
        load_splitfile("import_local_multiple_with_queries.splitfile"),
        output=OUTPUT)

    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "my_fruits")
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "vegetables")
    assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "fruits")
    assert OUTPUT.engine.table_exists(OUTPUT.to_schema(), "join_table")

    head = OUTPUT.head
    old_head = head.parent_id
    OUTPUT.images.by_hash(old_head).checkout()
    assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), "join_table")
    head.checkout()
    assert OUTPUT.run_sql("SELECT id, fruit, vegetable FROM join_table") == [
        (2, "orange", "carrot")
    ]
    assert OUTPUT.run_sql("SELECT * FROM my_fruits") == [(2, "orange")]
Example #10
def test_import_from_remote(local_engine_empty, unprivileged_pg_repo):
    # Start with a clean repo -- add a table to output to see if it's preserved.
    head = _setup_dataset()

    local_objects = OUTPUT.objects

    assert len(local_objects.get_downloaded_objects()) == 2
    assert len(local_objects.get_all_objects()) == 2
    assert local_engine_empty.get_all_tables(OUTPUT.to_schema()) == ["test"]

    # Import the 'fruits' table from the origin.
    remote_head = unprivileged_pg_repo.images["latest"]
    import_table_from_remote(unprivileged_pg_repo, ["fruits"],
                             remote_head.image_hash,
                             OUTPUT,
                             target_tables=[])
    new_head = OUTPUT.head

    # Check that the table now exists in the output and is committed, with no trace of
    # the cloned repo left behind. Also clean up unused objects to make sure that the
    # newly imported table is still recorded.
    assert sorted(local_engine_empty.get_all_tables(
        OUTPUT.to_schema())) == ["fruits", "test"]
    local_objects.cleanup()
    assert len(get_current_repositories(local_engine_empty)) == 1
    head.checkout()
    assert local_engine_empty.table_exists(OUTPUT.to_schema(), "test")
    assert not local_engine_empty.table_exists(OUTPUT.to_schema(), "fruits")

    new_head.checkout()
    assert local_engine_empty.table_exists(OUTPUT.to_schema(), "test")
    assert local_engine_empty.table_exists(OUTPUT.to_schema(), "fruits")

    assert OUTPUT.run_sql("SELECT * FROM fruits") == [(1, "apple"),
                                                      (2, "orange")]
Example #11
def test_import_all(local_engine_empty):
    execute_commands(load_splitfile("import_all_from_mounted.splitfile"),
                     output=OUTPUT)

    head = OUTPUT.head
    old_head = OUTPUT.images.by_hash(head.parent_id)

    old_head.checkout()
    tables = ["vegetables", "fruits"]
    contents = [[(1, "potato"), (2, "carrot")], [(1, "apple"), (2, "orange")]]
    for t in tables:
        assert not OUTPUT.engine.table_exists(OUTPUT.to_schema(), t)

    head.checkout()
    for t, c in zip(tables, contents):
        assert OUTPUT.run_sql("SELECT * FROM %s" % t) == c
Example #12
def test_import_with_custom_query(pg_repo_local):
    # Test that importing with a custom query creates a new object
    pg_repo_local.run_sql("INSERT INTO fruits VALUES (3, 'mayonnaise');"
                          "INSERT INTO vegetables VALUES (3, 'oregano')")
    pg_repo_local.commit()

    all_current_objects = pg_repo_local.objects.get_all_objects()

    execute_commands(load_splitfile("import_with_custom_query.splitfile"),
                     output=OUTPUT)
    head = OUTPUT.head
    old_head = OUTPUT.images.by_hash(head.parent_id)

    # The first two tables are imported as new objects since they were created with a
    # custom query; the other two just point to the old pg_repo_local objects.
    tables = ["my_fruits", "o_vegetables", "vegetables", "all_fruits"]
    contents = [
        [(2, "orange")],
        [(1, "potato"), (3, "oregano")],
        [(1, "potato"), (2, "carrot"), (3, "oregano")],
        [(1, "apple"), (2, "orange"), (3, "mayonnaise")],
    ]

    old_head.checkout()
    engine = OUTPUT.engine
    for t in tables:
        assert not engine.table_exists(OUTPUT.to_schema(), t)

    head.checkout()
    for t, c in zip(tables, contents):
        assert sorted(OUTPUT.run_sql("SELECT * FROM %s" % t)) == sorted(c)

    for t in tables:
        objects = head.get_table(t).objects
        if t in ["my_fruits", "o_vegetables"]:
            assert all(o not in all_current_objects for o in objects)
        else:
            assert all(o in all_current_objects for o in objects)
Example #13
def test_import_updating_splitfile_with_uploading(local_engine_empty,
                                                  remote_engine,
                                                  pg_repo_remote):
    execute_commands(load_splitfile("import_and_update.splitfile"),
                     output=OUTPUT)
    head = OUTPUT.head

    # Two original tables + two updates
    assert len(OUTPUT.objects.get_all_objects()) == 4

    # Push with upload. Have to specify the remote repo.
    remote_output = Repository(OUTPUT.namespace, OUTPUT.repository,
                               remote_engine)
    OUTPUT.push(remote_output, handler="S3", handler_options={})
    # Unmount everything locally and cleanup
    OUTPUT.delete()

    # OUTPUT doesn't exist but we use its ObjectManager reference to access the global object
    # manager for the engine (maybe should inject it into local_engine/remote_engine instead)
    OUTPUT.objects.cleanup()
    assert not OUTPUT.objects.get_all_objects()

    clone(OUTPUT.to_schema(), download_all=False)

    assert not OUTPUT.objects.get_downloaded_objects()
    existing_objects = list(OUTPUT.objects.get_all_objects())
    assert len(existing_objects) == 4  # Two original tables + two updates
    # Only 2 objects are stored externally (the other two have been on the remote the whole time)
    assert len(
        OUTPUT.objects.get_external_object_locations(existing_objects)) == 2

    head.checkout()
    assert OUTPUT.run_sql("SELECT fruit_id, name FROM my_fruits") == [
        (1, "apple"),
        (2, "orange"),
        (3, "mayonnaise"),
    ]
Example #14
def test_range_index_ordering_collation(local_engine_empty):
    # Test that range index gets min/max values of text columns using the "C" collation
    # (sort by byte values of characters) rather than anything else (e.g. in en-US
    # "a" comes before "B" even though "B" has a smaller ASCII code).

    OUTPUT.init()
    OUTPUT.run_sql(
        "CREATE TABLE test (key_1 INTEGER, key_2 VARCHAR,"
        " value_1 VARCHAR, value_2 INTEGER, PRIMARY KEY (key_1, key_2))")

    OUTPUT.engine.run_sql_batch(
        "INSERT INTO test VALUES (%s, %s, %s, %s)",
        [
            (1, "ONE", "apple", 4),
            (1, "one", "ORANGE", 3),
            (2, "two", "banana", 2),
            (2, "TWO", "CUCUMBER", 1),
        ],
        schema=OUTPUT.to_schema(),
    )

    head = OUTPUT.commit()
    object_id = head.get_table("test").objects[0]

    assert OUTPUT.objects.get_object_meta([object_id])[object_id].object_index == {
        "range": {
            "$pk": [[1, "ONE"], [2, "two"]],
            "key_1": [1, 2],
            "key_2": ["ONE", "two"],
            "value_1": ["CUCUMBER", "banana"],
            "value_2": [1, 4],
        }
    }
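
To see why the collation matters here: Python's default string comparison also orders
by code point, so it reproduces what the "C" collation does and explains the asserted
min/max pairs for the text columns.

assert sorted(["apple", "ORANGE", "banana", "CUCUMBER"]) == [
    "CUCUMBER", "ORANGE", "apple", "banana"
]
# Byte-order min/max of value_1 is therefore ("CUCUMBER", "banana"); an en-US
# collation would instead yield ("apple", "ORANGE").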
Example #15
def test_diff_fragment_hashing_long_chain(local_engine_empty):
    OUTPUT.init()
    OUTPUT.run_sql(
        "CREATE TABLE test (key TIMESTAMP PRIMARY KEY, val1 INTEGER, val2 VARCHAR, val3 NUMERIC)"
    )
    OUTPUT.run_sql(
        "INSERT INTO TEST VALUES ('2019-01-01 01:01:01.111', 1, 'one', 1.1),"
        "('2019-01-02 02:02:02.222', 2, 'two', 2.2),"
        "('2019-01-03 03:03:03.333', 3, 'three', 3.3),"
        "('2019-01-04 04:04:04.444', 4, 'four', 4.4)")
    OUTPUT.commit()
    base = OUTPUT.head.get_table("test")

    OUTPUT.run_sql("DELETE FROM test WHERE key = '2019-01-03 03:03:03.333';"
                   "INSERT INTO test VALUES ('2019-01-05', 5, 'five', 5.5)")
    OUTPUT.commit()
    v1 = OUTPUT.head.get_table("test")

    OUTPUT.run_sql(
        "UPDATE test SET val2 = 'UPDATED', val1 = 42 WHERE key = '2019-01-02 02:02:02.222'"
    )
    OUTPUT.commit()
    v2 = OUTPUT.head.get_table("test")

    OUTPUT.run_sql(
        "UPDATE test SET val2 = 'UPDATED AGAIN', val1 = 43 WHERE key = '2019-01-02 02:02:02.222'"
    )
    OUTPUT.commit()
    v3 = OUTPUT.head.get_table("test")

    om = OUTPUT.objects
    final_hash, rows_inserted = OUTPUT.objects.calculate_content_hash(
        OUTPUT.to_schema(), "test")
    assert rows_inserted == 4

    schema_hash = om._calculate_schema_hash(base.table_schema)

    # Check that the final hash can be assembled out of intermediate objects' insertion and deletion hashes

    ins_hash_base, rows_inserted = om.calculate_fragment_insertion_hash_stats(
        SPLITGRAPH_META_SCHEMA, base.objects[0])
    assert ("o" + sha256(
        (ins_hash_base.hex() +
         schema_hash).encode("ascii")).hexdigest()[:-2] == base.objects[-1])
    assert rows_inserted == 4

    ins_hash_v1, rows_inserted = om.calculate_fragment_insertion_hash_stats(
        SPLITGRAPH_META_SCHEMA, v1.objects[-1])
    assert rows_inserted == 1

    # A timestamp cast to text inside a tuple gets wrapped in double quotes by PG.
    # As long as hashing is consistent (it is, across all engines regardless of their
    # conventions), we don't really mind, but this might cause some weird issues later
    # with verifying hashes/deduplication.
    del_hash_v1 = Digest.from_hex(
        sha256('("2019-01-03 03:03:03.333",3,three,3.3)'.encode("ascii")).hexdigest()
    )
    assert (del_hash_v1.hex() ==
            "b12a93d54ba7ff1c2e26c92f01ac9c9d7716242eb47344d57c89b481227f5298")

    # Check that the object metadata contains the same hashes.
    v1_meta = om.get_object_meta(v1.objects)[v1.objects[-1]]
    assert ins_hash_v1.hex() == v1_meta.insertion_hash
    assert del_hash_v1.hex() == v1_meta.deletion_hash

    assert ("o" + sha256(
        ((ins_hash_v1 - del_hash_v1).hex() +
         schema_hash).encode("ascii")).hexdigest()[:-2] == v1.objects[-1])

    ins_hash_v2, rows_inserted = om.calculate_fragment_insertion_hash_stats(
        SPLITGRAPH_META_SCHEMA, v2.objects[-1])
    del_hash_v2 = Digest.from_hex(
        sha256('("2019-01-02 02:02:02.222",2,two,2.2)'.encode("ascii")).hexdigest()
    )
    assert (del_hash_v2.hex() ==
            "88e01be43523057d192b2fd65e69f651a9515b7e30d17a9fb852926b71e3bdff")
    assert (ins_hash_v2.hex() ==
            om.get_object_meta(v2.objects)[v2.objects[-1]].insertion_hash)
    assert ("o" + sha256(
        ((ins_hash_v2 - del_hash_v2).hex() + schema_hash).encode("ascii")
    ).hexdigest()[:-2] == v2.objects[-1])
    assert rows_inserted == 1

    v2_meta = om.get_object_meta(v2.objects)[v2.objects[-1]]
    assert ins_hash_v2.hex() == v2_meta.insertion_hash
    assert del_hash_v2.hex() == v2_meta.deletion_hash

    ins_hash_v3, rows_inserted = om.calculate_fragment_insertion_hash_stats(
        SPLITGRAPH_META_SCHEMA, v3.objects[-1])
    del_hash_v3 = Digest.from_hex(
        sha256('("2019-01-02 02:02:02.222",42,UPDATED,2.2)'.encode("ascii")).hexdigest()
    )
    assert ("o" + sha256(
        ((ins_hash_v3 - del_hash_v3).hex() + schema_hash).encode("ascii")
    ).hexdigest()[:-2] == v3.objects[-1])

    v3_meta = om.get_object_meta(v3.objects)[v3.objects[-1]]
    assert ins_hash_v3.hex() == v3_meta.insertion_hash
    assert del_hash_v3.hex() == v3_meta.deletion_hash
    assert rows_inserted == 1

    assert (ins_hash_base + ins_hash_v1 + ins_hash_v2 + ins_hash_v3 -
            del_hash_v1 - del_hash_v2 - del_hash_v3).hex() == final_hash
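
The algebra in that final assertion works because the content hashes are homomorphic:
each row hashes to a fixed-width digest, and digests are combined with modular addition
and subtraction, so insertions and deletions along the whole chain can be replayed in
any order. A minimal sketch of the idea, assuming plain 256-bit modular arithmetic
(the real Digest class may implement the arithmetic differently):

from hashlib import sha256

MOD = 2 ** 256

def row_digest(row_text: str) -> int:
    # Rows are hashed via their PostgreSQL tuple-text form, e.g.
    # '("2019-01-03 03:03:03.333",3,three,3.3)'.
    return int(sha256(row_text.encode("ascii")).hexdigest(), 16)

def table_hash(inserted, deleted):
    # Sum the digests of inserted rows, subtract those of deleted rows; the running
    # total matches the digest obtained by hashing the final table from scratch.
    total = 0
    for row in inserted:
        total = (total + row_digest(row)) % MOD
    for row in deleted:
        total = (total - row_digest(row)) % MOD
    return format(total, "064x")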