Example #1
def test_read_or_predicates(store_factory, partition_on):
    # https://github.com/JDASoftwareGroup/kartothek/issues/295
    dataset_uuid = "test"
    df = pd.DataFrame({"A": range(10), "B": ["A", "B"] * 5, "C": range(-10, 0)})

    store_dataframes_as_dataset(
        store=store_factory,
        dataset_uuid=dataset_uuid,
        dfs=[df],
        partition_on=partition_on,
    )

    df1 = read_table(
        store=store_factory,
        dataset_uuid=dataset_uuid,
        predicates=[[("A", "<", 3)], [("A", ">", 5)], [("B", "==", "non-existent")]],
    )

    df2 = read_table(
        store=store_factory,
        dataset_uuid=dataset_uuid,
        predicates=[[("A", "<", 3)], [("A", ">", 5)]],
    )
    expected = pd.DataFrame(
        data={
            "A": [0, 1, 2, 6, 7, 8, 9],
            "B": ["A", "B", "A", "A", "B", "A", "B"],
            "C": [-10, -9, -8, -4, -3, -2, -1],
        },
    )

    pd.testing.assert_frame_equal(df1, df2)
    pd.testing.assert_frame_equal(expected, df2)
Example #2
def assert_target_ktk_readable(tgt_store, tgt_ds):
    """
    Try to read the target dataset using high level KTK functionality
    """
    df_result = read_table(store=tgt_store, dataset_uuid=tgt_ds)
    assert df_result is not None
    assert len(df_result) == 10
    df_result = read_table(
        store=tgt_store, dataset_uuid=tgt_ds, predicates=[[("bool", "==", True)]]
    )
    assert len(df_result) == 5
    df_result = read_table(
        store=tgt_store, dataset_uuid=tgt_ds, predicates=[[("bytes", "==", b"2")]]
    )
    assert len(df_result) == 1
Example #3
def test_initial_commit(store):
    dataset_uuid = "dataset_uuid"
    df = pd.DataFrame(OrderedDict([("P", [5]), ("L", [5]), ("TARGET", [5])]))
    dataset = create_empty_dataset_header(
        store=store,
        table_meta={"core": make_meta(df, origin="1")},
        dataset_uuid=dataset_uuid,
        metadata_version=4,
    )
    assert dataset.explicit_partitions is False
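    # Write a single partition and then commit it to the (still empty) dataset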
    new_data = {"data": {"core": df}}
    new_metapartition = write_single_partition(store=store,
                                               dataset_uuid=dataset.uuid,
                                               data=new_data)

    new_partition = [{
        "label": new_metapartition.label,
        "data": [("core", None)]
    }]
    updated_dataset = commit_dataset(
        store=store,
        dataset_uuid=dataset.uuid,
        new_partitions=new_partition,
        delete_scope=None,
        partition_on=None,
    )
    assert updated_dataset.explicit_partitions is True
    actual = read_table(store=store,
                        table="core",
                        dataset_uuid=updated_dataset.uuid)
    df_expected = pd.DataFrame(
        OrderedDict([("L", [5]), ("P", [5]), ("TARGET", [5])]))

    assert_frame_equal(df_expected, actual)
Example #4
def test_initial_commit(store):
    dataset_uuid = "dataset_uuid"
    df = pd.DataFrame(OrderedDict([("P", [5]), ("L", [5]), ("TARGET", [5])]))
    dataset = create_empty_dataset_header(
        store=store,
        schema=make_meta(df, origin="1"),
        dataset_uuid=dataset_uuid,
        metadata_version=4,
    )
    assert dataset.explicit_partitions is False
    new_metapartition = write_single_partition(store=store,
                                               dataset_uuid=dataset.uuid,
                                               data=df)

    updated_dataset = commit_dataset(
        store=store,
        dataset_uuid=dataset.uuid,
        # FIXME: is this breaking and if so, is it expected?
        new_partitions=[new_metapartition],
        delete_scope=None,
        partition_on=None,
    )
    assert updated_dataset.explicit_partitions is True
    actual = read_table(store=store, dataset_uuid=updated_dataset.uuid)
    df_expected = pd.DataFrame(
        OrderedDict([("L", [5]), ("P", [5]), ("TARGET", [5])]))

    assert_frame_equal(df_expected, actual)
Example #5
def test_commit_dataset_from_metapartition(dataset_function, store):
    new_data = [
        pd.DataFrame(
            OrderedDict([
                ("P", [5]),
                ("L", [5]),
                ("TARGET", [5]),
                ("DATE", [datetime.date(2016, 3, 23)]),
            ]))
    ]
    new_partition = write_single_partition(store=store,
                                           dataset_uuid=dataset_function.uuid,
                                           data=new_data)
    pre_commit_dataset = DatasetMetadata.load_from_store(
        uuid=dataset_function.uuid, store=store)
    # The raw metadata is serialized in a different order, but the datasets still compare equal
    assert pre_commit_dataset == dataset_function

    updated_dataset = commit_dataset(
        store=store,
        dataset_uuid=dataset_function.uuid,
        new_partitions=new_partition,
        delete_scope=None,
        partition_on=None,
    )
    assert updated_dataset != dataset_function

    assert updated_dataset.uuid == dataset_function.uuid
    assert len(
        updated_dataset.partitions) == len(dataset_function.partitions) + 1

    # ensure that the new dataset is actually the one on disc
    loaded_dataset = DatasetMetadata.load_from_store(uuid=updated_dataset.uuid,
                                                     store=store)
    assert loaded_dataset == updated_dataset

    # Read the data and check whether the rows above are included.
    # This checks whether all necessary informations were updated in the header
    # (e.g. files attributes of the partitions)
    actual = read_table(store=store, dataset_uuid=dataset_function.uuid)
    df_expected = pd.DataFrame(
        OrderedDict([
            (
                "DATE",
                [
                    datetime.date(2016, 3, 23),
                    datetime.date(2010, 1, 1),
                    datetime.date(2009, 12, 31),
                ],
            ),
            ("L", [5, 1, 2]),
            ("P", [5, 1, 2]),
            ("TARGET", [5, 1, 2]),
        ]))
    actual = actual.sort_values("DATE", ascending=False).reset_index(drop=True)

    assert_frame_equal(df_expected, actual)
Example #6
def _read_table(*args, **kwargs):
    kwargs.pop("dispatch_by", None)
    res = read_table(*args, **kwargs)

    if len(res):
        # Array split conserves dtypes
        return np.array_split(res, len(res))
    else:
        return [res]
Example #7
def _read_table(*args, **kwargs):
    if "tables" in kwargs:
        kwargs.pop("tables")
    res = read_table(*args, table="core", **kwargs)

    if len(res):
        # Array split conserves dtypes
        return np.array_split(res, len(res))
    else:
        return [res]
Example #8
def _read_table(*args, **kwargs):
    if "tables" in kwargs:
        param_tables = kwargs.pop("tables")
        kwargs["table"] = param_tables
    res = read_table(*args, **kwargs)

    if len(res):
        # Array split conserves dtypes
        return np.array_split(res, len(res))
    else:
        return [res]
Example #9
def test_read_table_with_columns(dataset, store_session):
    df = read_table(
        store=store_session, dataset_uuid="dataset_uuid", columns=["P", "L"],
    )

    expected_df = pd.DataFrame({"P": [1, 2], "L": [1, 2]})

    # No stability of partitions
    df = df.sort_values(by="P").reset_index(drop=True)
    expected_df = expected_df.sort_values(by="P").reset_index(drop=True)

    pdt.assert_frame_equal(df, expected_df, check_dtype=False, check_like=True)
Example #10
def test_non_default_table_name_roundtrip(store_factory,
                                          bound_store_dataframes):
    df = pd.DataFrame({"A": [1]})
    bound_store_dataframes([df],
                           store=store_factory,
                           dataset_uuid="dataset_uuid",
                           table_name="foo")
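    # Every data file (non-index parquet key) written for the dataset should carry the custom table name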
    for k in store_factory():
        if k.endswith(".parquet") and "indices" not in k:
            assert "foo" in k
    result = read_table(dataset_uuid="dataset_uuid", store=store_factory)

    pdt.assert_frame_equal(df, result)
Example #11
def test_read_table_simple_list_for_cols_cats(dataset, store_session):
    df = read_table(
        store=store_session,
        dataset_uuid="dataset_uuid",
        table=SINGLE_TABLE,
        columns=["P", "L"],
        categoricals=["P", "L"],
    )

    expected_df = pd.DataFrame({"P": [1, 2], "L": [1, 2]})

    # No stability of partitions
    df = df.sort_values(by="P").reset_index(drop=True)
    expected_df = expected_df.sort_values(by="P").reset_index(drop=True)

    expected_df = expected_df.astype("category")

    pdt.assert_frame_equal(df, expected_df, check_dtype=False, check_like=True)
Example #12
def test_update_respects_ktk_cube_dataset_ids(
    driver, function_store, ktk_cube_dataset_ids
):
    df_source, cube = _write_cube(function_store)
    df_ex = _extend_cube(cube, function_store)

    remove_conditions = C("p") == 0

    # This implicitly also tests that `data={}` behaves as expected and still deletes partitions
    # as requested via ktk_cube_dataset_ids and remove_conditions
    result = driver(
        data={},
        remove_conditions=remove_conditions,
        cube=cube,
        store=function_store,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
    )
    assert set(result) == ktk_cube_dataset_ids
    df_read = query_cube(cube, function_store)[0]

    # expected result: df_source left joined with df_ex; choosing the subset of p!=0 from each
    # that is in `ktk_cube_dataset_ids`:
    if "source" in ktk_cube_dataset_ids:
        df_source = df_source.loc[df_source["p"] != 0]
    if "ex" in ktk_cube_dataset_ids:
        df_ex = df_ex.loc[df_ex["p"] != 0]
    df_expected = df_source.merge(df_ex[["x", "a"]], how="left", on="x")
    df_expected = df_expected[sorted(df_expected.columns)]
    pd.testing.assert_frame_equal(df_read, df_expected)

    # test "ex" separately, because the test above based on the *left* merge does not tell us much about
    # "ex" in case the partitions were removed from "source"
    df_ex_read = read_table(cube.ktk_dataset_uuid("ex"), function_store)
    if "ex" in ktk_cube_dataset_ids:
        assert set(df_ex_read["p"]) == {1}
    else:
        assert set(df_ex_read["p"]) == {0, 1}
Example #13
def test_read_table_eager(dataset, store_session, use_categoricals):
    if use_categoricals:
        categories = ["P"]
    else:
        categories = None

    df = read_table(
        store=store_session, dataset_uuid="dataset_uuid", categoricals=categories,
    )
    expected_df = pd.DataFrame(
        {
            "P": [1, 2],
            "L": [1, 2],
            "TARGET": [1, 2],
            "DATE": [datetime.date(2010, 1, 1), datetime.date(2009, 12, 31)],
        }
    )
    if categories:
        expected_df = expected_df.astype({"P": "category"})

    # No stability of partitions
    df = df.sort_values(by="P").reset_index(drop=True)

    pdt.assert_frame_equal(df, expected_df, check_dtype=True, check_like=True)
Example #14
def test_read_table_eager(dataset, store_session, use_categoricals):
    if use_categoricals:
        categories = {"core": ["P"]}
    else:
        categories = None

    df = read_table(
        store=store_session,
        dataset_uuid="dataset_uuid",
        table="core",
        categoricals=categories,
    )
    expected_df = pd.DataFrame({
        "P": [1, 2],
        "L": [1, 2],
        "TARGET": [1, 2],
        "DATE":
        pd.to_datetime(
            [datetime.date(2010, 1, 1),
             datetime.date(2009, 12, 31)]),
    })
    if categories:
        expected_df = expected_df.astype({"P": "category"})

    # No stability of partitions
    df = df.sort_values(by="P").reset_index(drop=True)

    pdt.assert_frame_equal(df, expected_df, check_dtype=True, check_like=True)

    df_2 = read_table(store=store_session,
                      dataset_uuid="dataset_uuid",
                      table="helper")
    expected_df_2 = pd.DataFrame({"P": [1, 2], "info": ["a", "b"]})

    assert isinstance(df_2, pd.DataFrame)

    # No stability of partitions
    df_2 = df_2.sort_values(by="P").reset_index(drop=True)

    pdt.assert_frame_equal(df_2,
                           expected_df_2,
                           check_dtype=True,
                           check_like=True)

    df_3 = read_table(
        store=store_session,
        dataset_uuid="dataset_uuid",
        table="helper",
        predicates=[[("P", "==", 2)]],
    )
    expected_df_3 = pd.DataFrame({"P": [2], "info": ["b"]})

    assert isinstance(df_3, pd.DataFrame)
    pdt.assert_frame_equal(df_3,
                           expected_df_3,
                           check_dtype=True,
                           check_like=True)

    df_4 = read_table(
        store=store_session,
        dataset_uuid="dataset_uuid",
        table="helper",
        predicates=[[("info", "==", "a")]],
    )
    expected_df_4 = pd.DataFrame({"P": [1], "info": ["a"]})

    assert isinstance(df_4, pd.DataFrame)

    pdt.assert_frame_equal(df_4,
                           expected_df_4,
                           check_dtype=True,
                           check_like=True)
Example #15
from functools import partial
from tempfile import TemporaryDirectory
from storefact import get_store_from_url
from kartothek.io.eager import read_table


dataset_dir = TemporaryDirectory()
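# Store factory for a local filesystem store ("hfs" scheme) that already contains the kartothek dataset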
store_factory = partial(get_store_from_url, "hfs:///Users/1019021/Learn/python-python-parquet/resources")

# Read the "order_proposals" table of the dataset into a single pandas DataFrame and print it
print(read_table("order_proposals_a6e8aef43203", store_factory, table="order_proposals"))
Example #16
from functools import partial

from storefact import get_store_from_url
from kartothek.io.eager import read_table
from kartothek.io.iter import read_dataset_as_dataframes__iterator
from kartothek.io.dask.delayed import read_dataset_as_delayed
import pandas as pd
import dask

percent_encoded_sas_token = ""
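# Store factory for an Azure Blob store ("hazure" scheme); the account credentials in the URL are redacted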
store_factory = partial(
    get_store_from_url,
    f"hazure://*****:*****@ktkfiles?use_sas=true&create_if_missing=false"
)
# Approach 1
# read all at once
df = read_table(dataset_uuid="order_proposals_a6e8aef43203",
                store=store_factory,
                table="order_proposals")
# write aggregated df to disk
df.to_parquet('sample.python-parquet', engine='pyarrow')

# Approach 2
# read iteratively
df_frames_list = []
for partition_index, df_dict in enumerate(
        read_dataset_as_dataframes__iterator(
            dataset_uuid="order_proposals_a6e8aef43203", store=store_factory)):
    # print(f"Partition #{partition_index}")
    for table_name, table_df in df_dict.items():
        # print(f"Table: {table_name}. Data: \n{table_df}")
        df_frames_list.append(table_df)
# Concatenate the collected frames into a single DataFrame
# (appending to a DataFrame in a loop is an anti-pattern and DataFrame.append was removed in pandas 2.0)
df_frames = pd.concat(df_frames_list, ignore_index=True)
# write aggregated df to disk