Example #1
def test_additional_files(driver, function_store):
    df_seed = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v1": [10, 11, 12, 13]
    })
    cube = Cube(dimension_columns=["x"],
                partition_columns=["p"],
                uuid_prefix="cube")
    build_cube(data=df_seed, cube=cube, store=function_store)

    key_in_ds = cube.ktk_dataset_uuid(cube.seed_dataset) + "/foo"
    key_with_ds_prefix = cube.ktk_dataset_uuid(cube.seed_dataset) + ".foo"
    key_with_cube_prefix = cube.uuid_prefix + ".foo"
    key_with_cube_prefix_separator = cube.uuid_prefix + KTK_CUBE_UUID_SEPARATOR + ".foo"

    function_store().put(key_in_ds, b"")
    function_store().put(key_with_ds_prefix, b"")
    function_store().put(key_with_cube_prefix, b"")
    function_store().put(key_with_cube_prefix_separator, b"")

    driver(cube=cube, store=function_store)
    assert key_in_ds not in set(function_store().keys())
    assert key_with_ds_prefix not in set(function_store().keys())
    assert key_with_cube_prefix in set(function_store().keys())
    assert key_with_cube_prefix_separator not in set(function_store().keys())
Example #2
def test_split(driver, function_store):
    """
    Imagine the user has already split the data.
    """
    df_source1 = pd.DataFrame({"x": [0, 1], "p": [0, 0], "v1": [10, 11]})
    df_source2 = pd.DataFrame({"x": [2, 3], "p": [1, 1], "v1": [12, 13]})
    df_enrich = pd.DataFrame({"x": [0, 1], "p": [0, 0], "v2": [20, 21]})
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    result = driver(
        data=[{"source": df_source1, "enrich": df_enrich}, df_source2],
        cube=cube,
        store=function_store,
    )

    assert set(result.keys()) == {cube.seed_dataset, "enrich"}

    ds_source = result[cube.seed_dataset].load_all_indices(function_store())
    ds_enrich = result["enrich"].load_all_indices(function_store())

    assert ds_source.uuid == cube.ktk_dataset_uuid(cube.seed_dataset)
    assert ds_enrich.uuid == cube.ktk_dataset_uuid("enrich")

    assert len(ds_source.partitions) == 2
    assert len(ds_enrich.partitions) == 1
Example #3
def test_distinct_branches(driver, function_store):
    """
    Just check this actually works.
    """
    df_source = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )
    df_enrich = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v2": [20, 21, 22, 23]}
    )
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    result = driver(
        data=[{"source": df_source}, {"enrich": df_enrich}],
        cube=cube,
        store=function_store,
    )

    assert set(result.keys()) == {cube.seed_dataset, "enrich"}

    ds_source = result[cube.seed_dataset].load_all_indices(function_store())
    ds_enrich = result["enrich"].load_all_indices(function_store())

    assert ds_source.uuid == cube.ktk_dataset_uuid(cube.seed_dataset)
    assert ds_enrich.uuid == cube.ktk_dataset_uuid("enrich")

    assert len(ds_source.partitions) == 2
    assert len(ds_enrich.partitions) == 2
Example #4
def test_projected_data(driver, function_store):
    """
    Projected dataset (useful for de-duplication).
    """
    df_source = pd.DataFrame(
        {
            "x": [0, 1, 0, 1],
            "y": [0, 0, 1, 1],
            "p": [0, 0, 1, 1],
            "v1": [10, 11, 12, 13],
        }
    )
    df_enrich = pd.DataFrame({"y": [0, 1], "p": [0, 1], "v2": [20, 21]})
    cube = Cube(
        dimension_columns=["x", "y"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    result = driver(
        data={"source": df_source, "enrich": df_enrich}, cube=cube, store=function_store
    )

    assert set(result.keys()) == {cube.seed_dataset, "enrich"}

    ds_source = result[cube.seed_dataset].load_all_indices(function_store())
    ds_enrich = result["enrich"].load_all_indices(function_store())

    assert ds_source.uuid == cube.ktk_dataset_uuid(cube.seed_dataset)
    assert ds_enrich.uuid == cube.ktk_dataset_uuid("enrich")

    assert len(ds_source.partitions) == 2
    assert len(ds_enrich.partitions) == 2
Example #5
def test_fail_wrong_types(driver, function_store):
    """
    Might catch nasty pandas and other type bugs.
    """
    df_source = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )
    df_enrich = pd.DataFrame(
        {"x": [0.0, 1.0, 2.0, 3.0], "p": [0, 0, 1, 1], "v2": [20, 21, 22, 23]}
    )
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    with pytest.raises(MultiTableCommitAborted) as exc_info:
        driver(
            data={"source": df_source, "enrich": df_enrich},
            cube=cube,
            store=function_store,
        )

    cause = exc_info.value.__cause__
    assert isinstance(cause, ValueError)
    assert 'Found incompatible entries for column "x"' in str(cause)
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("source"), function_store())
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())
Example #6
def test_delayed_index_build_correction_restriction(driver, function_store):
    """
    Ensure that adding extra indices for dimension columns does not mark other datasets as restrictive.
    """
    df_seed = pd.DataFrame({"x": [0, 1, 2, 3, 4, 5], "p": [0, 0, 1, 1, 2, 2]})
    df_extend = pd.DataFrame({"x": [0, 1, 2], "p": [0, 0, 1], "v": [0, 1, 2]})
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="delayed_index_cube",
        index_columns=[],
    )
    build_cube(
        data={"seed": df_seed, "extend": df_extend}, store=function_store, cube=cube
    )

    build_dataset_indices(
        store=function_store,
        dataset_uuid=cube.ktk_dataset_uuid("extend"),
        columns=["x"],
    )

    results = driver(cube=cube, store=function_store, conditions=C("x") >= 0)
    assert len(results) == 1

    df_actual = results[0]
    df_expected = pd.DataFrame(
        {
            "x": [0, 1, 2, 3, 4, 5],
            "p": [0, 0, 1, 1, 2, 2],
            "v": [0, 1, 2, np.nan, np.nan, np.nan],
        },
        columns=["p", "v", "x"],
    )
    pdt.assert_frame_equal(df_actual, df_expected)
Example #7
def test_delayed_index_build_partition_by(driver, function_store):
    df_seed = pd.DataFrame({"x": [0, 1, 2, 3], "p": [0, 0, 1, 1]})
    df_extend = pd.DataFrame({"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v": [0, 0, 0, 1]})
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="delayed_index_cube",
        index_columns=[],
    )
    build_cube(
        data={"seed": df_seed, "extend": df_extend}, store=function_store, cube=cube
    )

    build_dataset_indices(
        store=function_store,
        dataset_uuid=cube.ktk_dataset_uuid("extend"),
        columns=["v"],
    )

    results = driver(cube=cube, store=function_store, partition_by=["v"])
    assert len(results) == 2

    df_result1 = pd.DataFrame(
        data={"x": [0, 1, 2], "p": [0, 0, 1], "v": [0, 0, 0]}, columns=["p", "v", "x"]
    )
    df_result2 = pd.DataFrame(
        data={"x": [3], "p": [1], "v": [1]}, columns=["p", "v", "x"]
    )
    pdt.assert_frame_equal(results[0], df_result1)
    pdt.assert_frame_equal(results[1], df_result2)
Example #8
def test_cube_blacklist_dimension_index(function_store, driver):

    cube1 = Cube(
        dimension_columns=["A", "B"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    df_1 = pd.DataFrame({"A": range(10), "P": 1, "B": 1, "payload": ""})
    build_cube(
        data={"source": df_1},
        cube=cube1,
        store=function_store,
        metadata={"source": {"meta_at_create": "data"}},
    )

    cube2 = Cube(
        dimension_columns=["A", "B"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
        suppress_index_on=["B"],
    )
    df_2 = pd.DataFrame({"A": range(10), "P": 1, "B": 2, "payload": ""})
    driver(
        data={"source": df_2}, cube=cube2, store=function_store, remove_conditions=None
    )

    dataset_uuid = cube2.ktk_dataset_uuid(cube2.seed_dataset)
    dm = DatasetMetadata.load_from_store(
        dataset_uuid, function_store(), load_all_indices=True
    )
    obs_values = dm.indices["B"].observed_values()

    assert sorted(obs_values) == [1, 2]
Example #9
def test_fail_nondistinc_payload(driver, function_store):
    """
    This would lead to problems during the query phase.
    """
    df_source = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )
    df_enrich = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [20, 21, 22, 23]}
    )
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    with pytest.raises(MultiTableCommitAborted) as exc_info:
        driver(
            data={"source": df_source, "enrich": df_enrich},
            cube=cube,
            store=function_store,
        )
    cause = exc_info.value.__cause__
    assert isinstance(cause, ValueError)
    assert "Found columns present in multiple datasets" in str(cause)
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("source"), function_store())
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())
Example #10
def test_fail_partition_on_4(driver, function_store):
    df_source = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )
    df_enrich = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v2": [20, 21, 22, 23]}
    )
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )

    with pytest.raises(
        ValueError, match="Unspecified but provided partition columns in enrich: p"
    ):
        driver(
            data={"source": df_source, "enrich": df_enrich},
            cube=cube,
            store=function_store,
            partition_on={"enrich": []},
        )
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("source"), function_store())
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())
Example #11
def test_nones(driver, function_store, none_first, driver_name):
    """
    Test what happens if the user passes None to ktk_cube.
    """
    if driver_name == "dask_dataframe":
        pytest.skip("user cannot create None-partitions with dask.dataframe")

    df = pd.DataFrame({"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v": [10, 11, 12, 13]})
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")
    result = driver(
        data=[None, df] if none_first else [df, None], cube=cube, store=function_store
    )

    assert set(result.keys()) == {cube.seed_dataset}

    ds = list(result.values())[0]
    ds = ds.load_all_indices(function_store())

    assert ds.uuid == cube.ktk_dataset_uuid(cube.seed_dataset)
    assert len(ds.partitions) == 2

    assert set(ds.indices.keys()) == {"p", "x"}
    assert isinstance(ds.indices["p"], PartitionIndex)
    assert isinstance(ds.indices["x"], ExplicitSecondaryIndex)

    assert set(ds.table_meta) == {SINGLE_TABLE}
Example #12
def test_simple_two_datasets(driver, function_store):
    """
    Simple integration test w/ 2 datasets.
    """
    df_source = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )
    df_enrich = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v2": [20, 21, 22, 23]}
    )
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    result = driver(
        data={"source": df_source, "enrich": df_enrich}, cube=cube, store=function_store
    )

    assert set(result.keys()) == {cube.seed_dataset, "enrich"}

    ds_source = result[cube.seed_dataset].load_all_indices(function_store())
    ds_enrich = result["enrich"].load_all_indices(function_store())

    assert ds_source.uuid == cube.ktk_dataset_uuid(cube.seed_dataset)
    assert ds_enrich.uuid == cube.ktk_dataset_uuid("enrich")

    assert len(ds_source.partitions) == 2
    assert len(ds_enrich.partitions) == 2

    assert set(ds_source.indices.keys()) == {"p", "x"}
    assert isinstance(ds_source.indices["p"], PartitionIndex)
    assert isinstance(ds_source.indices["x"], ExplicitSecondaryIndex)

    assert set(ds_enrich.indices.keys()) == {
        "p",
    }
    assert isinstance(ds_enrich.indices["p"], PartitionIndex)

    assert set(ds_source.table_meta) == {SINGLE_TABLE}
    assert set(ds_enrich.table_meta) == {SINGLE_TABLE}
Example #13
def test_cube_update_secondary_indices_subset(function_store, driver):

    cube1 = Cube(
        dimension_columns=["A"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["indexed"],
    )
    df_1 = pd.DataFrame({"A": range(10), "P": 1, "indexed": 1, "not-indexed": 1})
    build_cube(
        data={"source": df_1},
        cube=cube1,
        store=function_store,
        metadata={"source": {"meta_at_create": "data"}},
    )

    cube2 = Cube(
        dimension_columns=["A"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    df_2 = pd.DataFrame({"A": range(10, 20), "P": 1, "indexed": 2, "not-indexed": 1})
    driver(
        data={"source": df_2}, cube=cube2, store=function_store, remove_conditions=None
    )

    dataset_uuid = cube2.ktk_dataset_uuid(cube2.seed_dataset)
    dm = DatasetMetadata.load_from_store(
        dataset_uuid, function_store(), load_all_indices=True
    )
    obs_values = dm.indices["indexed"].observed_values()

    assert sorted(obs_values) == [1, 2]

    cube2 = Cube(
        dimension_columns=["A"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["not-indexed"],
    )
    with pytest.raises(
        ValueError,
        match='ExplicitSecondaryIndex or PartitionIndex "not-indexed" is missing in dataset',
    ):
        driver(
            data={"source": df_2},
            cube=cube2,
            store=function_store,
            remove_conditions=None,
        )
Example #14
def test_fails_null_dimension(driver, function_store):
    """
    Since we do not allow NULL values in queries, they should be banned from dimension columns in the first place.
    """
    df = pd.DataFrame(
        {"x": [0, 1, 2, np.nan], "p": [0, 0, 1, 1], "v": [10, 11, 12, 13]}
    )
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")
    with pytest.raises(ValueError) as exc:
        driver(data=df, cube=cube, store=function_store)

    assert 'Found NULL-values in dimension column "x" of dataset "seed"' in str(exc)
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("seed"), function_store())
Example #15
def test_fail_all_empty(driver, function_store):
    """
    Might happen due to DB-based filters.
    """
    df = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v": [10, 11, 12, 13]}
    ).loc[[]]
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")
    with pytest.raises(ValueError) as exc:
        driver(data=df, cube=cube, store=function_store)
    assert "Cannot write empty datasets: seed" in str(exc.value)

    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("source"), function_store())
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())
Example #16
def _store_bag_as_dataset_parallel(
    bag: db.Bag,
    store: KeyValueStore,
    cube: Cube,
    ktk_cube_dataset_ids: Iterable[str],
    metadata: Optional[Dict[str, Dict[str, Any]]],
    existing_datasets,
    overwrite: bool = False,
    update: bool = False,
    delete_scopes=None,
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    """
    Vendored, simplified and modified version of kartothek's ``store_bag_as_dataset``, which cannot easily be used
    to store datasets in parallel (e.g. from a dict).

    `delete_scopes` is a dictionary mapping each kartothek dataset id to the `delete_scope` of that dataset
    (see `update_dataset_from_partitions` for the definition of the single-dataset `delete_scope`).
    """
    if (not update) and (not overwrite):
        for ktk_cube_dataset_id in ktk_cube_dataset_ids:
            raise_if_dataset_exists(
                dataset_uuid=cube.ktk_dataset_uuid(ktk_cube_dataset_id),
                store=store)

    mps = bag.map(_multiplex_parse_input_to_metapartition)

    # prepare_data_for_ktk already runs `MetaPartition.partition_on` and `MetaPartition.build_indices`, so this is not
    # required here anymore

    mps = mps.map(_multiplex_store,
                  store=store,
                  cube=cube,
                  df_serializer=df_serializer)

    aggregate = partial(
        _multiplex_store_dataset_from_partitions_flat,
        cube=cube,
        existing_datasets=existing_datasets,
        metadata=metadata,
        store=store,
        update=update,
        delete_scopes=delete_scopes or {},
    )

    return mps.reduction(perpartition=list,
                         aggregate=aggregate,
                         split_every=False,
                         out_type=db.Bag)
Example #17
def test_fails_projected_duplicates(driver, driver_name, function_store):
    """
    Test if duplicate check also works w/ projected data. (was a regression)
    """
    if driver_name == "dask_dataframe":
        pytest.xfail(reason="Cannot guarantee duplicates for DDF")
    df_source = pd.DataFrame(
        {
            "x": [0, 1, 0, 1],
            "y": [0, 0, 1, 1],
            "p": [0, 0, 1, 1],
            "v1": [10, 11, 12, 13],
        }
    )
    df_enrich = pd.DataFrame(
        {"y": [0, 0, 1, 1], "p": [0, 0, 1, 1], "v2": [20, 21, 22, 23], "v3": 42}
    )
    cube = Cube(
        dimension_columns=["x", "y"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    with pytest.raises(ValueError) as exc:
        driver(
            data={"source": df_source, "enrich": df_enrich},
            cube=cube,
            store=function_store,
        )
    msg = """
Found duplicate cells by [p, y] in dataset "enrich", example:

Keys:
p    0
y    0

Identical Payload:
v3    42

Non-Idential Payload:
   v2
0  20
1  21
""".strip()
    assert msg in str(exc.value)
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("source"), function_store())
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())
Example #18
def _multiplex_store(
    data: db.Bag,
    cube: Cube,
    store: StoreFactory,
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    # Mapped over every element of the bag (a dict of ktk_cube dataset id -> data):
    # store each entry as its own kartothek dataset and collect the resulting
    # MetaPartitions keyed by dataset id.
    result = {}
    for k in sorted(data.keys()):
        v = data.pop(k)
        result[k] = MetaPartition.store_dataframes(
            v,
            dataset_uuid=cube.ktk_dataset_uuid(k),
            df_serializer=df_serializer or KTK_CUBE_DF_SERIALIZER,
            store=store,
        )
        del v
    return result
Example #19
def test_fail_all_empty(driver, driver_name, function_store):
    """
    Might happen due to DB-based filters.
    """
    df = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v": [10, 11, 12, 13]}
    ).loc[[]]
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")

    with pytest.raises(MultiTableCommitAborted) as exc_info:
        driver(data=df, cube=cube, store=function_store)
    exc = exc_info.value.__cause__
    assert isinstance(exc, ValueError)
    assert "Cannot write empty datasets" in str(exc)

    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("source"), function_store())
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())
Example #20
def test_fail_partial_build(driver, function_store):
    """
    Either overwrite all or no datasets.
    """
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )

    df_source1 = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )
    df_enrich1 = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v2": [20, 21, 22, 23]}
    )
    driver(
        data={"source": df_source1, "enrich": df_enrich1},
        cube=cube,
        store=function_store,
    )

    # delete everything that belongs to the seed dataset
    to_delete = {
        k
        for k in function_store().keys()
        if k.startswith(cube.ktk_dataset_uuid(cube.seed_dataset))
    }
    for k in to_delete:
        function_store().delete(k)

    keys = set(function_store().keys())
    df_source2 = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v3": [10, 11, 12, 13]}
    )
    with pytest.raises(ValueError) as exc:
        driver(data={"source": df_source2}, cube=cube, store=function_store)
    assert (
        str(exc.value)
        == "Following datasets exists but are not overwritten (partial overwrite), this is not allowed: enrich"
    )
    assert set(function_store().keys()) == keys
Example #21
def test_simple_seed_only(driver, function_store):
    """
    Simple integration test w/ a seed dataset only. This is the simplest way to create a cube.
    """
    df = pd.DataFrame({"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v": [10, 11, 12, 13]})
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")
    result = driver(data=df, cube=cube, store=function_store)

    assert set(result.keys()) == {cube.seed_dataset}

    ds = list(result.values())[0]
    ds = ds.load_all_indices(function_store())

    assert ds.uuid == cube.ktk_dataset_uuid(cube.seed_dataset)
    assert len(ds.partitions) == 2

    assert set(ds.indices.keys()) == {"p", "x"}
    assert isinstance(ds.indices["p"], PartitionIndex)
    assert isinstance(ds.indices["x"], ExplicitSecondaryIndex)

    assert set(ds.table_meta) == {SINGLE_TABLE}
Example #22
def test_fail_duplicates_local(driver, driver_name, function_store):
    """
    Might happen during DB queries.
    """
    if driver_name == "dask_dataframe":
        pytest.xfail(reason="Cannot guarantee duplicates for DDF")
    df = pd.DataFrame(
        {
            "x": [0, 0],
            "y": ["a", "a"],
            "z": [pd.Timestamp("2017"), pd.Timestamp("2017")],
            "p": [0, 0],
        }
    )
    cube = Cube(
        dimension_columns=["x", "y", "z"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    with pytest.raises(ValueError) as exc:
        driver(data=df, cube=cube, store=function_store)
    msg = """
Found duplicate cells by [p, x, y, z] in dataset "source", example:

Keys:
p                      0
x                      0
y                      a
z    2017-01-01 00:00:00

Identical Payload:
n/a

Non-Idential Payload:
n/a
""".strip()
    assert msg in str(exc.value)
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("source"), function_store())
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())
Example #23
def test_ktk_dataset_uuid():
    cube = Cube(dimension_columns=["x"],
                partition_columns=["p"],
                uuid_prefix="cube")

    assert cube.ktk_dataset_uuid(b"foo") == "cube++foo"
    assert isinstance(cube.ktk_dataset_uuid(b"foo"), str)

    with pytest.raises(ValueError) as exc:
        cube.ktk_dataset_uuid("f++")
    assert (str(exc.value) ==
            'ktk_cube_dataset_id ("f++") must not contain UUID separator ++')

    with pytest.raises(ValueError) as exc:
        cube.ktk_dataset_uuid("f ")
    assert (str(exc.value) ==
            'ktk_cube_dataset_id ("f ") is not compatible with kartothek')
Example #24
def test_fails_no_dimension_columns(driver, function_store):
    """
    Ensure that we catch missing dimension columns early.
    """
    df_source = pd.DataFrame({"x": [0, 1], "y": [0, 1], "z": [0, 1], "p": 0})
    df_enrich = pd.DataFrame({"p": [0], "v1": 0})
    cube = Cube(
        dimension_columns=["x", "y", "z"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    with pytest.raises(ValueError) as exc:
        driver(
            data={"source": df_source, "enrich": df_enrich},
            cube=cube,
            store=function_store,
        )
    assert (
        'Dataset "enrich" must have at least 1 of the following dimension columns: x, y'
        in str(exc.value)
    )
    assert not DatasetMetadata.exists(cube.ktk_dataset_uuid("enrich"), function_store())
Example #25
def test_overwrite_rollback_ktk(driver, function_store):
    """
    Checks that require a rollback (like overlapping columns) should recover the former state correctly.
    """
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["i1", "i2", "i3", "i4"],
    )

    df_source1 = pd.DataFrame(
        {
            "x": [0, 1, 2, 3],
            "p": [0, 0, 1, 1],
            "v1": [10, 11, 12, 13],
            "i1": [10, 11, 12, 13],
        }
    )
    df_enrich1 = pd.DataFrame(
        {
            "x": [0, 1, 2, 3],
            "p": [0, 0, 1, 1],
            "i2": [20, 21, 22, 23],
            "v1": [20, 21, 22, 23],
        }
    )
    store_dataframes_as_dataset(
        dfs=[{"ktk_source": df_source1, "ktk_enrich": df_enrich1}],
        store=function_store,
        dataset_uuid=cube.ktk_dataset_uuid(cube.seed_dataset),
        metadata_version=KTK_CUBE_METADATA_VERSION,
        secondary_indices=["i1", "i2"],
    )

    df_source2 = pd.DataFrame(
        {
            "x": [10, 11],
            "p": [10, 10],
            "v1": [10.0, 11.0],  # also use another dtype here (was int)
            "i3": [10, 11],
        }
    )
    df_enrich2 = pd.DataFrame(
        {
            "x": [10, 11],
            "p": [10, 10],
            "v1": [20.0, 21.0],  # also use another dtype here (was int)
            "i4": [20, 21],
        }
    )
    with pytest.raises(MultiTableCommitAborted) as exc_info:
        driver(
            data={"source": df_source2, "enrich": df_enrich2},
            cube=cube,
            store=function_store,
            overwrite=True,
        )
    cause = exc_info.value.__cause__
    assert str(cause).startswith("Found columns present in multiple datasets:")

    ds_source = DatasetMetadata.load_from_store(
        uuid=cube.ktk_dataset_uuid(cube.seed_dataset), store=function_store()
    ).load_all_indices(function_store())

    assert ds_source.uuid == cube.ktk_dataset_uuid(cube.seed_dataset)

    assert len(ds_source.partitions) == 1

    assert ds_source.table_meta["ktk_source"].field("v1").type == pa.int64()
    assert ds_source.table_meta["ktk_enrich"].field("v1").type == pa.int64()
Example #26
def test_append_partitions_no_ts(driver, function_store):
    df_source1 = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v1": [10, 11, 12, 13],
        "i1": [10, 11, 12, 13],
    })
    df_enrich1 = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "v2": [10, 11, 12, 13],
        "i2": [10, 11, 12, 13]
    })
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["i1", "i2", "i3"],
    )
    build_cube(
        data={
            "source": df_source1,
            "enrich": df_enrich1
        },
        cube=cube,
        store=function_store,
        metadata={
            "source": {
                "a": 10,
                "b": 11
            },
            "enrich": {
                "a": 20,
                "b": 21
            }
        },
        partition_on={"enrich": []},
    )

    partitions_source_1 = set(
        DatasetMetadata.load_from_store(cube.ktk_dataset_uuid("source"),
                                        function_store()).partitions.keys())
    partitions_enrich_1 = set(
        DatasetMetadata.load_from_store(cube.ktk_dataset_uuid("enrich"),
                                        function_store()).partitions.keys())

    df_source2 = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v1": [20, 21, 22, 23],
        "i1": [20, 21, 22, 23],
    })
    df_enrich2 = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "v2": [20, 21, 22, 23],
        "i2": [20, 21, 22, 23]
    })

    result = driver(
        data={
            "source": df_source2,
            "enrich": df_enrich2
        },
        cube=cube,
        store=function_store,
    )

    assert set(result.keys()) == {"source", "enrich"}

    ds_source = result["source"]
    ds_enrich = result["enrich"]

    partitions_source_2 = set(ds_source.partitions.keys())
    partitions_enrich_2 = set(ds_enrich.partitions.keys())

    assert len(partitions_source_2) > len(partitions_source_1)
    assert partitions_source_1.issubset(partitions_source_2)

    assert len(partitions_enrich_2) > len(partitions_enrich_1)
    assert partitions_enrich_1.issubset(partitions_enrich_2)
Example #27
def append_to_cube(
    data: Union[pd.DataFrame, Dict[str, pd.DataFrame],
                List[Union[pd.DataFrame, Dict[str, pd.DataFrame]]], ],
    cube: Cube,
    store: KeyValueStore,
    metadata: Optional[Dict[str, Dict[str, Any]]] = None,
    df_serializer: Optional[ParquetSerializer] = None,
) -> Dict[str, DatasetMetadata]:
    """
    Append data to existing cube.

    For details on ``data`` and ``metadata``, see :meth:`build_cube`.

    .. important::

        Physical partitions must be updated as a whole. If only single rows within a physical partition are updated, the
        old data is treated as "removed".

    .. hint::

        To have better control over the overwrite "mask" (i.e. which partitions are overwritten), you should use
        :meth:`remove_partitions` beforehand.

    Parameters
    ----------
    data:
        Data that should be written to the cube. If only a single dataframe is given, it is assumed to be the seed
        dataset.
    cube:
        Cube specification.
    store:
        Store to which the data should be written.
    metadata:
        Metadata for every dataset, optional. For every dataset, only given keys are updated/replaced. Deletion of
        metadata keys is not possible.
    df_serializer:
        Optional Dataframe to Parquet serializer

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        DatasetMetadata for every dataset written.
    """
    data = _normalize_user_input(data, cube)

    existing_datasets = discover_datasets(cube, store)
    partition_on = {k: v.partition_keys for k, v in existing_datasets.items()}

    check_existing_datasets(existing_datasets=existing_datasets,
                            ktk_cube_dataset_ids=set(data.keys()))

    # do all data preparation before writing anything
    # existing_payload is set to empty because we're not checking against any existing payload. kartothek will take
    # care of the compatibility check within a single dataset.
    data = _prepare_data_for_ktk_all(data=data,
                                     cube=cube,
                                     existing_payload=set(),
                                     partition_on=partition_on)

    # update_dataset_from_dataframes requires a store factory, so create one
    # if not provided
    if not callable(store):

        def store_factory():
            return store

    else:
        store_factory = store

    updated_datasets = {}
    for ktk_cube_dataset_id, part in data.items():
        updated_datasets[ktk_cube_dataset_id] = update_dataset_from_dataframes(
            store=store_factory,
            dataset_uuid=cube.ktk_dataset_uuid(ktk_cube_dataset_id),
            df_list=part,
            partition_on=list(partition_on[ktk_cube_dataset_id]),
            df_serializer=df_serializer or KTK_CUBE_DF_SERIALIZER,
            metadata=prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata),
        )

    return apply_postwrite_checks(
        datasets=updated_datasets,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )
Example #28
def build_cube(
    data: Union[pd.DataFrame, Dict[str, pd.DataFrame],
                List[Union[pd.DataFrame, Dict[str, pd.DataFrame]]], ],
    cube: Cube,
    store: KeyValueStore,
    metadata: Optional[Dict[str, Dict[str, Any]]] = None,
    overwrite: bool = False,
    partition_on: Optional[Dict[str, Iterable[str]]] = None,
    df_serializer: Optional[ParquetSerializer] = None,
) -> Dict[str, DatasetMetadata]:
    """
    Store given dataframes as Ktk_cube cube.

    ``data`` can be formatted in multiple ways:

    - single DataFrame::

          pd.DataFrame({
              'x': [0, 1, 2, 3],
              'p': [0, 0, 1, 1],
              'v': [42, 45, 20, 10],
          })

      In that case, the seed dataset will be written.

    - dictionary of DataFrames::

          {
              'seed': pd.DataFrame({
                  'x': [0, 1, 2, 3],
                  'p': [0, 0, 1, 1],
                  'v1': [42, 45, 20, 10],
              }),
              'enrich': pd.DataFrame({
                  'x': [0, 1, 2, 3],
                  'p': [0, 0, 1, 1],
                  'v2': [False, False, True, False],
              }),
          }

      In that case, multiple datasets can be written at the same time. Note that the seed dataset MUST be included.

    - list of anything above::

          [
              # seed data only
              pd.DataFrame({
                  'x': [0, 1, 2, 3],
                  'p': [0, 0, 1, 1],
                  'v1': [42, 45, 20, 10],
              }),
              # seed data only, explicit way
              {
                  'seed': pd.DataFrame({
                      'x': [4, 5, 6, 7],
                      'p': [0, 0, 1, 1],
                      'v1': [12, 32, 22, 9],
                  }),
              },
              # multiple datasets
              {
                  'seed': pd.DataFrame({
                      'x': [8, 9, 10, 11],
                      'p': [0, 0, 1, 1],
                      'v1': [9, 2, 4, 11],
                  }),
                  'enrich': pd.DataFrame({
                      'x': [8, 9, 10, 11],
                      'p': [0, 0, 1, 1],
                      'v2': [True, True, False, False],
                  }),
              },
              # non-seed data only
              {
                  'enrich': pd.DataFrame({
                      'x': [1, 2, 3, 4],
                      'p': [0, 0, 1, 1],
                      'v2': [False, True, False, False],
                  }),
              },
          ]

      In that case, multiple datasets may be written. Note that at least one list element must contain seed data.

    Extra metadata may be preserved w/ every dataset, e.g.::

        {
            'seed': {
                'source': 'db',
                'host': 'db1.cluster20.company.net',
                'last_event': '230c6edb-b69a-4d30-b56d-28f5dfe20948',
            },
            'enrich': {
                'source': 'python',
                'commit_hash': '8b5d717518439921e6d17c7495956bdad687bc54',
            },
        }

    Note that the given metadata must be JSON-serializable.

    If the cube already exists, the ``overwrite`` flag must be given. In that case, all datasets that are part of the
    existing cube must be overwritten. Partial overwrites are not allowed.

    Parameters
    ----------
    data:
        Data that should be written to the cube. If only a single dataframe is given, it is assumed to be the seed
        dataset.
    cube:
        Cube specification.
    store:
        Store to which the data should be written.
    metadata:
        Metadata for every dataset.
    overwrite:
        If possibly existing datasets should be overwritten.
    partition_on:
        Optional partition-on attributes for datasets (dictionary mapping :term:`Dataset ID` -> columns).
    df_serializer:
        Optional Dataframe to Parquet serializer

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        DatasetMetadata for every dataset written.
    """
    data = _normalize_user_input(data, cube)
    ktk_cube_dataset_ids = set(data.keys())
    prep_partition_on = prepare_ktk_partition_on(cube, ktk_cube_dataset_ids,
                                                 partition_on)
    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)

    existing_datasets = discover_datasets_unchecked(cube.uuid_prefix, store)
    check_datasets_prebuild(data, cube, existing_datasets)

    # do all data preparation before writing anything
    data = _prepare_data_for_ktk_all(data=data,
                                     cube=cube,
                                     existing_payload=set(),
                                     partition_on=prep_partition_on)

    datasets = {}
    for ktk_cube_dataset_id, part in data.items():
        datasets[ktk_cube_dataset_id] = store_dataframes_as_dataset(
            store=store,
            dataset_uuid=cube.ktk_dataset_uuid(ktk_cube_dataset_id),
            dfs=part,
            metadata=prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata),
            partition_on=list(prep_partition_on[ktk_cube_dataset_id]),
            metadata_storage_format=KTK_CUBE_METADATA_STORAGE_FORMAT,
            metadata_version=KTK_CUBE_METADATA_VERSION,
            df_serializer=df_serializer or KTK_CUBE_DF_SERIALIZER,
            overwrite=overwrite,
        )

    return apply_postwrite_checks(datasets=datasets,
                                  cube=cube,
                                  store=store,
                                  existing_datasets=existing_datasets)
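A minimal usage sketch for the dictionary form described above. The in-memory store and the example frames are illustrative; ``Cube`` is assumed to come from ``kartothek.core.cube.cube`` and ``build_cube`` is the function defined above:

# Sketch: build a cube with a seed and one enrich dataset in an in-memory store.
import pandas as pd
from simplekv.memory import DictStore
from kartothek.core.cube.cube import Cube

store = DictStore()  # any simplekv-compatible store works here
cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="demo_cube")

datasets = build_cube(
    data={
        "seed": pd.DataFrame({"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [42, 45, 20, 10]}),
        "enrich": pd.DataFrame({"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v2": [False, False, True, False]}),
    },
    cube=cube,
    store=store,
    metadata={"seed": {"source": "db"}},
)
# Re-building the same cube later requires overwrite=True and must include all
# existing datasets (partial overwrites are rejected, see the docstring above).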
Example #29
def test_overwrite_rollback_ktk_cube(driver, function_store):
    """
    Checks that require a rollback (like overlapping columns) should recover the former state correctly.
    """
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["i1", "i2", "i3", "i4"],
    )

    df_source1 = pd.DataFrame(
        {
            "x": [0, 1, 2, 3],
            "p": [0, 0, 1, 1],
            "v1": [10, 11, 12, 13],
            "i1": [10, 11, 12, 13],
        }
    )
    df_enrich1 = pd.DataFrame(
        {
            "x": [0, 1, 2, 3],
            "p": [0, 0, 1, 1],
            "i2": [20, 21, 22, 23],
            "v2": [20, 21, 22, 23],
        }
    )
    driver(
        data={"source": df_source1, "enrich": df_enrich1},
        cube=cube,
        store=function_store,
    )

    df_source2 = pd.DataFrame(
        {
            "x": [10, 11],
            "p": [10, 10],
            "v1": [10.0, 11.0],  # also use another dtype here (was int)
            "i3": [10, 11],
        }
    )
    df_enrich2 = pd.DataFrame(
        {"x": [10, 11], "p": [10, 10], "v1": [20, 21], "i4": [20, 21]}
    )
    with pytest.raises(MultiTableCommitAborted) as exc_info:
        driver(
            data={"source": df_source2, "enrich": df_enrich2},
            cube=cube,
            store=function_store,
            overwrite=True,
        )
    cause = exc_info.value.__cause__
    assert isinstance(cause, ValueError)
    assert str(cause).startswith("Found columns present in multiple datasets:")

    ds_source = DatasetMetadata.load_from_store(
        uuid=cube.ktk_dataset_uuid("source"), store=function_store()
    ).load_all_indices(function_store())
    ds_enrich = DatasetMetadata.load_from_store(
        uuid=cube.ktk_dataset_uuid("enrich"), store=function_store()
    ).load_all_indices(function_store())

    assert ds_source.uuid == cube.ktk_dataset_uuid(cube.seed_dataset)
    assert ds_enrich.uuid == cube.ktk_dataset_uuid("enrich")

    assert len(ds_source.partitions) == 2
    assert len(ds_enrich.partitions) == 2

    assert set(ds_source.indices.keys()) == {"p", "x", "i1"}
    assert isinstance(ds_source.indices["p"], PartitionIndex)
    assert isinstance(ds_source.indices["x"], ExplicitSecondaryIndex)
    assert set(ds_source.indices["x"].index_dct.keys()) == {0, 1, 2, 3}
    assert set(ds_source.indices["i1"].index_dct.keys()) == {10, 11, 12, 13}

    assert set(ds_enrich.indices.keys()) == {"p", "i2"}
    assert isinstance(ds_enrich.indices["p"], PartitionIndex)
    assert set(ds_enrich.indices["i2"].index_dct.keys()) == {20, 21, 22, 23}

    assert ds_source.table_meta[SINGLE_TABLE].field("v1").type == pa.int64()

def build_cube_from_dataframe(
    data: Union[dd.DataFrame, Dict[str, dd.DataFrame]],
    cube: Cube,
    store: StoreFactory,
    metadata: Optional[Dict[str, Dict[str, Any]]] = None,
    overwrite: bool = False,
    partition_on: Optional[Dict[str, Iterable[str]]] = None,
    shuffle: bool = False,
    num_buckets: int = 1,
    bucket_by: Optional[Iterable[str]] = None,
    df_serializer: Optional[ParquetSerializer] = None,
) -> Delayed:
    """
    Create dask computation graph that builds a cube with the data supplied from a dask dataframe.

    Parameters
    ----------
    data
        Data that should be written to the cube. If only a single dataframe is given, it is assumed to be the seed
        dataset.
    cube
        Cube specification.
    store
        Store to which the data should be written.
    metadata
        Metadata for every dataset.
    overwrite
        If possibly existing datasets should be overwritten.
    partition_on
        Optional partition-on attributes for datasets (dictionary mapping :term:`Dataset ID` -> columns).
    df_serializer:
        Optional Dataframe to Parquet serializer

    Returns
    -------
    metadata_dict: dask.delayed.Delayed
        A dask delayed object containing the compute graph to build a cube returning the dict of dataset metadata
        objects.
    """
    check_store_factory(store)
    if not isinstance(data, dict):
        data = {cube.seed_dataset: data}

    ktk_cube_dataset_ids = sorted(data.keys())

    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)
    existing_datasets = discover_datasets_unchecked(cube.uuid_prefix, store)
    check_datasets_prebuild(ktk_cube_dataset_ids, cube, existing_datasets)
    partition_on_checked = prepare_ktk_partition_on(
        cube, ktk_cube_dataset_ids, partition_on
    )
    del partition_on

    dct = {}
    for table_name, ddf in data.items():
        check_user_df(table_name, ddf, cube, set(), partition_on_checked[table_name])

        indices_to_build = set(cube.index_columns) & set(ddf.columns)
        if table_name == cube.seed_dataset:
            indices_to_build |= set(cube.dimension_columns) - cube.suppress_index_on
        indices_to_build -= set(partition_on_checked[table_name])

        ddf = ddf.map_partitions(
            assert_dimesion_index_cols_notnull,
            ktk_cube_dataset_id=table_name,
            cube=cube,
            partition_on=partition_on_checked[table_name],
            meta=ddf._meta,
        )
        graph = store_dataset_from_ddf(
            ddf,
            dataset_uuid=cube.ktk_dataset_uuid(table_name),
            store=store,
            metadata=prepare_ktk_metadata(cube, table_name, metadata),
            partition_on=partition_on_checked[table_name],
            secondary_indices=sorted(indices_to_build),
            sort_partitions_by=sorted(
                (set(cube.dimension_columns) - set(cube.partition_columns))
                & set(ddf.columns)
            ),
            overwrite=overwrite,
            shuffle=shuffle,
            num_buckets=num_buckets,
            bucket_by=bucket_by,
            df_serializer=df_serializer,
        )
        dct[table_name] = graph

    return dask.delayed(apply_postwrite_checks)(
        dct, cube=cube, store=store, existing_datasets=existing_datasets
    )
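A minimal sketch of driving this from dask.dataframe. The zero-argument store factory and the data are illustrative; nothing is written until the returned ``Delayed`` is computed:

# Sketch: build a seed-only cube from a dask dataframe. The factory returns the
# same in-memory store on every call so that all graph nodes see the same data.
import dask.dataframe as dd
import pandas as pd
from simplekv.memory import DictStore
from kartothek.core.cube.cube import Cube

_store = DictStore()

def store_factory():
    return _store

cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="ddf_cube")
ddf_seed = dd.from_pandas(
    pd.DataFrame({"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v": [10, 11, 12, 13]}),
    npartitions=2,
)

graph = build_cube_from_dataframe(data={"seed": ddf_seed}, cube=cube, store=store_factory)
datasets = graph.compute()  # dict of DatasetMetadata, one entry per dataset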