Example #1
def test_missing_metadata(driver, function_store):
    df_seed = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v1": [10, 11, 12, 13]
    })
    df_enrich = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v2": [10, 11, 12, 13]
    })
    cube = Cube(dimension_columns=["x"],
                partition_columns=["p"],
                uuid_prefix="cube")
    build_cube(
        data={
            cube.seed_dataset: df_seed,
            "enrich": df_enrich
        },
        cube=cube,
        store=function_store,
    )
    store = function_store()
    enrich_keys = {k for k in store.keys() if "cube++enrich" in k}

    store.delete("cube++enrich.by-dataset-metadata.json")

    driver(cube=cube, store=function_store)

    assert not enrich_keys.intersection(store.keys())
Example #2
def test_additional_files(driver, function_store):
    df_seed = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v1": [10, 11, 12, 13]
    })
    cube = Cube(dimension_columns=["x"],
                partition_columns=["p"],
                uuid_prefix="cube")
    build_cube(data=df_seed, cube=cube, store=function_store)

    key_in_ds = cube.ktk_dataset_uuid(cube.seed_dataset) + "/foo"
    key_with_ds_prefix = cube.ktk_dataset_uuid(cube.seed_dataset) + ".foo"
    key_with_cube_prefix = cube.uuid_prefix + ".foo"
    key_with_cube_prefix_separator = cube.uuid_prefix + KTK_CUBE_UUID_SEPARATOR + ".foo"

    function_store().put(key_in_ds, b"")
    function_store().put(key_with_ds_prefix, b"")
    function_store().put(key_with_cube_prefix, b"")
    function_store().put(key_with_cube_prefix_separator, b"")

    driver(cube=cube, store=function_store)
    assert key_in_ds not in set(function_store().keys())
    assert key_with_ds_prefix not in set(function_store().keys())
    assert key_with_cube_prefix in set(function_store().keys())
    assert key_with_cube_prefix_separator not in set(function_store().keys())
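The assertions above hinge on the cube key layout. A brief sketch of that layout, inferred from the prefixes used in these tests (the "++" separator and the import path are assumptions, not confirmed by this listing):

# illustrative sketch, not part of the test suite
from kartothek.core.cube.cube import Cube  # import path assumed

cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")
# Dataset keys start with "<uuid_prefix>++<dataset_id>", so "cube++seed/foo" and
# "cube++seed.foo" are treated as belonging to the cube, while "cube.foo" without
# the separator is left untouched by the delete job.
print(cube.ktk_dataset_uuid("seed"))  # expected to print "cube++seed"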
Example #3
def test_cube_blacklist_dimension_index(function_store, driver):

    cube1 = Cube(
        dimension_columns=["A", "B"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    df_1 = pd.DataFrame({"A": range(10), "P": 1, "B": 1, "payload": ""})
    build_cube(
        data={"source": df_1},
        cube=cube1,
        store=function_store,
        metadata={"source": {"meta_at_create": "data"}},
    )

    cube2 = Cube(
        dimension_columns=["A", "B"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
        suppress_index_on=["B"],
    )
    df_2 = pd.DataFrame({"A": range(10), "P": 1, "B": 2, "payload": ""})
    driver(
        data={"source": df_2}, cube=cube2, store=function_store, remove_conditions=None
    )
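    # Even though cube2 suppresses the index on "B", the dimension index created
    # by the initial build is still maintained: both the old value (1) and the
    # new value (2) must be observable below.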

    dataset_uuid = cube2.ktk_dataset_uuid(cube2.seed_dataset)
    dm = DatasetMetadata.load_from_store(
        dataset_uuid, function_store(), load_all_indices=True
    )
    obs_values = dm.indices["B"].observed_values()

    assert sorted(obs_values) == [1, 2]
Example #4
def test_single_rowgroup_when_df_serializer_is_not_passed_to_update_cube(
    driver, function_store
):
    """
    Test that each partition is written as a single row group on the default path (no ``df_serializer`` passed)
    """
    # Build cube
    df = pd.DataFrame(data={"x": [0, 1], "p": [0, 1]}, columns=["x", "p"],)
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="rg-cube")
    build_cube(
        data=df, cube=cube, store=function_store,
    )

    # Update cube - replace p=1 and append p=2 partitions
    df_update = pd.DataFrame(
        data={"x": [0, 1, 2, 3], "p": [1, 1, 2, 2]}, columns=["x", "p"],
    )
    result = driver(
        data={"seed": df_update},
        remove_conditions=(C("p") == 1),  # Remove p=1 partition
        cube=cube,
        store=function_store,
    )
    dataset = result["seed"].load_all_indices(function_store())

    part_num_rows = {0: 1, 1: 2, 2: 2}
    part_chunk_size = {0: None, 1: None, 2: None}

    assert len(dataset.partitions) == 3
    assert_num_row_groups(function_store(), dataset, part_num_rows, part_chunk_size)
Example #5
def test_compression_is_compatible_on_update_cube(driver, function_store):
    """
    Test that partitions written with different compression algorithms are compatible

    The compression algorithms are not parametrized because their availability depends
    on the arrow build. 'SNAPPY' and 'GZIP' are already assumed to be available in parts
    of the code. A fully parametrized test would also increase runtime and test complexity
    unnecessarily.
    """
    # Build cube
    df = pd.DataFrame(data={"x": [0, 1], "p": [0, 1]}, columns=["x", "p"],)
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="rg-cube")
    build_cube(
        data=df,
        cube=cube,
        store=function_store,
        df_serializer=ParquetSerializer(compression="SNAPPY"),
    )

    # Update cube - replace p=1 and append p=2 partitions
    df_update = pd.DataFrame(
        data={"x": [0, 1, 2, 3], "p": [1, 1, 2, 2]}, columns=["x", "p"],
    )
    result = driver(
        data={"seed": df_update},
        remove_conditions=(C("p") == 1),  # Remove p=1 partition
        cube=cube,
        store=function_store,
        df_serializer=ParquetSerializer(compression="GZIP"),
    )
    dataset = result["seed"].load_all_indices(function_store())

    assert len(dataset.partitions) == 3
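The mixed codecs work because each Parquet file records its compression codec in its own metadata, so readers never need the writer's setting. A minimal pyarrow-only sketch of that property (independent of kartothek, names illustrative):

import io

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"x": [0, 1]})
buf_snappy, buf_gzip = io.BytesIO(), io.BytesIO()
pq.write_table(table, buf_snappy, compression="SNAPPY")
pq.write_table(table, buf_gzip, compression="GZIP")
buf_snappy.seek(0)
buf_gzip.seek(0)
# Both buffers read back without specifying a codec and hold identical data.
assert pq.read_table(buf_snappy).equals(pq.read_table(buf_gzip))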
Example #6
def test_missing_seed_dataset(driver, function_store):
    df_seed = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v1": [10, 11, 12, 13]
    })
    df_enrich = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v2": [10, 11, 12, 13]
    })
    cube = Cube(dimension_columns=["x"],
                partition_columns=["p"],
                uuid_prefix="cube")
    build_cube(
        data={
            cube.seed_dataset: df_seed,
            "enrich": df_enrich
        },
        cube=cube,
        store=function_store,
    )
    store = function_store()
    seed_keys = {k for k in store.keys() if "cube++seed" in k and "/" in k}
    enrich_keys = {k for k in store.keys() if "cube++enrich" in k}

    for k in seed_keys:
        store.delete(k)

    driver(cube=cube, store=function_store)

    assert enrich_keys == set(store.keys())
Example #7
def test_invalid_partial_copy2(
    df_seed, df_enrich, cube, function_store, function_store2, simple_cube_1, driver
):
    # build a cube that would be incompatible w/ simple_cube_1
    df_seed = df_seed.copy()
    df_enrich = df_enrich.copy()

    df_seed["x"] = df_seed["x"].astype(str)
    df_enrich["x"] = df_enrich["x"].astype(str)
    build_cube(
        data={cube.seed_dataset: df_seed, "enrich2": df_enrich},
        cube=cube,
        store=function_store2,
    )

    keys = set(function_store2().keys())

    # now copy simple_cube_1 over the existing cube.
    # this only copies the seed and enrich tables, since simple_cube_1 does not have an enrich2 table.
    # it should fail because column "x" is incompatible.
    with pytest.raises(ValueError) as exc:
        driver(
            cube=cube,
            src_store=function_store,
            tgt_store=function_store2,
            overwrite=True,
        )
    assert "Found columns present in multiple datasets" in str(exc.value)
    assert keys == set(function_store2().keys())
Example #8
def _write_cube(function_store) -> Tuple[pd.DataFrame, Cube]:
    """
    Write a cube with dimension column "x" and partition column "p"

    Returns the 'source' dataframe and the cube specification.
    """
    df_source = pd.DataFrame(
        {
            "i1": [10, 11, 12, 13],
            "p": [0, 0, 1, 1],
            "v1": [10, 11, 12, 13],
            "x": [0, 1, 2, 3],
        }
    )
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["i1", "i2", "i3"],
    )
    build_cube(
        data={"source": df_source},
        cube=cube,
        store=function_store,
        metadata={"source": {"meta_at_create": "data"}},
    )
    return df_source, cube
Example #9
def test_delayed_index_build_correction_restriction(driver, function_store):
    """
    Ensure that adding extra indices for dimension columns does not mark other datasets as restrictive.
    """
    df_seed = pd.DataFrame({"x": [0, 1, 2, 3, 4, 5], "p": [0, 0, 1, 1, 2, 2]})
    df_extend = pd.DataFrame({"x": [0, 1, 2], "p": [0, 0, 1], "v": [0, 1, 2]})
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="delayed_index_cube",
        index_columns=[],
    )
    build_cube(
        data={"seed": df_seed, "extend": df_extend}, store=function_store, cube=cube
    )

    build_dataset_indices(
        store=function_store,
        dataset_uuid=cube.ktk_dataset_uuid("extend"),
        columns=["x"],
    )

    results = driver(cube=cube, store=function_store, conditions=C("x") >= 0)
    assert len(results) == 1

    df_actual = results[0]
    df_expected = pd.DataFrame(
        {
            "x": [0, 1, 2, 3, 4, 5],
            "p": [0, 0, 1, 1, 2, 2],
            "v": [0, 1, 2, np.nan, np.nan, np.nan],
        },
        columns=["p", "v", "x"],
    )
    pdt.assert_frame_equal(df_actual, df_expected)
Example #10
def test_delayed_index_build_partition_by(driver, function_store):
    df_seed = pd.DataFrame({"x": [0, 1, 2, 3], "p": [0, 0, 1, 1]})
    df_extend = pd.DataFrame({"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v": [0, 0, 0, 1]})
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="delayed_index_cube",
        index_columns=[],
    )
    build_cube(
        data={"seed": df_seed, "extend": df_extend}, store=function_store, cube=cube
    )

    build_dataset_indices(
        store=function_store,
        dataset_uuid=cube.ktk_dataset_uuid("extend"),
        columns=["v"],
    )

    results = driver(cube=cube, store=function_store, partition_by=["v"])
    assert len(results) == 2

    df_result1 = pd.DataFrame(
        data={"x": [0, 1, 2], "p": [0, 0, 1], "v": [0, 0, 0]}, columns=["p", "v", "x"]
    )
    df_result2 = pd.DataFrame(
        data={"x": [3], "p": [1], "v": [1]}, columns=["p", "v", "x"]
    )
    pdt.assert_frame_equal(results[0], df_result1)
    pdt.assert_frame_equal(results[1], df_result2)
Example #11
def test_noop(driver, function_store):
    df_seed = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v1": [10, 11, 12, 13]
    })
    df_enrich = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v2": [10, 11, 12, 13]
    })
    cube = Cube(dimension_columns=["x"],
                partition_columns=["p"],
                uuid_prefix="cube")
    build_cube(
        data={
            cube.seed_dataset: df_seed,
            "enrich": df_enrich
        },
        cube=cube,
        store=function_store,
    )

    keys = set(function_store().keys())

    driver(cube=cube, store=function_store)

    assert set(function_store().keys()) == keys
Example #12
def _get_cube(function_store, with_partition_on):
    df_source = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "q": 0, "v1": [10, 11, 12, 13]}
    )
    df_enrich = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "q": 0, "v2": [10, 11, 12, 13]}
    )
    if with_partition_on:
        df_enrich.drop(columns=["p", "q"], inplace=True)

    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p", "q"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["i1", "i2", "i3"],
    )
    build_cube(
        data={"source": df_source, "enrich": df_enrich},
        cube=cube,
        store=function_store,
        metadata={"source": {"userkey1": "value1"}},
        partition_on={"enrich": []} if with_partition_on else None,
    )
    return cube
Example #13
def test_fail_no_store_factory(driver, function_store, skip_eager):
    df_seed = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v1": [10, 11, 12, 13]
    })
    df_enrich = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v2": [10, 11, 12, 13]
    })
    cube = Cube(dimension_columns=["x"],
                partition_columns=["p"],
                uuid_prefix="cube")
    build_cube(
        data={
            cube.seed_dataset: df_seed,
            "enrich": df_enrich
        },
        cube=cube,
        store=function_store,
    )
    store = function_store()
    with pytest.raises(TypeError) as exc:
        driver(cube=cube, store=store, no_run=True)
    assert str(exc.value) == "store must be a factory but is HFilesystemStore"
Example #14
def test_stresstest_index_select_row(driver, function_store):
    n_indices = 100
    n_rows = 1000

    data = {"x": np.arange(n_rows), "p": 0}
    for i in range(n_indices):
        data["i{}".format(i)] = np.arange(n_rows)
    df = pd.DataFrame(data)

    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        index_columns=["i{}".format(i) for i in range(n_indices)],
    )

    build_cube(data=df, cube=cube, store=function_store)

    conditions = Conjunction([(C("i{}".format(i)) == 0) for i in range(n_indices)])

    result = driver(
        cube=cube,
        store=function_store,
        conditions=conditions,
        payload_columns=["p", "x"],
    )
    assert len(result) == 1
    df_actual = result[0]
    df_expected = df.loc[df["x"] == 0].reindex(columns=["p", "x"])
    pdt.assert_frame_equal(df_actual, df_expected)
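For intuition, the Conjunction above ANDs one equality condition per index column; a plain-pandas sketch of the equivalent row selection (illustrative only, not part of the test suite):

import numpy as np
import pandas as pd

n_rows, n_indices = 1000, 100
data = {"x": np.arange(n_rows), "p": 0}
for i in range(n_indices):
    data["i{}".format(i)] = np.arange(n_rows)
df = pd.DataFrame(data)

# Every i-column mirrors "x", so requiring all of them to equal 0 leaves exactly
# one row, the x == 0 row that df_expected selects above.
mask = (df[["i{}".format(i) for i in range(n_indices)]] == 0).all(axis=1)
assert mask.sum() == 1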
Example #15
def sparse_outer_opt_cube(
    module_store,
    sparse_outer_data,
    sparse_outer_cube,
    sparse_outer_df,
    sparse_outer_opt_df,
):
    data = {}
    for dataset_id in sparse_outer_data.keys():
        df = sparse_outer_data[dataset_id].copy()

        for col in sparse_outer_opt_df.columns:
            if col in df.columns:
                dtype = sparse_outer_opt_df[col].dtype

                if dtype == np.float64:
                    dtype = np.int64
                elif dtype == np.float32:
                    dtype = np.int32
                elif dtype == np.float16:
                    dtype = np.int16

                df[col] = df[col].astype(dtype)

        data[dataset_id] = df

    cube = sparse_outer_cube.copy(uuid_prefix="sparse_outer_opt_cube")
    build_cube(data=data, store=module_store, cube=cube)
    return cube
Example #16
def existing_cube(function_store):
    df_source = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v1": [10, 11, 12, 13]
    })
    df_enrich = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v2": [10, 11, 12, 13]
    })
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["i1", "i2", "i3"],
    )
    build_cube(
        data={"source": df_source, "enrich": df_enrich},
        cube=cube,
        store=function_store,
    )
    return cube
Example #17
def test_simple_roundtrip(driver, function_store, function_store_rwro):
    df = pd.DataFrame({"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v": [10, 11, 12, 13]})
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")
    build_cube(data=df, cube=cube, store=function_store)
    result = driver(cube=cube, store=function_store_rwro)
    assert len(result) == 1
    df_actual = result[0]
    df_expected = df.reindex(columns=["p", "v", "x"])
    pdt.assert_frame_equal(df_actual, df_expected)
Example #18
def massive_partitions_cube(module_store, massive_partitions_data):
    cube = Cube(
        dimension_columns=["x", "y", "z"],
        partition_columns=["p", "q"],
        uuid_prefix="massive_partitions_cube",
        index_columns=["i1", "i2", "i3"],
    )
    build_cube(data=massive_partitions_data, store=module_store, cube=cube)
    return cube
Example #19
def fullrange_cube(module_store, fullrange_data):
    cube = Cube(
        dimension_columns=["x", "y", "z"],
        partition_columns=["p", "q"],
        uuid_prefix="fullrange_cube",
        index_columns=["i1", "i2", "i3"],
    )
    build_cube(data=fullrange_data, store=module_store, cube=cube)
    return cube
Example #20
def sparse_outer_cube(module_store, sparse_outer_data):
    cube = Cube(
        dimension_columns=["x", "y", "z"],
        partition_columns=["p", "q"],
        uuid_prefix="sparse_outer_cube",
        index_columns=["i1", "i2", "i3"],
    )
    build_cube(data=sparse_outer_data, store=module_store, cube=cube)
    return cube
Example #21
def test_wrong_condition_type(driver, function_store, driver_name):
    types = {
        "int": pd.Series([-1], dtype=np.int64),
        "uint": pd.Series([1], dtype=np.uint64),
        "float": pd.Series([1.3], dtype=np.float64),
        "bool": pd.Series([True], dtype=np.bool_),
        "str": pd.Series(["foo"], dtype=object),
    }
    cube = Cube(
        dimension_columns=["d_{}".format(t) for t in sorted(types.keys())],
        partition_columns=["p_{}".format(t) for t in sorted(types.keys())],
        uuid_prefix="typed_cube",
        index_columns=["i_{}".format(t) for t in sorted(types.keys())],
    )
    data = {
        "seed": pd.DataFrame(
            {
                "{}_{}".format(prefix, t): types[t]
                for t in sorted(types.keys())
                for prefix in ["d", "p", "v1"]
            }
        ),
        "enrich": pd.DataFrame(
            {
                "{}_{}".format(prefix, t): types[t]
                for t in sorted(types.keys())
                for prefix in ["d", "p", "i", "v2"]
            }
        ),
    }
    build_cube(data=data, store=function_store, cube=cube)

    df = pd.DataFrame(
        {
            "{}_{}".format(prefix, t): types[t]
            for t in sorted(types.keys())
            for prefix in ["d", "p", "i", "v1", "v2"]
        }
    )

    for col in df.columns:
        t1 = col.split("_")[1]

        for t2 in sorted(types.keys()):
            cond = C(col) == types[t2].values[0]

            if t1 == t2:
                result = driver(cube=cube, store=function_store, conditions=cond)
                assert len(result) == 1
                df_actual = result[0]
                df_expected = cond.filter_df(df).reset_index(drop=True)
                pdt.assert_frame_equal(df_actual, df_expected, check_like=True)
            else:
                with pytest.raises(TypeError) as exc:
                    driver(cube=cube, store=function_store, conditions=cond)
                assert "has wrong type" in str(exc.value)
Example #22
def test_condition_on_null(driver, function_store):
    df = pd.DataFrame(
        {
            "x": pd.Series([0, 1, 2], dtype=np.int64),
            "p": pd.Series([0, 0, 1], dtype=np.int64),
            "v_f1": pd.Series([0, np.nan, 2], dtype=np.float64),
            "v_f2": pd.Series([0, 1, np.nan], dtype=np.float64),
            "v_f3": pd.Series([np.nan, np.nan, np.nan], dtype=np.float64),
            "v_s1": pd.Series(["a", None, "c"], dtype=object),
            "v_s2": pd.Series(["a", "b", None], dtype=object),
            "v_s3": pd.Series([None, None, None], dtype=object),
        }
    )
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="nulled_cube",
        index_columns=[],
    )
    build_cube(data=df, store=function_store, cube=cube)

    for col in df.columns:
        # only iterate over the value columns (not the dimension / partition column):
        if not col.startswith("v"):
            continue

        # col_type will be either 'f' for float or 's' for string; see column
        # names above
        col_type = col.split("_")[1][0]
        if col_type == "f":
            value = 1.2
        elif col_type == "s":
            value = "foo"
        else:
            raise RuntimeError("unknown type")

        cond = C(col) == value

        df_expected = cond.filter_df(df).reset_index(drop=True)

        result = driver(cube=cube, store=function_store, conditions=cond)

        if df_expected.empty:
            assert len(result) == 0
        else:
            assert len(result) == 1
            df_actual = result[0]
            pdt.assert_frame_equal(df_actual, df_expected, check_like=True)
Example #23
def test_rowgroups_are_applied_when_df_serializer_is_passed_to_append_cube(
        driver, function_store, chunk_size_build, chunk_size_append):
    """
    Test that the dataset is split into row groups depending on the chunk size

    Partitions built with ``chunk_size=None`` should keep a single row group after
    the append. Partitions that are newly created with ``chunk_size>0`` should be
    split into row groups accordingly.
    """

    # Build cube
    df = pd.DataFrame(
        data={
            "x": [0, 1, 2, 3],
            "p": [0, 0, 1, 1]
        },
        columns=["x", "p"],
    )
    cube = Cube(dimension_columns=["x"],
                partition_columns=["p"],
                uuid_prefix="rg-cube")
    build_cube(
        data=df,
        cube=cube,
        store=function_store,
        df_serializer=ParquetSerializer(chunk_size=chunk_size_build),
    )

    # Append to cube
    df_append = pd.DataFrame(
        data={
            "x": [0, 1, 2, 3],
            "p": [2, 3, 3, 3]
        },
        columns=["x", "p"],
    )
    result = driver(
        data={"seed": df_append},
        cube=cube,
        store=function_store,
        df_serializer=ParquetSerializer(chunk_size=chunk_size_append),
    )
    dataset = result["seed"].load_all_indices(function_store())

    part_num_rows = {0: 2, 1: 2, 2: 1, 3: 3}
    part_chunk_size = {
        0: chunk_size_build,
        1: chunk_size_build,
        2: chunk_size_append,
        3: chunk_size_append,
    }

    assert len(dataset.partitions) == 4
    assert_num_row_groups(function_store(), dataset, part_num_rows,
                          part_chunk_size)
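The expected row-group counts follow directly from the chunk sizes. A hedged sketch of the arithmetic presumably checked by assert_num_row_groups (formula inferred from the docstrings, not the helper's actual code):

import math

def expected_num_row_groups(num_rows, chunk_size):
    # chunk_size=None: the whole partition is written as a single row group;
    # chunk_size > 0: the partition is split into ceil(num_rows / chunk_size)
    # groups, e.g. a 3-row partition with chunk_size=1 yields 3 row groups.
    return 1 if chunk_size is None else math.ceil(num_rows / chunk_size)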
Example #24
def built_cube(store, cube, df_source, df_enrich):
    with freeze_time(datetime(2018, 1, 31, 14, 3, 22)):
        build_cube(data={"source": df_source}, cube=cube, store=store)

    with freeze_time(datetime(2019, 2, 28, 13, 1, 17)):
        extend_cube(
            data={"enrich": df_enrich},
            cube=cube,
            store=store,
            partition_on={"enrich": ["part", "q"]},
        )
    return cube
Example #25
def test_cube_update_secondary_indices_subset(function_store, driver):

    cube1 = Cube(
        dimension_columns=["A"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["indexed"],
    )
    df_1 = pd.DataFrame({"A": range(10), "P": 1, "indexed": 1, "not-indexed": 1})
    build_cube(
        data={"source": df_1},
        cube=cube1,
        store=function_store,
        metadata={"source": {"meta_at_create": "data"}},
    )

    cube2 = Cube(
        dimension_columns=["A"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    df_2 = pd.DataFrame({"A": range(10, 20), "P": 1, "indexed": 2, "not-indexed": 1})
    driver(
        data={"source": df_2}, cube=cube2, store=function_store, remove_conditions=None
    )
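    # The "indexed" secondary index declared by cube1 keeps being updated even
    # though cube2's spec no longer lists it; both observed values must show up.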

    dataset_uuid = cube2.ktk_dataset_uuid(cube2.seed_dataset)
    dm = DatasetMetadata.load_from_store(
        dataset_uuid, function_store(), load_all_indices=True
    )
    obs_values = dm.indices["indexed"].observed_values()

    assert sorted(obs_values) == [1, 2]

    cube2 = Cube(
        dimension_columns=["A"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["not-indexed"],
    )
    with pytest.raises(
        ValueError,
        match='ExplicitSecondaryIndex or PartitionIndex "not-indexed" is missing in dataset',
    ):
        driver(
            data={"source": df_2},
            cube=cube2,
            store=function_store,
            remove_conditions=None,
        )
Example #26
def other_part_cube(module_store, data_no_part):
    cube = Cube(
        dimension_columns=["x", "y", "z"],
        partition_columns=["p", "q"],
        uuid_prefix="other_part_cube",
        index_columns=["i1", "i2", "i3"],
    )
    build_cube(
        data=data_no_part,
        store=module_store,
        cube=cube,
        partition_on={"enrich_dense": ["i2"], "enrich_sparse": ["i3"]},
    )
    return cube
Example #27
def test_delete_twice(driver, function_store):
    df = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v": [10, 11, 12, 13]
    })
    cube = Cube(dimension_columns=["x"],
                partition_columns=["p"],
                uuid_prefix="cube")
    build_cube(data=df, cube=cube, store=function_store)
    driver(cube=cube, store=function_store)
    driver(cube=cube, store=function_store)

    assert set(function_store().keys()) == set()
Example #28
def test_single_rowgroup_when_df_serializer_is_not_passed_to_append_cube(
        driver, function_store):
    """
    Test that each partition is written as a single row group on the default path (no ``df_serializer`` passed)
    """

    # Build cube
    df = pd.DataFrame(
        data={
            "x": [0, 1, 2, 3],
            "p": [0, 0, 1, 1]
        },
        columns=["x", "p"],
    )
    cube = Cube(dimension_columns=["x"],
                partition_columns=["p"],
                uuid_prefix="rg-cube")
    build_cube(
        data=df,
        cube=cube,
        store=function_store,
    )

    # Append to cube
    df_append = pd.DataFrame(
        data={
            "x": [0, 1, 2, 3],
            "p": [2, 3, 3, 3]
        },
        columns=["x", "p"],
    )
    result = driver(
        data={"seed": df_append},
        cube=cube,
        store=function_store,
    )
    dataset = result["seed"].load_all_indices(function_store())

    part_num_rows = {0: 2, 1: 2, 2: 1, 3: 3}
    part_chunk_size = {0: None, 1: None, 2: None, 3: None}

    assert len(dataset.partitions) == 4
    assert_num_row_groups(function_store(), dataset, part_num_rows,
                          part_chunk_size)
Example #29
def test_keep_other(driver, function_store):
    df = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v": [10, 11, 12, 13]
    })
    cube1 = Cube(dimension_columns=["x"],
                 partition_columns=["p"],
                 uuid_prefix="cube1")
    cube2 = cube1.copy(uuid_prefix="cube2")

    build_cube(data=df, cube=cube1, store=function_store)
    keys = set(function_store().keys())

    build_cube(data=df, cube=cube2, store=function_store)

    driver(cube=cube2, store=function_store)

    assert set(function_store().keys()) == keys
Example #30
def test_rowgroups_are_applied_when_df_serializer_is_passed_to_update_cube(
    driver, function_store, chunk_size_build, chunk_size_update
):
    """
    Test that the dataset is split into row groups depending on the chunk size

    Partitions built with ``chunk_size=None`` should keep a single row group if they
    are not touched by the update. Partitions that are newly created or replaced with
    ``chunk_size>0`` should be split into row groups accordingly.
    """
    # Build cube
    df = pd.DataFrame(data={"x": [0, 1], "p": [0, 1]}, columns=["x", "p"],)
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="rg-cube")
    build_cube(
        data=df,
        cube=cube,
        store=function_store,
        df_serializer=ParquetSerializer(chunk_size=chunk_size_build),
    )

    # Update cube - replace p=1 and append p=2 partitions
    df_update = pd.DataFrame(
        data={"x": [0, 1, 2, 3], "p": [1, 1, 2, 2]}, columns=["x", "p"],
    )
    result = driver(
        data={"seed": df_update},
        remove_conditions=(C("p") == 1),  # Remove p=1 partition
        cube=cube,
        store=function_store,
        df_serializer=ParquetSerializer(chunk_size=chunk_size_update),
    )
    dataset = result["seed"].load_all_indices(function_store())

    part_num_rows = {0: 1, 1: 2, 2: 2}
    part_chunk_size = {
        0: chunk_size_build,
        1: chunk_size_update,
        2: chunk_size_update,
    }

    assert len(dataset.partitions) == 3
    assert_num_row_groups(function_store(), dataset, part_num_rows, part_chunk_size)