Example #1
def test_missing_metadata(driver, function_store):
    df_seed = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v1": [10, 11, 12, 13]
    })
    df_enrich = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v2": [10, 11, 12, 13]
    })
    cube = Cube(dimension_columns=["x"],
                partition_columns=["p"],
                uuid_prefix="cube")
    build_cube(
        data={
            cube.seed_dataset: df_seed,
            "enrich": df_enrich
        },
        cube=cube,
        store=function_store,
    )
    store = function_store()
    enrich_keys = {k for k in store.keys() if "cube++enrich" in k}

    store.delete("cube++enrich.by-dataset-metadata.json")

    driver(cube=cube, store=function_store)

    assert not enrich_keys.intersection(store.keys())
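
The examples in this listing omit their imports; driver, function_store and module_store are pytest fixtures and assert_num_row_groups is a local test helper (sketches of some of these appear further down). A plausible common preamble is shown below; the module paths are assumptions based on the kartothek project layout and may need adjusting for the installed version.

# Assumed imports for the examples in this listing (module paths are a best guess).
from datetime import datetime
from typing import Tuple

import numpy as np
import pandas as pd
import pandas.testing as pdt
import pytest
from freezegun import freeze_time

from kartothek.core.cube.conditions import C, Conjunction
from kartothek.core.cube.constants import KTK_CUBE_UUID_SEPARATOR
from kartothek.core.cube.cube import Cube
from kartothek.core.dataset import DatasetMetadata
from kartothek.io.eager import build_dataset_indices
from kartothek.io.eager_cube import build_cube, extend_cube
from kartothek.serialization import ParquetSerializer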
Example #2
def test_additional_files(driver, function_store):
    df_seed = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v1": [10, 11, 12, 13]
    })
    cube = Cube(dimension_columns=["x"],
                partition_columns=["p"],
                uuid_prefix="cube")
    build_cube(data=df_seed, cube=cube, store=function_store)

    key_in_ds = cube.ktk_dataset_uuid(cube.seed_dataset) + "/foo"
    key_with_ds_prefix = cube.ktk_dataset_uuid(cube.seed_dataset) + ".foo"
    key_with_cube_prefix = cube.uuid_prefix + ".foo"
    key_with_cube_prefix_separator = cube.uuid_prefix + KTK_CUBE_UUID_SEPARATOR + ".foo"

    function_store().put(key_in_ds, b"")
    function_store().put(key_with_ds_prefix, b"")
    function_store().put(key_with_cube_prefix, b"")
    function_store().put(key_with_cube_prefix_separator, b"")

    driver(cube=cube, store=function_store)
    assert key_in_ds not in set(function_store().keys())
    assert key_with_ds_prefix not in set(function_store().keys())
    assert key_with_cube_prefix in set(function_store().keys())
    assert key_with_cube_prefix_separator not in set(function_store().keys())
Example #3
def test_cube_blacklist_dimension_index(function_store, driver):

    cube1 = Cube(
        dimension_columns=["A", "B"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    df_1 = pd.DataFrame({"A": range(10), "P": 1, "B": 1, "payload": ""})
    build_cube(
        data={"source": df_1},
        cube=cube1,
        store=function_store,
        metadata={"source": {"meta_at_create": "data"}},
    )

    cube2 = Cube(
        dimension_columns=["A", "B"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
        suppress_index_on=["B"],
    )
    df_2 = pd.DataFrame({"A": range(10), "P": 1, "B": 2, "payload": ""})
    driver(
        data={"source": df_2}, cube=cube2, store=function_store, remove_conditions=None
    )

    dataset_uuid = cube2.ktk_dataset_uuid(cube2.seed_dataset)
    dm = DatasetMetadata.load_from_store(
        dataset_uuid, function_store(), load_all_indices=True
    )
    obs_values = dm.indices["B"].observed_values()

    assert sorted(obs_values) == [1, 2]
Example #4
def test_single_rowgroup_when_df_serializer_is_not_passed_to_update_cube(
    driver, function_store
):
    """
    Test that the dataset has a single row group by default
    """
    # Build cube
    df = pd.DataFrame(data={"x": [0, 1], "p": [0, 1]}, columns=["x", "p"],)
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="rg-cube")
    build_cube(
        data=df, cube=cube, store=function_store,
    )

    # Update cube - replace p=1 and append p=2 partitions
    df_update = pd.DataFrame(
        data={"x": [0, 1, 2, 3], "p": [1, 1, 2, 2]}, columns=["x", "p"],
    )
    result = driver(
        data={"seed": df_update},
        remove_conditions=(C("p") == 1),  # Remove p=1 partition
        cube=cube,
        store=function_store,
    )
    dataset = result["seed"].load_all_indices(function_store())

    part_num_rows = {0: 1, 1: 2, 2: 2}
    part_chunk_size = {0: None, 1: None, 2: None}

    assert len(dataset.partitions) == 3
    assert_num_row_groups(function_store(), dataset, part_num_rows, part_chunk_size)
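
The assert_num_row_groups helper used above is a local test utility that is not shown in this listing. A minimal sketch, assuming a simplekv-style store and pyarrow for inspecting the Parquet footers, could look like this; the expected row-group count mirrors how ParquetSerializer splits a frame by chunk_size.

import math
from io import BytesIO

import pyarrow.parquet as pq


def assert_num_row_groups(store, dataset, part_num_rows, part_chunk_size):
    # Hypothetical helper: inspect partitions in sorted label order and compare
    # their row-group count against the expected row counts and chunk sizes.
    for part_idx, label in enumerate(sorted(dataset.partitions)):
        partition = dataset.partitions[label]
        for key in partition.files.values():
            parquet_file = pq.ParquetFile(BytesIO(store.get(key)))
            num_rows = part_num_rows[part_idx]
            chunk_size = part_chunk_size[part_idx]
            expected = 1 if chunk_size is None else math.ceil(num_rows / chunk_size)
            assert parquet_file.num_row_groups == expected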
Example #5
def test_compression_is_compatible_on_update_cube(driver, function_store):
    """
    Test that partitions written with different compression algorithms are compatible

    The compression algorithms are not parametrized because their availability depends
    on the arrow build. 'SNAPPY' and 'GZIP' are already assumed to be available in parts
    of the code. A fully parametrized test would also increase runtime and test complexity
    unnecessarily.
    """
    # Build cube
    df = pd.DataFrame(data={"x": [0, 1], "p": [0, 1]}, columns=["x", "p"],)
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="rg-cube")
    build_cube(
        data=df,
        cube=cube,
        store=function_store,
        df_serializer=ParquetSerializer(compression="SNAPPY"),
    )

    # Update cube - replace p=1 and append p=2 partitions
    df_update = pd.DataFrame(
        data={"x": [0, 1, 2, 3], "p": [1, 1, 2, 2]}, columns=["x", "p"],
    )
    result = driver(
        data={"seed": df_update},
        remove_conditions=(C("p") == 1),  # Remove p=1 partition
        cube=cube,
        store=function_store,
        df_serializer=ParquetSerializer(compression="GZIP"),
    )
    dataset = result["seed"].load_all_indices(function_store())

    assert len(dataset.partitions) == 3
Example #6
def test_missing_seed_dataset(driver, function_store):
    df_seed = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v1": [10, 11, 12, 13]
    })
    df_enrich = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v2": [10, 11, 12, 13]
    })
    cube = Cube(dimension_columns=["x"],
                partition_columns=["p"],
                uuid_prefix="cube")
    build_cube(
        data={
            cube.seed_dataset: df_seed,
            "enrich": df_enrich
        },
        cube=cube,
        store=function_store,
    )
    store = function_store()
    seed_keys = {k for k in store.keys() if "cube++seed" in k and "/" in k}
    enrich_keys = {k for k in store.keys() if "cube++enrich" in k}

    for k in seed_keys:
        store.delete(k)

    driver(cube=cube, store=function_store)

    assert enrich_keys == set(store.keys())
Example #7
def test_invalid_partial_copy2(
    df_seed, df_enrich, cube, function_store, function_store2, simple_cube_1, driver
):
    # build a cube that would be incompatible w/ simple_cube_1
    df_seed = df_seed.copy()
    df_enrich = df_enrich.copy()

    df_seed["x"] = df_seed["x"].astype(str)
    df_enrich["x"] = df_enrich["x"].astype(str)
    build_cube(
        data={cube.seed_dataset: df_seed, "enrich2": df_enrich},
        cube=cube,
        store=function_store2,
    )

    keys = set(function_store2().keys())

    # Now copy simple_cube_1 over the existing cube.
    # This only copies the seed and enrich tables, since simple_cube_1 does not have an enrich2 table.
    # It should fail because x is incompatible.
    with pytest.raises(ValueError) as exc:
        driver(
            cube=cube,
            src_store=function_store,
            tgt_store=function_store2,
            overwrite=True,
        )
    assert "Found columns present in multiple datasets" in str(exc.value)
    assert keys == set(function_store2().keys())
Example #8
def _write_cube(function_store) -> Tuple[pd.DataFrame, Cube]:
    """
    Write a cube with dimension column "x" and partition column "p".

    Returns the "source" dataframe and the cube specification.
    """
    df_source = pd.DataFrame(
        {
            "i1": [10, 11, 12, 13],
            "p": [0, 0, 1, 1],
            "v1": [10, 11, 12, 13],
            "x": [0, 1, 2, 3],
        }
    )
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["i1", "i2", "i3"],
    )
    build_cube(
        data={"source": df_source},
        cube=cube,
        store=function_store,
        metadata={"source": {"meta_at_create": "data"}},
    )
    return df_source, cube
Example #9
def test_delayed_index_build_correction_restriction(driver, function_store):
    """
    Ensure that adding extra indices for dimension columns does not mark other datasets as restrictive.
    """
    df_seed = pd.DataFrame({"x": [0, 1, 2, 3, 4, 5], "p": [0, 0, 1, 1, 2, 2]})
    df_extend = pd.DataFrame({"x": [0, 1, 2], "p": [0, 0, 1], "v": [0, 1, 2]})
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="delayed_index_cube",
        index_columns=[],
    )
    build_cube(
        data={"seed": df_seed, "extend": df_extend}, store=function_store, cube=cube
    )

    build_dataset_indices(
        store=function_store,
        dataset_uuid=cube.ktk_dataset_uuid("extend"),
        columns=["x"],
    )

    results = driver(cube=cube, store=function_store, conditions=C("x") >= 0)
    assert len(results) == 1

    df_actual = results[0]
    df_expected = pd.DataFrame(
        {
            "x": [0, 1, 2, 3, 4, 5],
            "p": [0, 0, 1, 1, 2, 2],
            "v": [0, 1, 2, np.nan, np.nan, np.nan],
        },
        columns=["p", "v", "x"],
    )
    pdt.assert_frame_equal(df_actual, df_expected)
Example #10
def test_delayed_index_build_partition_by(driver, function_store):
    df_seed = pd.DataFrame({"x": [0, 1, 2, 3], "p": [0, 0, 1, 1]})
    df_extend = pd.DataFrame({"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v": [0, 0, 0, 1]})
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="delayed_index_cube",
        index_columns=[],
    )
    build_cube(
        data={"seed": df_seed, "extend": df_extend}, store=function_store, cube=cube
    )

    build_dataset_indices(
        store=function_store,
        dataset_uuid=cube.ktk_dataset_uuid("extend"),
        columns=["v"],
    )

    results = driver(cube=cube, store=function_store, partition_by=["v"])
    assert len(results) == 2

    df_result1 = pd.DataFrame(
        data={"x": [0, 1, 2], "p": [0, 0, 1], "v": [0, 0, 0]}, columns=["p", "v", "x"]
    )
    df_result2 = pd.DataFrame(
        data={"x": [3], "p": [1], "v": [1]}, columns=["p", "v", "x"]
    )
    pdt.assert_frame_equal(results[0], df_result1)
    pdt.assert_frame_equal(results[1], df_result2)
Example #11
def test_noop(driver, function_store):
    df_seed = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v1": [10, 11, 12, 13]
    })
    df_enrich = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v2": [10, 11, 12, 13]
    })
    cube = Cube(dimension_columns=["x"],
                partition_columns=["p"],
                uuid_prefix="cube")
    build_cube(
        data={
            cube.seed_dataset: df_seed,
            "enrich": df_enrich
        },
        cube=cube,
        store=function_store,
    )

    keys = set(function_store().keys())

    driver(cube=cube, store=function_store)

    assert set(function_store().keys()) == keys
Example #12
def _get_cube(function_store, with_partition_on):
    df_source = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "q": 0, "v1": [10, 11, 12, 13]}
    )
    df_enrich = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "q": 0, "v2": [10, 11, 12, 13]}
    )
    if with_partition_on:
        df_enrich.drop(columns=["p", "q"], inplace=True)

    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p", "q"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["i1", "i2", "i3"],
    )
    build_cube(
        data={"source": df_source, "enrich": df_enrich},
        cube=cube,
        store=function_store,
        metadata={"source": {"userkey1": "value1"}},
        partition_on={"enrich": []} if with_partition_on else None,
    )
    return cube
Example #13
def test_fail_no_store_factory(driver, function_store, skip_eager):
    df_seed = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v1": [10, 11, 12, 13]
    })
    df_enrich = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v2": [10, 11, 12, 13]
    })
    cube = Cube(dimension_columns=["x"],
                partition_columns=["p"],
                uuid_prefix="cube")
    build_cube(
        data={
            cube.seed_dataset: df_seed,
            "enrich": df_enrich
        },
        cube=cube,
        store=function_store,
    )
    store = function_store()
    with pytest.raises(TypeError) as exc:
        driver(cube=cube, store=store, no_run=True)
    assert str(exc.value) == "store must be a factory but is HFilesystemStore"
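
The test above relies on function_store being a factory (calling it returns the actual store, here an HFilesystemStore). A minimal sketch of such a fixture, assuming storefact and pytest's tmpdir, might be:

from functools import partial

import pytest
import storefact


@pytest.fixture
def function_store(tmpdir):
    # Hypothetical fixture: a zero-argument factory that opens a filesystem-backed
    # store rooted in the test's temporary directory on every call.
    return partial(storefact.get_store_from_url, "hfs://{}".format(tmpdir))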
Example #14
def test_stresstest_index_select_row(driver, function_store):
    n_indices = 100
    n_rows = 1000

    data = {"x": np.arange(n_rows), "p": 0}
    for i in range(n_indices):
        data["i{}".format(i)] = np.arange(n_rows)
    df = pd.DataFrame(data)

    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        index_columns=["i{}".format(i) for i in range(n_indices)],
    )

    build_cube(data=df, cube=cube, store=function_store)

    conditions = Conjunction([(C("i{}".format(i)) == 0) for i in range(n_indices)])

    result = driver(
        cube=cube,
        store=function_store,
        conditions=conditions,
        payload_columns=["p", "x"],
    )
    assert len(result) == 1
    df_actual = result[0]
    df_expected = df.loc[df["x"] == 0].reindex(columns=["p", "x"])
    pdt.assert_frame_equal(df_actual, df_expected)
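
The Conjunction above simply ANDs its member conditions. A tiny self-contained illustration, assuming Conjunction exposes the same filter_df helper that is used on single conditions elsewhere in this listing:

# Hypothetical illustration of the condition API used above.
df_demo = pd.DataFrame({"i0": [0, 1, 2], "i1": [0, 0, 1], "x": [10, 11, 12]})
cond_demo = Conjunction([C("i0") == 0, C("i1") == 0])
# filter_df keeps only the rows satisfying every member condition (here: the first row).
print(cond_demo.filter_df(df_demo))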
Example #15
def sparse_outer_opt_cube(
    module_store,
    sparse_outer_data,
    sparse_outer_cube,
    sparse_outer_df,
    sparse_outer_opt_df,
):
    data = {}
    for dataset_id in sparse_outer_data.keys():
        df = sparse_outer_data[dataset_id].copy()

        for col in sparse_outer_opt_df.columns:
            if col in df.columns:
                dtype = sparse_outer_opt_df[col].dtype

                if dtype == np.float64:
                    dtype = np.int64
                elif dtype == np.float32:
                    dtype = np.int32
                elif dtype == np.float16:
                    dtype = np.int16

                df[col] = df[col].astype(dtype)

        data[dataset_id] = df

    cube = sparse_outer_cube.copy(uuid_prefix="sparse_outer_opt_cube")
    build_cube(data=data, store=module_store, cube=cube)
    return cube
Example #16
def existing_cube(function_store):
    df_source = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v1": [10, 11, 12, 13]
    })
    df_enrich = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v2": [10, 11, 12, 13]
    })
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["i1", "i2", "i3"],
    )
    build_cube(
        data={"source": df_source, "enrich": df_enrich},
        cube=cube,
        store=function_store,
    )
    return cube
Example #17
def test_simple_roundtrip(driver, function_store, function_store_rwro):
    df = pd.DataFrame({"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v": [10, 11, 12, 13]})
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")
    build_cube(data=df, cube=cube, store=function_store)
    result = driver(cube=cube, store=function_store_rwro)
    assert len(result) == 1
    df_actual = result[0]
    df_expected = df.reindex(columns=["p", "v", "x"])
    pdt.assert_frame_equal(df_actual, df_expected)
Example #18
def massive_partitions_cube(module_store, massive_partitions_data):
    cube = Cube(
        dimension_columns=["x", "y", "z"],
        partition_columns=["p", "q"],
        uuid_prefix="massive_partitions_cube",
        index_columns=["i1", "i2", "i3"],
    )
    build_cube(data=massive_partitions_data, store=module_store, cube=cube)
    return cube
Example #19
def fullrange_cube(module_store, fullrange_data):
    cube = Cube(
        dimension_columns=["x", "y", "z"],
        partition_columns=["p", "q"],
        uuid_prefix="fullrange_cube",
        index_columns=["i1", "i2", "i3"],
    )
    build_cube(data=fullrange_data, store=module_store, cube=cube)
    return cube
Example #20
def sparse_outer_cube(module_store, sparse_outer_data):
    cube = Cube(
        dimension_columns=["x", "y", "z"],
        partition_columns=["p", "q"],
        uuid_prefix="sparse_outer_cube",
        index_columns=["i1", "i2", "i3"],
    )
    build_cube(data=sparse_outer_data, store=module_store, cube=cube)
    return cube
Example #21
def test_wrong_condition_type(driver, function_store, driver_name):
    types = {
        "int": pd.Series([-1], dtype=np.int64),
        "uint": pd.Series([1], dtype=np.uint64),
        "float": pd.Series([1.3], dtype=np.float64),
        "bool": pd.Series([True], dtype=np.bool_),
        "str": pd.Series(["foo"], dtype=object),
    }
    cube = Cube(
        dimension_columns=["d_{}".format(t) for t in sorted(types.keys())],
        partition_columns=["p_{}".format(t) for t in sorted(types.keys())],
        uuid_prefix="typed_cube",
        index_columns=["i_{}".format(t) for t in sorted(types.keys())],
    )
    data = {
        "seed": pd.DataFrame(
            {
                "{}_{}".format(prefix, t): types[t]
                for t in sorted(types.keys())
                for prefix in ["d", "p", "v1"]
            }
        ),
        "enrich": pd.DataFrame(
            {
                "{}_{}".format(prefix, t): types[t]
                for t in sorted(types.keys())
                for prefix in ["d", "p", "i", "v2"]
            }
        ),
    }
    build_cube(data=data, store=function_store, cube=cube)

    df = pd.DataFrame(
        {
            "{}_{}".format(prefix, t): types[t]
            for t in sorted(types.keys())
            for prefix in ["d", "p", "i", "v1", "v2"]
        }
    )

    for col in df.columns:
        t1 = col.split("_")[1]

        for t2 in sorted(types.keys()):
            cond = C(col) == types[t2].values[0]

            if t1 == t2:
                result = driver(cube=cube, store=function_store, conditions=cond)
                assert len(result) == 1
                df_actual = result[0]
                df_expected = cond.filter_df(df).reset_index(drop=True)
                pdt.assert_frame_equal(df_actual, df_expected, check_like=True)
            else:
                with pytest.raises(TypeError) as exc:
                    driver(cube=cube, store=function_store, conditions=cond)
                assert "has wrong type" in str(exc.value)
Example #22
def test_condition_on_null(driver, function_store):
    df = pd.DataFrame({
        "x": pd.Series([0, 1, 2], dtype=np.int64),
        "p": pd.Series([0, 0, 1], dtype=np.int64),
        "v_f1": pd.Series([0, np.nan, 2], dtype=np.float64),
        "v_f2": pd.Series([0, 1, np.nan], dtype=np.float64),
        "v_f3": pd.Series([np.nan, np.nan, np.nan], dtype=np.float64),
        "v_s1": pd.Series(["a", None, "c"], dtype=object),
        "v_s2": pd.Series(["a", "b", None], dtype=object),
        "v_s3": pd.Series([None, None, None], dtype=object),
    })
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="nulled_cube",
        index_columns=[],
    )
    build_cube(data=df, store=function_store, cube=cube)

    for col in df.columns:
        # only iterate over the value columns (not the dimension / partition column):
        if not col.startswith("v"):
            continue

        # col_type will be either 'f' for float or 's' for string; see column
        # names above
        col_type = col.split("_")[1][0]
        if col_type == "f":
            value = 1.2
        elif col_type == "s":
            value = "foo"
        else:
            raise RuntimeError("unknown type")

        cond = C(col) == value

        df_expected = cond.filter_df(df).reset_index(drop=True)

        result = driver(cube=cube, store=function_store, conditions=cond)

        if df_expected.empty:
            assert len(result) == 0
        else:
            assert len(result) == 1
            df_actual = result[0]
            pdt.assert_frame_equal(df_actual, df_expected, check_like=True)
Example #23
def test_rowgroups_are_applied_when_df_serializer_is_passed_to_append_cube(
        driver, function_store, chunk_size_build, chunk_size_append):
    """
    Test that the dataset is split into row groups depending on the chunk size

    Partitions built with ``chunk_size=None`` should keep a single row group after
    the append. Partitions that are newly created with ``chunk_size>0`` should be
    split into row groups accordingly.
    """

    # Build cube
    df = pd.DataFrame(
        data={
            "x": [0, 1, 2, 3],
            "p": [0, 0, 1, 1]
        },
        columns=["x", "p"],
    )
    cube = Cube(dimension_columns=["x"],
                partition_columns=["p"],
                uuid_prefix="rg-cube")
    build_cube(
        data=df,
        cube=cube,
        store=function_store,
        df_serializer=ParquetSerializer(chunk_size=chunk_size_build),
    )

    # Append to cube
    df_append = pd.DataFrame(
        data={
            "x": [0, 1, 2, 3],
            "p": [2, 3, 3, 3]
        },
        columns=["x", "p"],
    )
    result = driver(
        data={"seed": df_append},
        cube=cube,
        store=function_store,
        df_serializer=ParquetSerializer(chunk_size=chunk_size_append),
    )
    dataset = result["seed"].load_all_indices(function_store())

    part_num_rows = {0: 2, 1: 2, 2: 1, 3: 3}
    part_chunk_size = {
        0: chunk_size_build,
        1: chunk_size_build,
        2: chunk_size_append,
        3: chunk_size_append,
    }

    assert len(dataset.partitions) == 4
    assert_num_row_groups(function_store(), dataset, part_num_rows,
                          part_chunk_size)
Example #24
def built_cube(store, cube, df_source, df_enrich):
    with freeze_time(datetime(2018, 1, 31, 14, 3, 22)):
        build_cube(data={"source": df_source}, cube=cube, store=store)

    with freeze_time(datetime(2019, 2, 28, 13, 1, 17)):
        extend_cube(
            data={"enrich": df_enrich},
            cube=cube,
            store=store,
            partition_on={"enrich": ["part", "q"]},
        )
    return cube
Example #25
def test_cube_update_secondary_indices_subset(function_store, driver):

    cube1 = Cube(
        dimension_columns=["A"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["indexed"],
    )
    df_1 = pd.DataFrame({"A": range(10), "P": 1, "indexed": 1, "not-indexed": 1})
    build_cube(
        data={"source": df_1},
        cube=cube1,
        store=function_store,
        metadata={"source": {"meta_at_create": "data"}},
    )

    cube2 = Cube(
        dimension_columns=["A"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    df_2 = pd.DataFrame({"A": range(10, 20), "P": 1, "indexed": 2, "not-indexed": 1})
    driver(
        data={"source": df_2}, cube=cube2, store=function_store, remove_conditions=None
    )

    dataset_uuid = cube2.ktk_dataset_uuid(cube2.seed_dataset)
    dm = DatasetMetadata.load_from_store(
        dataset_uuid, function_store(), load_all_indices=True
    )
    obs_values = dm.indices["indexed"].observed_values()

    assert sorted(obs_values) == [1, 2]

    cube2 = Cube(
        dimension_columns=["A"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["not-indexed"],
    )
    with pytest.raises(
        ValueError,
        match='ExplicitSecondaryIndex or PartitionIndex "not-indexed" is missing in dataset',
    ):
        driver(
            data={"source": df_2},
            cube=cube2,
            store=function_store,
            remove_conditions=None,
        )
Example #26
def other_part_cube(module_store, data_no_part):
    cube = Cube(
        dimension_columns=["x", "y", "z"],
        partition_columns=["p", "q"],
        uuid_prefix="other_part_cube",
        index_columns=["i1", "i2", "i3"],
    )
    build_cube(
        data=data_no_part,
        store=module_store,
        cube=cube,
        partition_on={"enrich_dense": ["i2"], "enrich_sparse": ["i3"]},
    )
    return cube
Example #27
def test_delete_twice(driver, function_store):
    df = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v": [10, 11, 12, 13]
    })
    cube = Cube(dimension_columns=["x"],
                partition_columns=["p"],
                uuid_prefix="cube")
    build_cube(data=df, cube=cube, store=function_store)
    driver(cube=cube, store=function_store)
    driver(cube=cube, store=function_store)

    assert set(function_store().keys()) == set()
Example #28
def test_single_rowgroup_when_df_serializer_is_not_passed_to_append_cube(
        driver, function_store):
    """
    Test that the dataset has a single row group by default
    """

    # Build cube
    df = pd.DataFrame(
        data={
            "x": [0, 1, 2, 3],
            "p": [0, 0, 1, 1]
        },
        columns=["x", "p"],
    )
    cube = Cube(dimension_columns=["x"],
                partition_columns=["p"],
                uuid_prefix="rg-cube")
    build_cube(
        data=df,
        cube=cube,
        store=function_store,
    )

    # Append to cube
    df_append = pd.DataFrame(
        data={
            "x": [0, 1, 2, 3],
            "p": [2, 3, 3, 3]
        },
        columns=["x", "p"],
    )
    result = driver(
        data={"seed": df_append},
        cube=cube,
        store=function_store,
    )
    dataset = result["seed"].load_all_indices(function_store())

    part_num_rows = {0: 2, 1: 2, 2: 1, 3: 3}
    part_chunk_size = {0: None, 1: None, 2: None, 3: None}

    assert len(dataset.partitions) == 4
    assert_num_row_groups(function_store(), dataset, part_num_rows,
                          part_chunk_size)
Example #29
def test_keep_other(driver, function_store):
    df = pd.DataFrame({
        "x": [0, 1, 2, 3],
        "p": [0, 0, 1, 1],
        "v": [10, 11, 12, 13]
    })
    cube1 = Cube(dimension_columns=["x"],
                 partition_columns=["p"],
                 uuid_prefix="cube1")
    cube2 = cube1.copy(uuid_prefix="cube2")

    build_cube(data=df, cube=cube1, store=function_store)
    keys = set(function_store().keys())

    build_cube(data=df, cube=cube2, store=function_store)

    driver(cube=cube2, store=function_store)

    assert set(function_store().keys()) == keys
Example #30
def test_rowgroups_are_applied_when_df_serializer_is_passed_to_update_cube(
    driver, function_store, chunk_size_build, chunk_size_update
):
    """
    Test that the dataset is split into row groups depending on the chunk size

    Partitions built with ``chunk_size=None`` should keep a single row group if they
    are not touched by the update. Partitions that are newly created or replaced with
    ``chunk_size>0`` should be split into row groups accordingly.
    """
    # Build cube
    df = pd.DataFrame(data={"x": [0, 1], "p": [0, 1]}, columns=["x", "p"],)
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="rg-cube")
    build_cube(
        data=df,
        cube=cube,
        store=function_store,
        df_serializer=ParquetSerializer(chunk_size=chunk_size_build),
    )

    # Update cube - replace p=1 and append p=2 partitions
    df_update = pd.DataFrame(
        data={"x": [0, 1, 2, 3], "p": [1, 1, 2, 2]}, columns=["x", "p"],
    )
    result = driver(
        data={"seed": df_update},
        remove_conditions=(C("p") == 1),  # Remove p=1 partition
        cube=cube,
        store=function_store,
        df_serializer=ParquetSerializer(chunk_size=chunk_size_update),
    )
    dataset = result["seed"].load_all_indices(function_store())

    part_num_rows = {0: 1, 1: 2, 2: 2}
    part_chunk_size = {
        0: chunk_size_build,
        1: chunk_size_update,
        2: chunk_size_update,
    }

    assert len(dataset.partitions) == 3
    assert_num_row_groups(function_store(), dataset, part_num_rows, part_chunk_size)
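
The chunk_size_build and chunk_size_update arguments (and chunk_size_append in the append test above) are assumed to be parametrized pytest fixtures, so that every combination of None and a positive chunk size is exercised; a minimal sketch with an assumed parameter set:

import pytest


@pytest.fixture(params=[None, 2])
def chunk_size_build(request):
    return request.param


@pytest.fixture(params=[None, 2])
def chunk_size_update(request):
    return request.param


@pytest.fixture(params=[None, 2])
def chunk_size_append(request):
    return request.param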