def test_missing_metadata(driver, function_store):
    df_seed = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )
    df_enrich = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v2": [10, 11, 12, 13]}
    )
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")
    build_cube(
        data={cube.seed_dataset: df_seed, "enrich": df_enrich},
        cube=cube,
        store=function_store,
    )

    store = function_store()
    enrich_keys = {k for k in store.keys() if "cube++enrich" in k}

    store.delete("cube++enrich.by-dataset-metadata.json")

    driver(cube=cube, store=function_store)

    assert not enrich_keys.intersection(store.keys())


def test_additional_files(driver, function_store):
    df_seed = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")
    build_cube(data=df_seed, cube=cube, store=function_store)

    key_in_ds = cube.ktk_dataset_uuid(cube.seed_dataset) + "/foo"
    key_with_ds_prefix = cube.ktk_dataset_uuid(cube.seed_dataset) + ".foo"
    key_with_cube_prefix = cube.uuid_prefix + ".foo"
    key_with_cube_prefix_separator = cube.uuid_prefix + KTK_CUBE_UUID_SEPARATOR + ".foo"

    function_store().put(key_in_ds, b"")
    function_store().put(key_with_ds_prefix, b"")
    function_store().put(key_with_cube_prefix, b"")
    function_store().put(key_with_cube_prefix_separator, b"")

    driver(cube=cube, store=function_store)

    assert key_in_ds not in set(function_store().keys())
    assert key_with_ds_prefix not in set(function_store().keys())
    assert key_with_cube_prefix in set(function_store().keys())
    assert key_with_cube_prefix_separator not in set(function_store().keys())


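# Key-layout note, inferred from the assertions above (other tests in this module
# filter keys on "++", which is assumed to be the value of KTK_CUBE_UUID_SEPARATOR):
#
#   "<uuid_prefix>++<dataset>/<file>"  -> payload file inside a cube dataset, cleaned up
#   "<uuid_prefix>++<dataset>.<name>"  -> dataset-level metadata file, cleaned up
#   "<uuid_prefix>++.<name>"           -> still inside the cube namespace, cleaned up
#   "<uuid_prefix>.<name>"             -> outside the cube namespace, left untouched

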
def test_cube_blacklist_dimension_index(function_store, driver):
    cube1 = Cube(
        dimension_columns=["A", "B"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    df_1 = pd.DataFrame({"A": range(10), "P": 1, "B": 1, "payload": ""})
    build_cube(
        data={"source": df_1},
        cube=cube1,
        store=function_store,
        metadata={"source": {"meta_at_create": "data"}},
    )

    cube2 = Cube(
        dimension_columns=["A", "B"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
        suppress_index_on=["B"],
    )
    df_2 = pd.DataFrame({"A": range(10), "P": 1, "B": 2, "payload": ""})
    driver(
        data={"source": df_2}, cube=cube2, store=function_store, remove_conditions=None
    )

    dataset_uuid = cube2.ktk_dataset_uuid(cube2.seed_dataset)
    dm = DatasetMetadata.load_from_store(
        dataset_uuid, function_store(), load_all_indices=True
    )
    obs_values = dm.indices["B"].observed_values()

    assert sorted(obs_values) == [1, 2]


def test_single_rowgroup_when_df_serializer_is_not_passed_to_update_cube(
    driver, function_store
):
    """
    Test that each partition contains a single row group when no ``df_serializer``
    is passed (the default path).
    """
    # Build cube
    df = pd.DataFrame(data={"x": [0, 1], "p": [0, 1]}, columns=["x", "p"])
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="rg-cube")
    build_cube(
        data=df,
        cube=cube,
        store=function_store,
    )

    # Update cube - replace p=1 and append p=2 partitions
    df_update = pd.DataFrame(
        data={"x": [0, 1, 2, 3], "p": [1, 1, 2, 2]}, columns=["x", "p"],
    )
    result = driver(
        data={"seed": df_update},
        remove_conditions=(C("p") == 1),  # Remove p=1 partition
        cube=cube,
        store=function_store,
    )
    dataset = result["seed"].load_all_indices(function_store())

    part_num_rows = {0: 1, 1: 2, 2: 2}
    part_chunk_size = {0: None, 1: None, 2: None}

    assert len(dataset.partitions) == 3
    assert_num_row_groups(function_store(), dataset, part_num_rows, part_chunk_size)


def test_compression_is_compatible_on_update_cube(driver, function_store):
    """
    Test that partitions written with different compression algorithms are compatible.

    The compression algorithms are not parametrized because their availability
    depends on the arrow build. 'SNAPPY' and 'GZIP' are already assumed to be
    available in parts of the code. A fully parametrized test would also increase
    runtime and test complexity unnecessarily.
    """
    # Build cube
    df = pd.DataFrame(data={"x": [0, 1], "p": [0, 1]}, columns=["x", "p"])
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="rg-cube")
    build_cube(
        data=df,
        cube=cube,
        store=function_store,
        df_serializer=ParquetSerializer(compression="SNAPPY"),
    )

    # Update cube - replace p=1 and append p=2 partitions
    df_update = pd.DataFrame(
        data={"x": [0, 1, 2, 3], "p": [1, 1, 2, 2]}, columns=["x", "p"],
    )
    result = driver(
        data={"seed": df_update},
        remove_conditions=(C("p") == 1),  # Remove p=1 partition
        cube=cube,
        store=function_store,
        df_serializer=ParquetSerializer(compression="GZIP"),
    )
    dataset = result["seed"].load_all_indices(function_store())

    assert len(dataset.partitions) == 3


def test_missing_seed_dataset(driver, function_store):
    df_seed = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )
    df_enrich = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v2": [10, 11, 12, 13]}
    )
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")
    build_cube(
        data={cube.seed_dataset: df_seed, "enrich": df_enrich},
        cube=cube,
        store=function_store,
    )

    store = function_store()
    seed_keys = {k for k in store.keys() if "cube++seed" in k and "/" in k}
    enrich_keys = {k for k in store.keys() if "cube++enrich" in k}

    for k in seed_keys:
        store.delete(k)

    driver(cube=cube, store=function_store)

    assert enrich_keys == set(store.keys())


def test_invalid_partial_copy2(
    df_seed, df_enrich, cube, function_store, function_store2, simple_cube_1, driver
):
    # build a cube that would be incompatible w/ simple_cube_1
    df_seed = df_seed.copy()
    df_enrich = df_enrich.copy()
    df_seed["x"] = df_seed["x"].astype(str)
    df_enrich["x"] = df_enrich["x"].astype(str)
    build_cube(
        data={cube.seed_dataset: df_seed, "enrich2": df_enrich},
        cube=cube,
        store=function_store2,
    )
    keys = set(function_store2().keys())

    # Now copy simple_cube_1 over the existing cube. This only copies the seed and
    # enrich tables since simple_cube_1 does not have an enrich2 table. It should
    # fail because the "x" column is incompatible.
    with pytest.raises(ValueError) as exc:
        driver(
            cube=cube,
            src_store=function_store,
            tgt_store=function_store2,
            overwrite=True,
        )
    assert "Found columns present in multiple datasets" in str(exc.value)
    assert keys == set(function_store2().keys())


def _write_cube(function_store) -> Tuple[pd.DataFrame, Cube]:
    """
    Write a cube with dimension column "x" and partition column "p".

    Returns the 'source' dataframe and the cube specification.
    """
    df_source = pd.DataFrame(
        {
            "i1": [10, 11, 12, 13],
            "p": [0, 0, 1, 1],
            "v1": [10, 11, 12, 13],
            "x": [0, 1, 2, 3],
        }
    )
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["i1", "i2", "i3"],
    )
    build_cube(
        data={"source": df_source},
        cube=cube,
        store=function_store,
        metadata={"source": {"meta_at_create": "data"}},
    )
    return df_source, cube


def test_delayed_index_build_correction_restriction(driver, function_store):
    """
    Ensure that adding extra indices for dimension columns does not mark other
    datasets as restrictive.
    """
    df_seed = pd.DataFrame({"x": [0, 1, 2, 3, 4, 5], "p": [0, 0, 1, 1, 2, 2]})
    df_extend = pd.DataFrame({"x": [0, 1, 2], "p": [0, 0, 1], "v": [0, 1, 2]})
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="delayed_index_cube",
        index_columns=[],
    )
    build_cube(
        data={"seed": df_seed, "extend": df_extend}, store=function_store, cube=cube
    )
    build_dataset_indices(
        store=function_store,
        dataset_uuid=cube.ktk_dataset_uuid("extend"),
        columns=["x"],
    )

    results = driver(cube=cube, store=function_store, conditions=C("x") >= 0)
    assert len(results) == 1
    df_actual = results[0]

    # The "extend" dataset only covers x in [0, 2]; all seed rows must survive and
    # the missing "v" values are filled with NaN instead of dropping those rows.
    df_expected = pd.DataFrame(
        {
            "x": [0, 1, 2, 3, 4, 5],
            "p": [0, 0, 1, 1, 2, 2],
            "v": [0, 1, 2, np.nan, np.nan, np.nan],
        },
        columns=["p", "v", "x"],
    )
    pdt.assert_frame_equal(df_actual, df_expected)


def test_delayed_index_build_partition_by(driver, function_store):
    df_seed = pd.DataFrame({"x": [0, 1, 2, 3], "p": [0, 0, 1, 1]})
    df_extend = pd.DataFrame({"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v": [0, 0, 0, 1]})
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="delayed_index_cube",
        index_columns=[],
    )
    build_cube(
        data={"seed": df_seed, "extend": df_extend}, store=function_store, cube=cube
    )
    build_dataset_indices(
        store=function_store,
        dataset_uuid=cube.ktk_dataset_uuid("extend"),
        columns=["v"],
    )

    results = driver(cube=cube, store=function_store, partition_by=["v"])
    assert len(results) == 2

    df_result1 = pd.DataFrame(
        data={"x": [0, 1, 2], "p": [0, 0, 1], "v": [0, 0, 0]}, columns=["p", "v", "x"]
    )
    df_result2 = pd.DataFrame(
        data={"x": [3], "p": [1], "v": [1]}, columns=["p", "v", "x"]
    )
    pdt.assert_frame_equal(results[0], df_result1)
    pdt.assert_frame_equal(results[1], df_result2)


def test_noop(driver, function_store):
    df_seed = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )
    df_enrich = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v2": [10, 11, 12, 13]}
    )
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")
    build_cube(
        data={cube.seed_dataset: df_seed, "enrich": df_enrich},
        cube=cube,
        store=function_store,
    )
    keys = set(function_store().keys())

    driver(cube=cube, store=function_store)

    assert set(function_store().keys()) == keys


def _get_cube(function_store, with_partition_on):
    df_source = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "q": 0, "v1": [10, 11, 12, 13]}
    )
    df_enrich = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "q": 0, "v2": [10, 11, 12, 13]}
    )
    if with_partition_on:
        df_enrich.drop(columns=["p", "q"], inplace=True)

    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p", "q"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["i1", "i2", "i3"],
    )
    build_cube(
        data={"source": df_source, "enrich": df_enrich},
        cube=cube,
        store=function_store,
        metadata={"source": {"userkey1": "value1"}},
        partition_on={"enrich": []} if with_partition_on else None,
    )
    return cube


def test_fail_no_store_factory(driver, function_store, skip_eager):
    df_seed = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )
    df_enrich = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v2": [10, 11, 12, 13]}
    )
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")
    build_cube(
        data={cube.seed_dataset: df_seed, "enrich": df_enrich},
        cube=cube,
        store=function_store,
    )
    store = function_store()

    with pytest.raises(TypeError) as exc:
        driver(cube=cube, store=store, no_run=True)
    assert str(exc.value) == "store must be a factory but is HFilesystemStore"


def test_stresstest_index_select_row(driver, function_store):
    n_indices = 100
    n_rows = 1000

    data = {"x": np.arange(n_rows), "p": 0}
    for i in range(n_indices):
        data["i{}".format(i)] = np.arange(n_rows)
    df = pd.DataFrame(data)

    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        index_columns=["i{}".format(i) for i in range(n_indices)],
    )

    build_cube(data=df, cube=cube, store=function_store)

    conditions = Conjunction([(C("i{}".format(i)) == 0) for i in range(n_indices)])

    result = driver(
        cube=cube,
        store=function_store,
        conditions=conditions,
        payload_columns=["p", "x"],
    )

    assert len(result) == 1
    df_actual = result[0]
    df_expected = df.loc[df["x"] == 0].reindex(columns=["p", "x"])
    pdt.assert_frame_equal(df_actual, df_expected)


def sparse_outer_opt_cube(
    module_store,
    sparse_outer_data,
    sparse_outer_cube,
    sparse_outer_df,
    sparse_outer_opt_df,
):
    data = {}
    for dataset_id in sparse_outer_data.keys():
        df = sparse_outer_data[dataset_id].copy()

        for col in sparse_outer_opt_df.columns:
            if col in df.columns:
                dtype = sparse_outer_opt_df[col].dtype

                if dtype == np.float64:
                    dtype = np.int64
                elif dtype == np.float32:
                    dtype = np.int32
                elif dtype == np.float16:
                    dtype = np.int16

                df[col] = df[col].astype(dtype)

        data[dataset_id] = df

    cube = sparse_outer_cube.copy(uuid_prefix="sparse_outer_opt_cube")
    build_cube(data=data, store=module_store, cube=cube)
    return cube


def existing_cube(function_store):
    df_source = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v1": [10, 11, 12, 13]}
    )
    df_enrich = pd.DataFrame(
        {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v2": [10, 11, 12, 13]}
    )
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["i1", "i2", "i3"],
    )
    build_cube(
        data={"source": df_source, "enrich": df_enrich},
        cube=cube,
        store=function_store,
    )
    return cube


def test_simple_roundtrip(driver, function_store, function_store_rwro):
    df = pd.DataFrame({"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v": [10, 11, 12, 13]})
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")
    build_cube(data=df, cube=cube, store=function_store)
    result = driver(cube=cube, store=function_store_rwro)
    assert len(result) == 1
    df_actual = result[0]
    df_expected = df.reindex(columns=["p", "v", "x"])
    pdt.assert_frame_equal(df_actual, df_expected)


def massive_partitions_cube(module_store, massive_partitions_data):
    cube = Cube(
        dimension_columns=["x", "y", "z"],
        partition_columns=["p", "q"],
        uuid_prefix="massive_partitions_cube",
        index_columns=["i1", "i2", "i3"],
    )
    build_cube(data=massive_partitions_data, store=module_store, cube=cube)
    return cube


def fullrange_cube(module_store, fullrange_data):
    cube = Cube(
        dimension_columns=["x", "y", "z"],
        partition_columns=["p", "q"],
        uuid_prefix="fullrange_cube",
        index_columns=["i1", "i2", "i3"],
    )
    build_cube(data=fullrange_data, store=module_store, cube=cube)
    return cube


def sparse_outer_cube(module_store, sparse_outer_data):
    cube = Cube(
        dimension_columns=["x", "y", "z"],
        partition_columns=["p", "q"],
        uuid_prefix="sparse_outer_cube",
        index_columns=["i1", "i2", "i3"],
    )
    build_cube(data=sparse_outer_data, store=module_store, cube=cube)
    return cube


def test_wrong_condition_type(driver, function_store, driver_name):
    types = {
        "int": pd.Series([-1], dtype=np.int64),
        "uint": pd.Series([1], dtype=np.uint64),
        "float": pd.Series([1.3], dtype=np.float64),
        "bool": pd.Series([True], dtype=np.bool_),
        "str": pd.Series(["foo"], dtype=object),
    }
    cube = Cube(
        dimension_columns=["d_{}".format(t) for t in sorted(types.keys())],
        partition_columns=["p_{}".format(t) for t in sorted(types.keys())],
        uuid_prefix="typed_cube",
        index_columns=["i_{}".format(t) for t in sorted(types.keys())],
    )
    data = {
        "seed": pd.DataFrame(
            {
                "{}_{}".format(prefix, t): types[t]
                for t in sorted(types.keys())
                for prefix in ["d", "p", "v1"]
            }
        ),
        "enrich": pd.DataFrame(
            {
                "{}_{}".format(prefix, t): types[t]
                for t in sorted(types.keys())
                for prefix in ["d", "p", "i", "v2"]
            }
        ),
    }
    build_cube(data=data, store=function_store, cube=cube)

    df = pd.DataFrame(
        {
            "{}_{}".format(prefix, t): types[t]
            for t in sorted(types.keys())
            for prefix in ["d", "p", "i", "v1", "v2"]
        }
    )

    for col in df.columns:
        t1 = col.split("_")[1]

        for t2 in sorted(types.keys()):
            cond = C(col) == types[t2].values[0]

            if t1 == t2:
                result = driver(cube=cube, store=function_store, conditions=cond)
                assert len(result) == 1
                df_actual = result[0]
                df_expected = cond.filter_df(df).reset_index(drop=True)
                pdt.assert_frame_equal(df_actual, df_expected, check_like=True)
            else:
                with pytest.raises(TypeError) as exc:
                    driver(cube=cube, store=function_store, conditions=cond)
                assert "has wrong type" in str(exc.value)


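# For illustration, one concrete instance of the mismatch branch exercised in the
# loop above: comparing the int64 dimension column against a string value must be
# rejected. (Uses the same local names as the test, so it is shown as a comment
# rather than module-level code.)
#
#     with pytest.raises(TypeError) as exc:
#         driver(cube=cube, store=function_store, conditions=C("d_int") == "foo")
#     assert "has wrong type" in str(exc.value)

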
def test_condition_on_null(driver, function_store):
    df = pd.DataFrame(
        {
            "x": pd.Series([0, 1, 2], dtype=np.int64),
            "p": pd.Series([0, 0, 1], dtype=np.int64),
            "v_f1": pd.Series([0, np.nan, 2], dtype=np.float64),
            "v_f2": pd.Series([0, 1, np.nan], dtype=np.float64),
            "v_f3": pd.Series([np.nan, np.nan, np.nan], dtype=np.float64),
            "v_s1": pd.Series(["a", None, "c"], dtype=object),
            "v_s2": pd.Series(["a", "b", None], dtype=object),
            "v_s3": pd.Series([None, None, None], dtype=object),
        }
    )
    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="nulled_cube",
        index_columns=[],
    )
    build_cube(data=df, store=function_store, cube=cube)

    for col in df.columns:
        # only iterate over the value columns (not the dimension / partition column):
        if not col.startswith("v"):
            continue

        # col_type will be either 'f' for float or 's' for string; see column
        # names above
        col_type = col.split("_")[1][0]
        if col_type == "f":
            value = 1.2
        elif col_type == "s":
            value = "foo"
        else:
            raise RuntimeError("unknown type")

        cond = C(col) == value

        df_expected = cond.filter_df(df).reset_index(drop=True)

        result = driver(cube=cube, store=function_store, conditions=cond)

        if df_expected.empty:
            assert len(result) == 0
        else:
            assert len(result) == 1
            df_actual = result[0]
            pdt.assert_frame_equal(df_actual, df_expected, check_like=True)


def test_rowgroups_are_applied_when_df_serializer_is_passed_to_append_cube(
    driver, function_store, chunk_size_build, chunk_size_append
):
    """
    Test that the dataset is split into row groups depending on the chunk size.

    Partitions built with ``chunk_size=None`` should keep a single row group after
    the append. Partitions that are newly created with ``chunk_size>0`` should be
    split into row groups accordingly.
    """
    # Build cube
    df = pd.DataFrame(
        data={"x": [0, 1, 2, 3], "p": [0, 0, 1, 1]}, columns=["x", "p"],
    )
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="rg-cube")
    build_cube(
        data=df,
        cube=cube,
        store=function_store,
        df_serializer=ParquetSerializer(chunk_size=chunk_size_build),
    )

    # Append to cube
    df_append = pd.DataFrame(
        data={"x": [0, 1, 2, 3], "p": [2, 3, 3, 3]}, columns=["x", "p"],
    )
    result = driver(
        data={"seed": df_append},
        cube=cube,
        store=function_store,
        df_serializer=ParquetSerializer(chunk_size=chunk_size_append),
    )
    dataset = result["seed"].load_all_indices(function_store())

    part_num_rows = {0: 2, 1: 2, 2: 1, 3: 3}
    part_chunk_size = {
        0: chunk_size_build,
        1: chunk_size_build,
        2: chunk_size_append,
        3: chunk_size_append,
    }

    assert len(dataset.partitions) == 4
    assert_num_row_groups(function_store(), dataset, part_num_rows, part_chunk_size)


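# Minimal sketch of the check that ``assert_num_row_groups`` is assumed to perform
# (the real helper is imported from the shared test utilities; the table name
# "table" and the indexing by sorted partition label are assumptions made for this
# illustration): each partition file should contain ceil(num_rows / chunk_size)
# row groups, or exactly one row group when chunk_size is None.
def _sketch_assert_num_row_groups(store, dataset, part_num_rows, part_chunk_size):
    import math

    import pyarrow.parquet as pq

    for part_idx, label in enumerate(sorted(dataset.partitions)):
        # One Parquet file per partition and table is assumed here.
        key = dataset.partitions[label].files["table"]
        num_row_groups = pq.ParquetFile(store.open(key)).metadata.num_row_groups

        chunk_size = part_chunk_size[part_idx]
        if chunk_size is None:
            assert num_row_groups == 1
        else:
            assert num_row_groups == math.ceil(part_num_rows[part_idx] / chunk_size)

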
def built_cube(store, cube, df_source, df_enrich):
    with freeze_time(datetime(2018, 1, 31, 14, 3, 22)):
        build_cube(data={"source": df_source}, cube=cube, store=store)

    with freeze_time(datetime(2019, 2, 28, 13, 1, 17)):
        extend_cube(
            data={"enrich": df_enrich},
            cube=cube,
            store=store,
            partition_on={"enrich": ["part", "q"]},
        )

    return cube


def test_cube_update_secondary_indices_subset(function_store, driver):
    cube1 = Cube(
        dimension_columns=["A"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["indexed"],
    )
    df_1 = pd.DataFrame({"A": range(10), "P": 1, "indexed": 1, "not-indexed": 1})
    build_cube(
        data={"source": df_1},
        cube=cube1,
        store=function_store,
        metadata={"source": {"meta_at_create": "data"}},
    )

    cube2 = Cube(
        dimension_columns=["A"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
    )
    df_2 = pd.DataFrame({"A": range(10, 20), "P": 1, "indexed": 2, "not-indexed": 1})
    driver(
        data={"source": df_2}, cube=cube2, store=function_store, remove_conditions=None
    )

    dataset_uuid = cube2.ktk_dataset_uuid(cube2.seed_dataset)
    dm = DatasetMetadata.load_from_store(
        dataset_uuid, function_store(), load_all_indices=True
    )
    obs_values = dm.indices["indexed"].observed_values()

    assert sorted(obs_values) == [1, 2]

    cube2 = Cube(
        dimension_columns=["A"],
        partition_columns=["P"],
        uuid_prefix="cube",
        seed_dataset="source",
        index_columns=["not-indexed"],
    )
    with pytest.raises(
        ValueError,
        match='ExplicitSecondaryIndex or PartitionIndex "not-indexed" is missing in dataset',
    ):
        driver(
            data={"source": df_2},
            cube=cube2,
            store=function_store,
            remove_conditions=None,
        )


def other_part_cube(module_store, data_no_part):
    cube = Cube(
        dimension_columns=["x", "y", "z"],
        partition_columns=["p", "q"],
        uuid_prefix="other_part_cube",
        index_columns=["i1", "i2", "i3"],
    )
    build_cube(
        data=data_no_part,
        store=module_store,
        cube=cube,
        partition_on={"enrich_dense": ["i2"], "enrich_sparse": ["i3"]},
    )
    return cube


def test_delete_twice(driver, function_store):
    df = pd.DataFrame({"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v": [10, 11, 12, 13]})
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")
    build_cube(data=df, cube=cube, store=function_store)
    driver(cube=cube, store=function_store)
    driver(cube=cube, store=function_store)

    assert set(function_store().keys()) == set()


def test_single_rowgroup_when_df_serializer_is_not_passed_to_append_cube(
    driver, function_store
):
    """
    Test that each partition contains a single row group when no ``df_serializer``
    is passed (the default path).
    """
    # Build cube
    df = pd.DataFrame(
        data={"x": [0, 1, 2, 3], "p": [0, 0, 1, 1]}, columns=["x", "p"],
    )
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="rg-cube")
    build_cube(
        data=df,
        cube=cube,
        store=function_store,
    )

    # Append to cube
    df_append = pd.DataFrame(
        data={"x": [0, 1, 2, 3], "p": [2, 3, 3, 3]}, columns=["x", "p"],
    )
    result = driver(
        data={"seed": df_append},
        cube=cube,
        store=function_store,
    )
    dataset = result["seed"].load_all_indices(function_store())

    part_num_rows = {0: 2, 1: 2, 2: 1, 3: 3}
    part_chunk_size = {0: None, 1: None, 2: None, 3: None}

    assert len(dataset.partitions) == 4
    assert_num_row_groups(function_store(), dataset, part_num_rows, part_chunk_size)


def test_keep_other(driver, function_store):
    df = pd.DataFrame({"x": [0, 1, 2, 3], "p": [0, 0, 1, 1], "v": [10, 11, 12, 13]})
    cube1 = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube1")
    cube2 = cube1.copy(uuid_prefix="cube2")

    build_cube(data=df, cube=cube1, store=function_store)
    keys = set(function_store().keys())

    build_cube(data=df, cube=cube2, store=function_store)

    driver(cube=cube2, store=function_store)

    assert set(function_store().keys()) == keys


def test_rowgroups_are_applied_when_df_serializer_is_passed_to_update_cube(
    driver, function_store, chunk_size_build, chunk_size_update
):
    """
    Test that the dataset is split into row groups depending on the chunk size.

    Partitions built with ``chunk_size=None`` should keep a single row group if
    they are not touched by the update. Partitions that are newly created or
    replaced with ``chunk_size>0`` should be split into row groups accordingly.
    """
    # Build cube
    df = pd.DataFrame(data={"x": [0, 1], "p": [0, 1]}, columns=["x", "p"])
    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="rg-cube")
    build_cube(
        data=df,
        cube=cube,
        store=function_store,
        df_serializer=ParquetSerializer(chunk_size=chunk_size_build),
    )

    # Update cube - replace p=1 and append p=2 partitions
    df_update = pd.DataFrame(
        data={"x": [0, 1, 2, 3], "p": [1, 1, 2, 2]}, columns=["x", "p"],
    )
    result = driver(
        data={"seed": df_update},
        remove_conditions=(C("p") == 1),  # Remove p=1 partition
        cube=cube,
        store=function_store,
        df_serializer=ParquetSerializer(chunk_size=chunk_size_update),
    )
    dataset = result["seed"].load_all_indices(function_store())

    part_num_rows = {0: 1, 1: 2, 2: 2}
    part_chunk_size = {
        0: chunk_size_build,
        1: chunk_size_update,
        2: chunk_size_update,
    }

    assert len(dataset.partitions) == 3
    assert_num_row_groups(function_store(), dataset, part_num_rows, part_chunk_size)