def test_collect_dataset_metadata_concat(store_factory):
    """Smoke-test concatenation of empty and non-empty dataset metadata collections."""
    df = pd.DataFrame(data={"A": [1, 1, 1, 1], "b": [1, 1, 2, 2]})
    store_dataframes_as_dataset(
        store=store_factory, dataset_uuid="dataset_uuid", dfs=[df], partition_on=["A"]
    )
    df_stats1 = collect_dataset_metadata(
        store=store_factory, dataset_uuid="dataset_uuid", table_name="table",
    ).compute()

    # Remove all partitions of the dataset
    update_dataset_from_dataframes(
        [], store=store_factory, dataset_uuid="dataset_uuid", delete_scope=[{"A": 1}]
    )

    df_stats2 = collect_dataset_metadata(
        store=store_factory, dataset_uuid="dataset_uuid", table_name="table",
    ).compute()
    pd.concat([df_stats1, df_stats2])
def test_collect_dataset_metadata_delete_dataset(store_factory):
    df = pd.DataFrame(data={"A": [1, 1, 1, 1], "b": [1, 1, 2, 2]})
    store_dataframes_as_dataset(
        store=store_factory, dataset_uuid="dataset_uuid", dfs=[df], partition_on=["A"]
    )
    # Remove all partitions of the dataset
    update_dataset_from_dataframes(
        [], store=store_factory, dataset_uuid="dataset_uuid", delete_scope=[{"A": 1}]
    )
    df_stats = collect_dataset_metadata(
        store=store_factory, dataset_uuid="dataset_uuid",
    ).compute()

    expected = pd.DataFrame(columns=_METADATA_SCHEMA)
    expected = expected.astype(_METADATA_SCHEMA)
    pd.testing.assert_frame_equal(expected, df_stats)
def test_empty_dataset(self, cube, function_store):
    expected = {
        cube.seed_dataset: store_data(
            cube=cube,
            function_store=function_store,
            df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]}),
            name=cube.seed_dataset,
        ),
        "enrich": store_data(
            cube=cube,
            function_store=function_store,
            df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0], "v1": 100}),
            name="enrich",
            metadata_storage_format="msgpack",
        ),
    }
    expected = {
        filter_ktk_cube_dataset_id: update_dataset_from_dataframes(
            [], store=function_store, dataset_uuid=ds.uuid, delete_scope=[{}]
        )
        for filter_ktk_cube_dataset_id, ds in expected.items()
    }
    actual = discover_datasets(cube, function_store)
    assert_datasets_equal(actual, expected)
def append_to_cube(data, cube, store, metadata=None):
    """
    Append data to existing cube.

    For details on ``data`` and ``metadata``, see :meth:`build_cube`.

    .. important::

        Physical partitions must be updated as a whole. If only single rows within a
        physical partition are updated, the old data is treated as "removed".

    .. hint::

        To have better control over the overwrite "mask" (i.e. which partitions are
        overwritten), you should use :meth:`remove_partitions` beforehand.

    Parameters
    ----------
    data: Union[pd.DataFrame, Dict[str, pd.DataFrame], List[Union[pd.DataFrame, Dict[str, pd.DataFrame]]]]
        Data that should be written to the cube. If only a single dataframe is given,
        it is assumed to be the seed dataset.
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store: simplekv.KeyValueStore
        Store to which the data should be written.
    metadata: Optional[Dict[str, Dict[str, Any]]]
        Metadata for every dataset, optional. For every dataset, only given keys are
        updated/replaced. Deletion of metadata keys is not possible.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        DatasetMetadata for every dataset written.
    """
    data = _normalize_user_input(data, cube)
    existing_datasets = discover_datasets(cube, store)
    partition_on = {k: v.partition_keys for k, v in existing_datasets.items()}

    check_existing_datasets(
        existing_datasets=existing_datasets, ktk_cube_dataset_ids=set(data.keys())
    )

    # Do all data preparation before writing anything. existing_payload is set to
    # empty because we're not checking against any existing payload; ktk will account
    # for the compat check within a single dataset.
    data = _prepare_data_for_ktk_all(
        data=data, cube=cube, existing_payload=set(), partition_on=partition_on
    )

    # update_dataset_from_dataframes requires a store factory, so create one
    # if not provided
    if not callable(store):

        def store_factory():
            return store

    else:
        store_factory = store

    updated_datasets = {}
    for ktk_cube_dataset_id, part in data.items():
        updated_datasets[ktk_cube_dataset_id] = update_dataset_from_dataframes(
            store=store_factory,
            dataset_uuid=cube.ktk_dataset_uuid(ktk_cube_dataset_id),
            df_list=part,
            partition_on=list(partition_on[ktk_cube_dataset_id]),
            df_serializer=KTK_CUBE_DF_SERIALIZER,
            metadata=prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata),
        )

    return apply_postwrite_checks(
        datasets=updated_datasets,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )
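# Minimal usage sketch for ``append_to_cube`` (illustrative only). The cube
# parameters, ``df_new``, and ``get_store`` below are hypothetical stand-ins for an
# existing cube definition, a dataframe matching the seed dataset's schema, and a
# simplekv-compatible store; none of them are defined in this module.
#
#     from kartothek.core.cube.cube import Cube
#
#     cube = Cube(
#         dimension_columns=["x"], partition_columns=["p"], uuid_prefix="my_cube"
#     )
#     datasets = append_to_cube(data=df_new, cube=cube, store=get_store())
#
# Since whole physical partitions are replaced on append, call ``remove_partitions``
# first if you need finer control over which existing partitions are overwritten.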