def test_msgpack_efficiency(self, cube, function_store):
    """
    The store must be iterated only once even though two metadata suffixes are searched.

    In addition, no key may be fetched more than once during discovery.
    """

    def _single_row_frame():
        # A fresh frame per call so both writes receive their own object.
        return pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]})

    # First write the seed dataset with msgpack metadata, then overwrite it
    # without specifying a metadata storage format.
    store_data(
        cube=cube,
        function_store=function_store,
        df=_single_row_frame(),
        name=cube.seed_dataset,
        metadata_storage_format="msgpack",
    )
    store_data(
        cube=cube,
        function_store=function_store,
        df=_single_row_frame(),
        name=cube.seed_dataset,
        overwrite=True,
    )

    class CountingStore(KeyValueStore):
        """Delegating store that records how often each access path is used."""

        def __init__(self, store):
            self._store = store
            self._iter_keys_called = 0
            self._iter_prefixes_called = 0
            self._get_called = Counter()

        def iter_keys(self, prefix=""):
            self._iter_keys_called += 1
            return self._store.iter_keys(prefix)

        def iter_prefixes(self, delimiter, prefix=""):
            self._iter_prefixes_called += 1
            return self._store.iter_prefixes(delimiter, prefix)

        def get(self, key):
            self._get_called[key] += 1
            return self._store.get(key)

    counting_store = CountingStore(function_store())
    discover_datasets_unchecked(cube.uuid_prefix, counting_store)

    # Discovery has to rely on a single prefix scan and never on a key scan.
    assert counting_store._iter_keys_called == 0
    assert counting_store._iter_prefixes_called == 1
    # No key was read more than once.
    assert max(counting_store._get_called.values()) == 1
def test_partial_delete(driver, function_store):
    """
    Deleting single datasets removes exactly the keys that belong to them.

    The driver is exercised once with a list of dataset IDs and once with a
    dictionary of dataset metadata objects.
    """
    base_columns = {"x": [0, 1, 2, 3], "p": [0, 0, 1, 1]}
    df_seed = pd.DataFrame({**base_columns, "v": [10, 11, 12, 13]})
    df_1 = pd.DataFrame({**base_columns, "a": [20, 21, 22, 23]})
    df_2 = pd.DataFrame({**base_columns, "b": [20, 21, 22, 23]})

    cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="cube")
    datasets = build_cube(
        data={cube.seed_dataset: df_seed, "enrich-1": df_1, "enrich-2": df_2},
        cube=cube,
        store=function_store,
    )

    def _keys_of(ktk_cube_dataset_id):
        # Discover only the requested dataset and collect its store keys.
        discovered = discover_datasets_unchecked(
            uuid_prefix=cube.uuid_prefix,
            store=function_store,
            filter_ktk_cube_dataset_ids=[ktk_cube_dataset_id],
        )
        return get_dataset_keys(discovered[ktk_cube_dataset_id])

    enrich_1_keys = _keys_of("enrich-1")
    enrich_2_keys = _keys_of("enrich-2")
    all_keys = set(function_store().keys())

    # Delete by dataset ID.
    driver(cube=cube, store=function_store, datasets=["enrich-1"])
    remaining = set(function_store().keys())
    assert remaining == all_keys - enrich_1_keys

    # Delete by passing the dataset metadata mapping.
    driver(cube=cube, store=function_store, datasets={"enrich-2": datasets["enrich-2"]})
    remaining = set(function_store().keys())
    assert remaining == all_keys - enrich_1_keys - enrich_2_keys
def cleanup_cube(cube, store):
    """
    Delete keys of the cube that are not referenced by any of its datasets.

    .. important::
        All untracked keys which start with the cube's `uuid_prefix` followed by the `KTK_CUBE_UUID_SEPERATOR`
        (e.g. `my_cube_uuid++seed...`) will be deleted by this routine. These keys may be leftovers from past
        overwrites or index updates.

    Parameters
    ----------
    cube: Cube
        Cube specification.
    store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        KV store, or a factory returning one.
    """
    kv_store = store() if callable(store) else store

    discovered = discover_datasets_unchecked(uuid_prefix=cube.uuid_prefix, store=kv_store)
    obsolete_keys = get_keys_to_clean(cube.uuid_prefix, discovered, kv_store)

    # Sorted so that the sequence of deletions is deterministic.
    for key in sorted(obsolete_keys):
        kv_store.delete(key)
def test_partial_copy_exclude_pattern(
    cli, built_cube, skv, store, store2, exclude_pattern, copy_tables
):
    """
    ``copy --exclude=<pattern>`` copies exactly the keys of the datasets in ``copy_tables``.
    """
    # Add one more dataset so that the exclude pattern has something to filter.
    extra_table = pd.DataFrame(
        {
            "x": [0, 1],
            "y": [0, 0],
            "p": 0,
            "q": ["a", "a"],
            "mycolumn": ["a", "b"],
        }
    )
    extend_cube(data={"mytable": extra_table}, cube=built_cube, store=store)

    copied_datasets = discover_datasets_unchecked(
        uuid_prefix=built_cube.uuid_prefix,
        store=store,
        filter_ktk_cube_dataset_ids=copy_tables,
    )
    expected_keys = set()
    for table in copy_tables:
        expected_keys = expected_keys | get_dataset_keys(copied_datasets[table])

    cli_args = [
        "--store=cubes",
        "my_cube",
        "copy",
        "--tgt_store=cubes2",
        "--exclude=" + exclude_pattern,
    ]
    result = cli(*cli_args)

    assert result.exit_code == 0
    assert set(store2.keys()) == expected_keys
def test_msgpack_clean(self, cube, function_store):
    """
    Discovery returns both a dataset stored with default metadata and one stored with msgpack metadata.
    """
    seed_ds = store_data(
        cube=cube,
        function_store=function_store,
        df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]}),
        name=cube.seed_dataset,
    )
    enrich_ds = store_data(
        cube=cube,
        function_store=function_store,
        df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]}),
        name="enrich",
        metadata_storage_format="msgpack",
    )
    expected = {cube.seed_dataset: seed_ds, "enrich": enrich_ds}

    actual = discover_datasets_unchecked(cube.uuid_prefix, function_store)

    assert_datasets_equal(actual, expected)
def test_filter_partial_datasets_found(self, cube, function_store):
    """
    With two datasets stored, filtering discovery by ``"enrich"`` is checked against that dataset only.
    """

    def _write(name):
        # Both datasets carry the same single-row payload.
        return store_data(
            cube=cube,
            function_store=function_store,
            df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]}),
            name=name,
        )

    enrich_dataset = _write("enrich")
    _write("mytable")

    expected = {"enrich": enrich_dataset}
    actual = discover_datasets_unchecked(
        cube.uuid_prefix,
        function_store,
        filter_ktk_cube_dataset_ids=["enrich"],
    )

    assert_dataset_issubset(actual, expected)
def test_no_common_metadata(self, cube, function_store):
    """
    A dataset of which only the ``.by-dataset-metadata.json`` key is left is not discovered.
    """
    seed_ds = store_data(
        cube=cube,
        function_store=function_store,
        df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]}),
        name=cube.seed_dataset,
    )
    expected = {cube.seed_dataset: seed_ds}

    store_data(
        cube=cube,
        function_store=function_store,
        df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]}),
        name="enrich",
    )

    enrich_uuid = cube.ktk_dataset_uuid("enrich")
    metadata_key = enrich_uuid + ".by-dataset-metadata.json"
    keys = set(function_store().keys())
    assert metadata_key in keys

    # Strip every key of the "enrich" dataset except its metadata file.
    for key in keys:
        if key == metadata_key:
            continue
        if key.startswith(enrich_uuid):
            function_store().delete(key)

    actual = discover_datasets_unchecked(cube.uuid_prefix, function_store)
    assert_datasets_equal(actual, expected)
def test_msgpack_priority(self, cube, function_store):
    """
    JSON metadata files have priority in kartothek, so the discovery should respect this.
    """

    def _frame(value_column):
        # Same dimensions/partitions each time; only the payload column differs.
        return pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0], value_column: [0]})

    # 1st write: msgpack metadata.
    store_data(
        cube=cube,
        function_store=function_store,
        df=_frame("v1"),
        name=cube.seed_dataset,
        metadata_storage_format="msgpack",
    )

    # 2nd write: no explicit metadata storage format -- this is the dataset we expect to discover.
    json_backed = store_data(
        cube=cube,
        function_store=function_store,
        df=_frame("v2"),
        name=cube.seed_dataset,
        overwrite=True,
    )
    expected = {cube.seed_dataset: json_backed}

    # 3rd write: msgpack metadata again, written after the expected dataset.
    store_data(
        cube=cube,
        function_store=function_store,
        df=_frame("v3"),
        name=cube.seed_dataset,
        metadata_storage_format="msgpack",
        overwrite=True,
    )

    actual = discover_datasets_unchecked(cube.uuid_prefix, function_store)
    assert_datasets_equal(actual, expected)
def cleanup_cube_bag(cube, store, blocksize=100):
    """
    Build a dask bag that deletes keys of the cube which are not referenced by any of its datasets.

    .. important::
        All untracked keys which start with the cube's `uuid_prefix` followed by the `KTK_CUBE_UUID_SEPERATOR`
        (e.g. `my_cube_uuid++seed...`) will be deleted by this routine. These keys may be leftovers from past
        overwrites or index updates.

    Parameters
    ----------
    cube: Cube
        Cube specification.
    store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        KV store.
    blocksize: int
        Number of keys to delete at once.

    Returns
    -------
    bag: dask.bag.Bag
        A dask bag that performs the given operation. May contain multiple partitions.
    """
    check_store_factory(store)
    check_blocksize(blocksize)

    # The set of keys to remove is determined right away; only the deletion
    # itself is left to the returned bag.
    store_obj = store()
    discovered = discover_datasets_unchecked(uuid_prefix=cube.uuid_prefix, store=store)
    obsolete_keys = get_keys_to_clean(cube.uuid_prefix, discovered, store_obj)

    key_bag = db.from_sequence(seq=sorted(obsolete_keys), partition_size=blocksize)
    return key_bag.map_partitions(_delete, store=store)
def delete_cube(cube, store, datasets=None):
    """
    Delete a cube, or selected datasets of it, from the store.

    .. important::
        This routine only deletes tracked files. Garbage and leftovers from old cubes and failed operations are
        NOT removed.

    Parameters
    ----------
    cube: Cube
        Cube specification.
    store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        KV store, or a factory returning one.
    datasets: Union[None, Iterable[str], Dict[str, kartothek.core.dataset.DatasetMetadata]]
        Datasets to delete, must all be part of the cube. May be either the result of :meth:`discover_datasets`, a list
        of Ktk_cube dataset ID or ``None`` (in which case entire cube will be deleted).
    """
    kv_store = store() if callable(store) else store

    if isinstance(datasets, dict):
        selected = datasets
    else:
        # ``None`` or an iterable of IDs: resolve to metadata objects first.
        selected = discover_datasets_unchecked(
            uuid_prefix=cube.uuid_prefix,
            store=kv_store,
            filter_ktk_cube_dataset_ids=datasets,
        )

    doomed_keys = set()
    for ktk_cube_dataset_id in sorted(selected.keys()):
        doomed_keys |= get_dataset_keys(selected[ktk_cube_dataset_id])

    # Sorted so that the sequence of deletions is deterministic.
    for key in sorted(doomed_keys):
        kv_store.delete(key)
def collect_stats(cube, store, datasets=None):
    """
    Gather statistics for the given cube.

    Parameters
    ----------
    cube: Cube
        Cube specification.
    store: simplekv.KeyValueStore
        KV store that preserves the cube. A callable is also accepted and is called to obtain the store.
    datasets: Union[None, Iterable[str], Dict[str, kartothek.core.dataset.DatasetMetadata]]
        Datasets to query, must all be part of the cube. May be either the result of :meth:`discover_datasets`, a list
        of Ktk_cube dataset ID or ``None`` (in which case auto-discovery will be used).

    Returns
    -------
    stats: Dict[str, Dict[str, int]]
        Statistics per ktk_cube dataset ID.
    """
    kv_store = store() if callable(store) else store

    if isinstance(datasets, dict):
        selected = datasets
    else:
        selected = discover_datasets_unchecked(
            uuid_prefix=cube.uuid_prefix,
            store=kv_store,
            filter_ktk_cube_dataset_ids=datasets,
        )

    # All metapartitions are handled as one single block here.
    metapartitions = get_metapartitions_for_stats(selected)
    block_stats = collect_stats_block(metapartitions, kv_store)
    return reduce_stats([block_stats])
def test_no_seed(self, cube, function_store):
    """
    Discovery also returns a non-seed dataset when it is the only dataset stored.
    """
    enrich_ds = store_data(
        cube=cube,
        function_store=function_store,
        df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]}),
        name="enrich",
    )
    expected = {"enrich": enrich_ds}

    actual = discover_datasets_unchecked(cube.uuid_prefix, function_store)

    assert_datasets_equal(actual, expected)
def test_partial_copy_include_pattern_nomatch(cli, built_cube, skv, store, store2):
    """
    ``copy --include`` exits with code 2 when one of the given patterns matches no dataset.
    """
    copied_datasets = discover_datasets_unchecked(
        uuid_prefix=built_cube.uuid_prefix,
        store=store,
        filter_ktk_cube_dataset_ids=["source"],
    )
    # The result is not used below; the lookup is kept as in the original test.
    copy_keys = get_dataset_keys(copied_datasets["source"])  # noqa

    cli_args = (
        "--store=cubes",
        "my_cube",
        "copy",
        "--tgt_store=cubes2",
        "--include=x*,source",
    )
    result = cli(*cli_args)

    assert result.exit_code == 2
    assert "Error: Could not find dataset x*" in result.output
def test_other_files(self, cube, function_store):
    """
    A stray key below another dataset's UUID does not make that dataset show up in discovery.
    """
    seed_ds = store_data(
        cube=cube,
        function_store=function_store,
        df=pd.DataFrame({"x": [0], "y": [0], "p": [0], "q": [0]}),
        name=cube.seed_dataset,
    )
    expected = {cube.seed_dataset: seed_ds}

    # Empty payload stored under the "enrich" dataset UUID; no dataset is written for it.
    stray_key = cube.ktk_dataset_uuid("enrich") + "/foo"
    function_store().put(stray_key, b"")

    actual = discover_datasets_unchecked(cube.uuid_prefix, function_store)
    assert_datasets_equal(actual, expected)
def collect_stats_bag(
    cube: Cube,
    store: StoreFactory,
    datasets: Optional[Union[Iterable[str], Dict[str, DatasetMetadata]]] = None,
    blocksize: int = 100,
):
    """
    Build a dask bag that gathers statistics for the given cube.

    Parameters
    ----------
    cube
        Cube specification.
    store
        KV store that preserves the cube.
    datasets
        Datasets to query, must all be part of the cube. May be either the result of
        :func:`~kartothek.api.discover.discover_datasets`, a list
        of Ktk_cube dataset ID or ``None`` (in which case auto-discovery will be used).
    blocksize
        Number of partitions to scan at once.

    Returns
    -------
    bag: dask.bag.Bag
        A dask bag that returns a single result of the form ``Dict[str, Dict[str, int]]`` and contains statistics per
        ktk_cube dataset ID.
    """
    check_store_factory(store)
    check_blocksize(blocksize)

    if isinstance(datasets, dict):
        selected = datasets
    else:
        selected = discover_datasets_unchecked(
            uuid_prefix=cube.uuid_prefix,
            store=store,
            filter_ktk_cube_dataset_ids=datasets,
        )

    metapartitions = get_metapartitions_for_stats(selected)

    # One stats block per bag partition, afterwards folded into a single result.
    per_block = db.from_sequence(
        seq=metapartitions, partition_size=blocksize
    ).map_partitions(collect_stats_block, store=store)
    return per_block.reduction(
        perpartition=_obj_to_list,
        aggregate=_reduce_stats,
        split_every=False,
        out_type=db.Bag,
    )
def test_partial_delete_exclude_pattern(
    cli, built_cube, skv, store, exclude_pattern, delete_tables
):
    """
    ``delete --exclude=<pattern>`` removes exactly the keys of the datasets in ``delete_tables``.
    """
    discovered = discover_datasets_unchecked(
        uuid_prefix=built_cube.uuid_prefix,
        store=store,
        filter_ktk_cube_dataset_ids=delete_tables,
    )
    delete_keys = set()
    for table in delete_tables:
        delete_keys = delete_keys | get_dataset_keys(discovered[table])

    keys_before = set(store.keys())

    result = cli("--store=cubes", "my_cube", "delete", "--exclude=" + exclude_pattern)

    assert result.exit_code == 0
    keys_after = set(store.keys())
    assert keys_after == keys_before - delete_keys
def delete_cube_bag(
    cube: Cube,
    store: StoreFactory,
    blocksize: int = 100,
    datasets: Optional[Union[Iterable[str], Dict[str, DatasetMetadata]]] = None,
):
    """
    Build a dask bag that deletes a cube, or selected datasets of it, from the store.

    .. important::
        This routine only deletes tracked files. Garbage and leftovers from old cubes and failed operations are
        NOT removed.

    Parameters
    ----------
    cube
        Cube specification.
    store
        KV store.
    blocksize
        Number of keys to delete at once.
    datasets
        Datasets to delete, must all be part of the cube. May be either the result of
        :func:`~kartothek.api.discover.discover_datasets`, a list
        of Ktk_cube dataset ID or ``None`` (in which case entire cube will be deleted).

    Returns
    -------
    bag: dask.bag.Bag
        A dask bag that performs the given operation. May contain multiple partitions.
    """
    check_store_factory(store)
    check_blocksize(blocksize)

    if isinstance(datasets, dict):
        selected = datasets
    else:
        selected = discover_datasets_unchecked(
            uuid_prefix=cube.uuid_prefix,
            store=store,
            filter_ktk_cube_dataset_ids=datasets,
        )

    doomed_keys = set()
    for ktk_cube_dataset_id in sorted(selected.keys()):
        doomed_keys |= get_dataset_keys(selected[ktk_cube_dataset_id])

    # The key list is fixed here; the bag only carries out the deletions.
    key_bag = db.from_sequence(seq=sorted(doomed_keys), partition_size=blocksize)
    return key_bag.map_partitions(_delete, store=store)
def test_partial_copy_dataset_dict(
    driver, function_store, function_store2, cube, built_cube
):
    """
    Copying with a dictionary of dataset metadata transfers exactly the keys of those datasets.
    """
    copied_ids = ["seed", "enrich"]

    driver(
        cube=cube,
        src_store=function_store,
        tgt_store=function_store2,
        datasets={ds_id: built_cube[ds_id] for ds_id in copied_ids},
    )

    all_datasets = discover_datasets_unchecked(
        uuid_prefix=cube.uuid_prefix,
        store=function_store,
        filter_ktk_cube_dataset_ids=copied_ids,
    )
    copied_ds_keys = set()
    for ds_id in copied_ids:
        copied_ds_keys |= get_dataset_keys(all_datasets[ds_id])

    tgt_store_keys = set(function_store2().keys())
    assert copied_ds_keys == tgt_store_keys
def get_datasets_to_copy(
    cube: Cube,
    src_store: Union[Callable[[], KeyValueStore], KeyValueStore],
    tgt_store: Union[Callable[[], KeyValueStore], KeyValueStore],
    overwrite: bool,
    datasets: Optional[Union[Iterable[str], Dict[str, DatasetMetadata]]] = None,
) -> Dict[str, DatasetMetadata]:
    """
    Determine all dataset names of a given cube that should be copied and apply additional consistency checks.
    Copying only a specific set of datasets is possible by providing a list of dataset names via the parameter
    `datasets`.

    Parameters
    ----------
    cube:
        Cube specification.
    src_store:
        Source KV store.
    tgt_store:
        Target KV store.
    overwrite:
        If possibly existing datasets in the target store should be overwritten.
    datasets:
        Datasets to copy, must all be part of the cube. May be either the result of
        :func:`~kartothek.api.discover.discover_datasets`, an iterable of Ktk_cube dataset ID or ``None``
        (in which case entire cube will be copied).

    Returns
    -------
    all_datasets: Dict[str, DatasetMetadata]
        All datasets that should be copied.

    Raises
    ------
    RuntimeError
        If ``datasets`` is ``None`` and nothing is found in the source store, if a requested dataset is not found
        in the source store, or if a dataset to copy already exists in the target store while ``overwrite`` is
        false.
    """
    if datasets is not None and not isinstance(datasets, (dict, str)):
        # ``datasets`` is read twice below (discovery filter and unknown-dataset
        # check). A one-shot iterator would be exhausted by the first read and
        # silently disable the check, so materialize it once.
        datasets = list(datasets)

    if isinstance(datasets, dict):
        new_datasets = datasets
    else:
        new_datasets = discover_datasets_unchecked(
            uuid_prefix=cube.uuid_prefix,
            store=src_store,
            filter_ktk_cube_dataset_ids=datasets,
        )

    if datasets is None:
        # Whole-cube copy: there has to be at least one dataset in the source.
        if not new_datasets:
            raise RuntimeError("{} not found in source store".format(cube))
    else:
        unknown_datasets = set(datasets) - set(new_datasets)
        if unknown_datasets:
            raise RuntimeError(
                "{cube}, datasets {datasets} do not exist in source store".format(
                    cube=cube, datasets=unknown_datasets
                )
            )

    existing_datasets = discover_datasets_unchecked(cube.uuid_prefix, tgt_store)

    if not overwrite:
        for ktk_cube_dataset_id in sorted(new_datasets.keys()):
            if ktk_cube_dataset_id in existing_datasets:
                raise RuntimeError(
                    'Dataset "{uuid}" exists in target store but overwrite was set to False'.format(
                        uuid=new_datasets[ktk_cube_dataset_id].uuid
                    )
                )

    # Check the state the target would have after the copy: the datasets
    # already present there, with the copied ones taking precedence.
    all_datasets = copy(existing_datasets)
    all_datasets.update(new_datasets)
    check_datasets(all_datasets, cube)

    return new_datasets
def build_cube_from_bag_internal(
    data: db.Bag,
    cube: Cube,
    store: StoreFactory,
    ktk_cube_dataset_ids: Optional[Iterable[str]],
    metadata: Optional[Dict[str, Dict[str, Any]]],
    overwrite: bool,
    partition_on: Optional[Dict[str, Iterable[str]]],
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    """
    Create dask computation graph that builds a cube with the data supplied from a dask bag.

    Parameters
    ----------
    data: dask.bag.Bag
        Bag containing dataframes
    cube:
        Cube specification.
    store:
        Store to which the data should be written to.
    ktk_cube_dataset_ids:
        Datasets that will be written, must be specified in advance. If left unprovided, it is assumed that only the
        seed dataset will be written.
    metadata:
        Metadata for every dataset.
    overwrite:
        If possibly existing datasets should be overwritten.
    partition_on:
        Optional partition-on attributes for datasets (dictionary mapping :term:`Dataset ID` -> columns).
    df_serializer:
        Optional Dataframe to Parquet serializer

    Returns
    -------
    metadata_dict: dask.bag.Bag
        A dask bag object containing the compute graph to build a cube returning the dict of dataset metadata objects.
        The bag has a single partition with a single element.
    """
    check_store_factory(store)

    if ktk_cube_dataset_ids is not None:
        ktk_cube_dataset_ids = sorted(ktk_cube_dataset_ids)
    else:
        ktk_cube_dataset_ids = [cube.seed_dataset]

    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)
    existing_datasets = discover_datasets_unchecked(cube.uuid_prefix, store)
    check_datasets_prebuild(ktk_cube_dataset_ids, cube, existing_datasets)
    prep_partition_on = prepare_ktk_partition_on(
        cube, ktk_cube_dataset_ids, partition_on
    )
    cube = ensure_valid_cube_indices(existing_datasets, cube)

    # Per-element preparation pipeline, built up one mapping step at a time.
    data = data.map(multiplex_user_input, cube=cube)
    data = data.map(_check_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids)
    data = data.map(
        _multiplex_prepare_data_for_ktk,
        cube=cube,
        existing_payload=set(),
        partition_on=prep_partition_on,
    )

    metadata_per_dataset = {
        ktk_cube_dataset_id: prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata)
        for ktk_cube_dataset_id in ktk_cube_dataset_ids
    }
    data = _store_bag_as_dataset_parallel(
        bag=data,
        store=store,
        cube=cube,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        metadata=metadata_per_dataset,
        overwrite=overwrite,
        update=False,
        existing_datasets=existing_datasets,
        df_serializer=df_serializer,
    )

    return data.map(
        apply_postwrite_checks,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )
def test_filter_no_datasets_found(self, cube, function_store):
    """
    Filtering for a dataset ID when nothing was stored yields an empty mapping.
    """
    discovered = discover_datasets_unchecked(
        cube.uuid_prefix,
        function_store,
        filter_ktk_cube_dataset_ids=["enrich"],
    )
    assert discovered == {}
def build_cube_from_dataframe(
    data: Union[dd.DataFrame, Dict[str, dd.DataFrame]],
    cube: Cube,
    store: StoreFactory,
    metadata: Optional[Dict[str, Dict[str, Any]]] = None,
    overwrite: bool = False,
    partition_on: Optional[Dict[str, Iterable[str]]] = None,
    shuffle: bool = False,
    num_buckets: int = 1,
    bucket_by: Optional[Iterable[str]] = None,
    df_serializer: Optional[ParquetSerializer] = None,
) -> Delayed:
    """
    Create dask computation graph that builds a cube with the data supplied from a dask dataframe.

    Parameters
    ----------
    data
        Data that should be written to the cube. If only a single dataframe is given, it is assumed to be the seed
        dataset.
    cube
        Cube specification.
    store
        Store to which the data should be written to.
    metadata
        Metadata for every dataset.
    overwrite
        If possibly existing datasets should be overwritten.
    partition_on
        Optional partition-on attributes for datasets (dictionary mapping :term:`Dataset ID` -> columns).
    shuffle
        Forwarded unchanged to ``store_dataset_from_ddf``.
    num_buckets
        Forwarded unchanged to ``store_dataset_from_ddf``.
    bucket_by
        Forwarded unchanged to ``store_dataset_from_ddf``.
    df_serializer:
        Optional Dataframe to Parquet serializer

    Returns
    -------
    metadata_dict: dask.delayed.Delayed
        A dask delayed object containing the compute graph to build a cube returning the dict of dataset metadata
        objects.
    """
    check_store_factory(store)

    # A bare dataframe is shorthand for "seed dataset only".
    if not isinstance(data, dict):
        data = {cube.seed_dataset: data}

    ktk_cube_dataset_ids = sorted(data.keys())

    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)
    existing_datasets = discover_datasets_unchecked(cube.uuid_prefix, store)
    check_datasets_prebuild(ktk_cube_dataset_ids, cube, existing_datasets)
    partition_on_checked = prepare_ktk_partition_on(
        cube, ktk_cube_dataset_ids, partition_on
    )
    # From here on only the checked variant may be used.
    del partition_on

    graphs = {}
    for table_name, ddf in data.items():
        table_partition_on = partition_on_checked[table_name]
        check_user_df(table_name, ddf, cube, set(), table_partition_on)

        # Index columns present in this table; for the seed dataset additionally
        # the dimension columns that are not suppressed. Partition columns are
        # removed from the set in every case.
        indices_to_build = set(cube.index_columns) & set(ddf.columns)
        if table_name == cube.seed_dataset:
            indices_to_build |= set(cube.dimension_columns) - cube.suppress_index_on
        indices_to_build -= set(table_partition_on)

        ddf = ddf.map_partitions(
            assert_dimesion_index_cols_notnull,
            ktk_cube_dataset_id=table_name,
            cube=cube,
            partition_on=table_partition_on,
            meta=ddf._meta,
        )

        non_partition_dimensions = set(cube.dimension_columns) - set(
            cube.partition_columns
        )
        sort_columns = sorted(non_partition_dimensions & set(ddf.columns))

        graphs[table_name] = store_dataset_from_ddf(
            ddf,
            dataset_uuid=cube.ktk_dataset_uuid(table_name),
            store=store,
            metadata=prepare_ktk_metadata(cube, table_name, metadata),
            partition_on=table_partition_on,
            secondary_indices=sorted(indices_to_build),
            sort_partitions_by=sort_columns,
            overwrite=overwrite,
            shuffle=shuffle,
            num_buckets=num_buckets,
            bucket_by=bucket_by,
            df_serializer=df_serializer,
        )

    postwrite = dask.delayed(apply_postwrite_checks)
    return postwrite(
        graphs, cube=cube, store=store, existing_datasets=existing_datasets
    )
def get_copy_keys(cube, src_store, tgt_store, overwrite, datasets=None):
    """
    Get and check keys that should be copied from one store to another.

    Parameters
    ----------
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    src_store: Union[Callable[[], simplekv.KeyValueStore], simplekv.KeyValueStore]
        Source KV store.
    tgt_store: Union[Callable[[], simplekv.KeyValueStore], simplekv.KeyValueStore]
        Target KV store.
    overwrite: bool
        If possibly existing datasets in the target store should be overwritten.
    datasets: Union[None, Iterable[str], Dict[str, kartothek.core.dataset.DatasetMetadata]]
        Datasets to copy, must all be part of the cube. May be either the result of :meth:`discover_datasets`, an
        iterable of Ktk_cube dataset ID or ``None`` (in which case entire cube will be copied).

    Returns
    -------
    keys: Set[str]
        Set of keys to copy.

    Raises
    ------
    RuntimeError: In case the copy would not pass successfully or if there is no cube in ``src_store``.
    """
    if datasets is not None and not isinstance(datasets, (dict, str)):
        # ``datasets`` is read twice below (discovery filter and unknown-dataset
        # check). A one-shot iterator would be exhausted by the first read and
        # silently disable the check, so materialize it once.
        datasets = list(datasets)

    if isinstance(datasets, dict):
        new_datasets = datasets
    else:
        new_datasets = discover_datasets_unchecked(
            uuid_prefix=cube.uuid_prefix,
            store=src_store,
            filter_ktk_cube_dataset_ids=datasets,
        )

    if datasets is None:
        # Whole-cube copy: there has to be at least one dataset in the source.
        if not new_datasets:
            raise RuntimeError("{} not found in source store".format(cube))
    else:
        unknown_datasets = set(datasets) - set(new_datasets)
        if unknown_datasets:
            raise RuntimeError(
                "{cube}, datasets {datasets} do not exist in source store".format(
                    cube=cube, datasets=unknown_datasets
                )
            )

    existing_datasets = discover_datasets_unchecked(cube.uuid_prefix, tgt_store)

    if not overwrite:
        for ktk_cube_dataset_id in sorted(new_datasets.keys()):
            if ktk_cube_dataset_id in existing_datasets:
                raise RuntimeError(
                    'Dataset "{uuid}" exists in target store but overwrite was set to False'.format(
                        uuid=new_datasets[ktk_cube_dataset_id].uuid
                    )
                )

    # Check the state the target would have after the copy: the datasets
    # already present there, with the copied ones taking precedence.
    all_datasets = copy(existing_datasets)
    all_datasets.update(new_datasets)
    check_datasets(all_datasets, cube)

    keys = set()
    for ktk_cube_dataset_id in sorted(new_datasets.keys()):
        keys |= get_dataset_keys(new_datasets[ktk_cube_dataset_id])

    return keys
def build_cube(data, cube, store, metadata=None, overwrite=False, partition_on=None):
    """
    Store given dataframes as Ktk_cube cube.

    ``data`` can be formatted in multiple ways:

    - single DataFrame::

          pd.DataFrame({
              'x': [0, 1, 2, 3],
              'p': [0, 0, 1, 1],
              'v': [42, 45, 20, 10],
          })

      In that case, the seed dataset will be written.

    - dictionary of DataFrames::

          {
              'seed': pd.DataFrame({
                  'x': [0, 1, 2, 3],
                  'p': [0, 0, 1, 1],
                  'v1': [42, 45, 20, 10],
              }),
              'enrich': pd.DataFrame({
                  'x': [0, 1, 2, 3],
                  'p': [0, 0, 1, 1],
                  'v2': [False, False, True, False],
              }),
          }

      In that case, multiple datasets can be written at the same time. Note that the seed dataset MUST be included.

    - list of anything above::

          [
              # seed data only
              pd.DataFrame({
                  'x': [0, 1, 2, 3],
                  'p': [0, 0, 1, 1],
                  'v1': [42, 45, 20, 10],
              }),
              # seed data only, explicit way
              {
                  'seed': pd.DataFrame({
                      'x': [4, 5, 6, 7],
                      'p': [0, 0, 1, 1],
                      'v1': [12, 32, 22, 9],
                  }),
              },
              # multiple datasets
              {
                  'seed': pd.DataFrame({
                      'x': [8, 9, 10, 11],
                      'p': [0, 0, 1, 1],
                      'v1': [9, 2, 4, 11],
                  }),
                  'enrich': pd.DataFrame({
                      'x': [8, 9, 10, 11],
                      'p': [0, 0, 1, 1],
                      'v2': [True, True, False, False],
                  }),
              },
              # non-seed data only
              {
                  'enrich': pd.DataFrame({
                      'x': [1, 2, 3, 4],
                      'p': [0, 0, 1, 1],
                      'v2': [False, True, False, False],
                  }),
              },
          ]

      In that case, multiple datasets may be written. Note that at least a single list element must contain seed data.

    Extra metadata may be preserved w/ every dataset, e.g.::

        {
            'seed': {
                'source': 'db',
                'host': 'db1.cluster20.company.net',
                'last_event': '230c6edb-b69a-4d30-b56d-28f5dfe20948',
            },
            'enrich': {
                'source': 'python',
                'commit_hash': '8b5d717518439921e6d17c7495956bdad687bc54',
            },
        }

    Note that the given data must be JSON-serializable.

    If the cube already exists, the ``overwrite`` flag must be given. In that case, all datasets that are part of the
    existing cube must be overwritten. Partial overwrites are not allowed.

    Parameters
    ----------
    data: Union[pd.DataFrame, Dict[str, pd.DataFrame], List[Union[pd.DataFrame, Dict[str, pd.DataFrame]]]]
        Data that should be written to the cube. If only a single dataframe is given, it is assumed to be the seed
        dataset.
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store: simplekv.KeyValueStore
        Store to which the data should be written to.
    metadata: Optional[Dict[str, Dict[str, Any]]]
        Metadata for every dataset.
    overwrite: bool
        If possibly existing datasets should be overwritten.
    partition_on: Optional[Dict[str, Iterable[str]]]
        Optional partition-on attributes for datasets (dictionary mapping :term:`Dataset ID` -> columns).
        See :ref:`Dimensionality and Partitioning Details` for details.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        DatasetMetadata for every dataset written.
    """
    data = _normalize_user_input(data, cube)
    ktk_cube_dataset_ids = set(data.keys())

    partition_on = prepare_ktk_partition_on(cube, ktk_cube_dataset_ids, partition_on)
    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)

    existing_datasets = discover_datasets_unchecked(cube.uuid_prefix, store)
    check_datasets_prebuild(data, cube, existing_datasets)

    # do all data preparation before writing anything
    prepared = _prepare_data_for_ktk_all(
        data=data,
        cube=cube,
        existing_payload=set(),
        partition_on=partition_on,
    )

    written = {
        ktk_cube_dataset_id: store_dataframes_as_dataset(
            store=store,
            dataset_uuid=cube.ktk_dataset_uuid(ktk_cube_dataset_id),
            dfs=part,
            metadata=prepare_ktk_metadata(cube, ktk_cube_dataset_id, metadata),
            partition_on=list(partition_on[ktk_cube_dataset_id]),
            metadata_storage_format=KTK_CUBE_METADATA_STORAGE_FORMAT,
            metadata_version=KTK_CUBE_METADATA_VERSION,
            df_serializer=KTK_CUBE_DF_SERIALIZER,
            overwrite=overwrite,
        )
        for ktk_cube_dataset_id, part in prepared.items()
    }

    return apply_postwrite_checks(
        datasets=written,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )
def copy_cube(
    cube: Cube,
    src_store: Union[KeyValueStore, Callable[[], KeyValueStore]],
    tgt_store: Union[KeyValueStore, Callable[[], KeyValueStore]],
    overwrite: bool = False,
    datasets: Union[None, Iterable[str], Dict[str, DatasetMetadata]] = None,
    renamed_cube_prefix: Optional[str] = None,
    renamed_datasets: Optional[Dict[str, str]] = None,
):
    """
    Copy cube from one store to another.

    .. warning::
        A failing copy operation can not be rolled back if the `overwrite` flag is enabled
        and might leave the overwritten dataset in an inconsistent state.

    Parameters
    ----------
    cube: Cube
        Cube specification.
    src_store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        Source KV store.
    tgt_store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        Target KV store.
    overwrite: bool
        If possibly existing datasets in the target store should be overwritten.
    datasets: Union[None, Iterable[str], Dict[str, DatasetMetadata]]
        Datasets to copy, must all be part of the cube. May be either the result of
        :func:`~kartothek.api.discover.discover_datasets`, a list of Ktk_cube dataset ID or ``None``
        (in which case entire cube will be copied).
    renamed_cube_prefix: Optional[str]
        Optional new cube prefix. If specified, the cube will be renamed while copying.
    renamed_datasets: Optional[Dict[str, str]]
        Optional dict with {old dataset name: new dataset name} entries. If provided,
        the datasets will be renamed accordingly during copying. When the parameter
        datasets is specified, the datasets to rename must be a subset of the datasets
        to copy.

    Raises
    ------
    RuntimeError
        If copying a dataset fails while ``overwrite`` is enabled. The original error is attached as the cause.
    """
    if callable(src_store):
        src_store = src_store()
    if callable(tgt_store):
        tgt_store = tgt_store()
    assert_stores_different(
        src_store, tgt_store, cube.ktk_dataset_uuid(cube.seed_dataset)
    )
    existing_datasets = discover_datasets_unchecked(cube.uuid_prefix, tgt_store)

    # The seed dataset keeps its name unless a rename entry exists for it.
    if renamed_datasets is None:
        new_seed_dataset = cube.seed_dataset
    else:
        new_seed_dataset = renamed_datasets.get(cube.seed_dataset, cube.seed_dataset)

    # Cube specification as it applies in the target store, with the optional
    # new prefix and seed dataset name.
    new_cube = Cube(
        dimension_columns=cube.dimension_columns,
        partition_columns=cube.partition_columns,
        uuid_prefix=renamed_cube_prefix or cube.uuid_prefix,
        seed_dataset=new_seed_dataset,
        index_columns=cube.index_columns,
        suppress_index_on=cube.suppress_index_on,
    )

    datasets_to_copy = get_datasets_to_copy(
        cube=cube,
        src_store=src_store,
        tgt_store=tgt_store,
        overwrite=overwrite,
        datasets=datasets,
    )

    copied = {}  # type: Dict[str, DatasetMetadata]
    for src_ds_meta in datasets_to_copy.values():
        tgt_ds_uuid = _transform_uuid(
            src_uuid=src_ds_meta.uuid,
            cube_prefix=cube.uuid_prefix,
            renamed_cube_prefix=renamed_cube_prefix,
            renamed_datasets=renamed_datasets,
        )
        try:
            md_transformed = copy_dataset(
                source_dataset_uuid=src_ds_meta.uuid,
                store=src_store,
                target_dataset_uuid=tgt_ds_uuid,
                target_store=tgt_store,
            )
        except Exception as e:
            if overwrite:
                # We can't roll back safely if the target dataset has been partially overwritten.
                raise RuntimeError(e) from e
            else:
                # NOTE(review): if this call returns without raising, the copy
                # error is not re-raised and the loop moves on to the next
                # dataset -- confirm that this is intended.
                apply_postwrite_checks(
                    datasets=copied,
                    cube=new_cube,
                    store=tgt_store,
                    existing_datasets=existing_datasets,
                )
        else:
            copied.update(md_transformed)