def test_check_provided_metadata_dict_unknown_ids():
    """Metadata for dataset IDs outside the allowed list raises ``ValueError``."""
    expected_message = (
        "Provided metadata for otherwise unspecified ktk_cube_dataset_ids: a, b"
    )
    provided_metadata = {"a": {}, "b": {}, "c": {}}
    allowed_ids = ["c"]

    with pytest.raises(ValueError, match=expected_message):
        check_provided_metadata_dict(provided_metadata, allowed_ids)
def build_cube(data, cube, store, metadata=None, overwrite=False, partition_on=None):
    """
    Store given dataframes as Ktk_cube cube.

    ``data`` can be formatted in multiple ways:

    - single DataFrame::

          pd.DataFrame({
              'x': [0, 1, 2, 3],
              'p': [0, 0, 1, 1],
              'v': [42, 45, 20, 10],
          })

      In that case, the seed dataset will be written.

    - dictionary of DataFrames::

          {
              'seed': pd.DataFrame({
                  'x': [0, 1, 2, 3],
                  'p': [0, 0, 1, 1],
                  'v1': [42, 45, 20, 10],
              }),
              'enrich': pd.DataFrame({
                  'x': [0, 1, 2, 3],
                  'p': [0, 0, 1, 1],
                  'v2': [False, False, True, False],
              }),
          }

      In that case, multiple datasets can be written at the same time. Note
      that the seed dataset MUST be included.

    - list of anything above::

          [
              # seed data only
              pd.DataFrame({
                  'x': [0, 1, 2, 3],
                  'p': [0, 0, 1, 1],
                  'v1': [42, 45, 20, 10],
              }),
              # seed data only, explicit way
              {
                  'seed': pd.DataFrame({
                      'x': [4, 5, 6, 7],
                      'p': [0, 0, 1, 1],
                      'v1': [12, 32, 22, 9],
                  }),
              },
              # multiple datasets
              {
                  'seed': pd.DataFrame({
                      'x': [8, 9, 10, 11],
                      'p': [0, 0, 1, 1],
                      'v1': [9, 2, 4, 11],
                  }),
                  'enrich': pd.DataFrame({
                      'x': [8, 9, 10, 11],
                      'p': [0, 0, 1, 1],
                      'v2': [True, True, False, False],
                  }),
              },
              # non-seed data only
              {
                  'enrich': pd.DataFrame({
                      'x': [1, 2, 3, 4],
                      'p': [0, 0, 1, 1],
                      'v2': [False, True, False, False],
                  }),
              },
          ]

      In that case, multiple datasets may be written. Note that at least a
      single list element must contain seed data.

    Extra metadata may be preserved with every dataset, e.g.::

        {
            'seed': {
                'source': 'db',
                'host': 'db1.cluster20.company.net',
                'last_event': '230c6edb-b69a-4d30-b56d-28f5dfe20948',
            },
            'enrich': {
                'source': 'python',
                'commit_hash': '8b5d717518439921e6d17c7495956bdad687bc54',
            },
        }

    Note that the given metadata must be JSON-serializable.

    If the cube already exists, the ``overwrite`` flag must be given. In that
    case, all datasets that are part of the existing cube must be overwritten.
    Partial overwrites are not allowed.

    Parameters
    ----------
    data: Union[pd.DataFrame, Dict[str, pd.DataFrame], List[Union[pd.DataFrame, Dict[str, pd.DataFrame]]]]
        Data that should be written to the cube. If only a single dataframe is
        given, it is assumed to be the seed dataset.
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store: simplekv.KeyValueStore
        Store to which the data should be written to.
    metadata: Optional[Dict[str, Dict[str, Any]]]
        Metadata for every dataset.
    overwrite: bool
        If possibly existing datasets should be overwritten.
    partition_on: Optional[Dict[str, Iterable[str]]]
        Optional partition-on attributes for datasets (dictionary mapping
        :term:`Dataset ID` -> columns). See
        :ref:`Dimensionality and Partitioning Details` for details.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        DatasetMetadata for every dataset written.
    """
    normalized = _normalize_user_input(data, cube)
    dataset_ids = set(normalized.keys())

    partition_on = prepare_ktk_partition_on(cube, dataset_ids, partition_on)
    metadata = check_provided_metadata_dict(metadata, dataset_ids)

    existing_datasets = discover_datasets_unchecked(cube.uuid_prefix, store)
    check_datasets_prebuild(normalized, cube, existing_datasets)

    # Prepare every dataset before the first write happens.
    prepared = _prepare_data_for_ktk_all(
        data=normalized,
        cube=cube,
        existing_payload=set(),
        partition_on=partition_on,
    )

    written = {
        dataset_id: store_dataframes_as_dataset(
            store=store,
            dataset_uuid=cube.ktk_dataset_uuid(dataset_id),
            dfs=dataframes,
            metadata=prepare_ktk_metadata(cube, dataset_id, metadata),
            partition_on=list(partition_on[dataset_id]),
            metadata_storage_format=KTK_CUBE_METADATA_STORAGE_FORMAT,
            metadata_version=KTK_CUBE_METADATA_VERSION,
            df_serializer=KTK_CUBE_DF_SERIALIZER,
            overwrite=overwrite,
        )
        for dataset_id, dataframes in prepared.items()
    }

    return apply_postwrite_checks(
        datasets=written,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )
def extend_cube(data, cube, store, metadata=None, overwrite=False, partition_on=None):
    """
    Store given dataframes into an existing Kartothek cube.

    For details on ``data`` and ``metadata``, see :meth:`build_cube`.

    Parameters
    ----------
    data: Union[pd.DataFrame, Dict[str, pd.DataFrame], List[Union[pd.DataFrame, Dict[str, pd.DataFrame]]]]
        Data that should be written to the cube. If only a single dataframe is
        given, it is assumed to be the seed dataset.
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store: simplekv.KeyValueStore
        Store to which the data should be written to.
    metadata: Optional[Dict[str, Dict[str, Any]]]
        Metadata for every dataset.
    overwrite: bool
        If possibly existing datasets should be overwritten.
    partition_on: Optional[Dict[str, Iterable[str]]]
        Optional partition-on attributes for datasets (dictionary mapping
        :term:`Dataset ID` -> columns). See
        :ref:`Dimensionality and Partitioning Details` for details.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        DatasetMetadata for every dataset written.
    """
    normalized = _normalize_user_input(data, cube)
    dataset_ids = set(normalized.keys())

    partition_on = prepare_ktk_partition_on(cube, dataset_ids, partition_on)
    metadata = check_provided_metadata_dict(metadata, dataset_ids)

    check_datasets_preextend(normalized, cube)

    existing_datasets = discover_datasets(cube, store)

    # When overwriting, datasets that are about to be replaced are left out of
    # the payload computation; otherwise all existing datasets are considered.
    existing_datasets_cut = existing_datasets
    if overwrite:
        existing_datasets_cut = {
            dataset_id: dataset
            for dataset_id, dataset in existing_datasets.items()
            if dataset_id not in normalized
        }
    existing_payload = get_cube_payload(existing_datasets_cut, cube)

    # Prepare every dataset before the first write happens.
    prepared = _prepare_data_for_ktk_all(
        data=normalized,
        cube=cube,
        existing_payload=existing_payload,
        partition_on=partition_on,
    )

    written = {
        dataset_id: store_dataframes_as_dataset(
            store=store,
            dataset_uuid=cube.ktk_dataset_uuid(dataset_id),
            dfs=dataframes,
            metadata=prepare_ktk_metadata(cube, dataset_id, metadata),
            partition_on=list(partition_on[dataset_id]),
            metadata_storage_format=KTK_CUBE_METADATA_STORAGE_FORMAT,
            metadata_version=KTK_CUBE_METADATA_VERSION,
            df_serializer=KTK_CUBE_DF_SERIALIZER,
            overwrite=overwrite,
        )
        for dataset_id, dataframes in prepared.items()
    }

    return apply_postwrite_checks(
        datasets=written,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )
def test_check_provided_metadata_dict_wrong_type_nested():
    """A non-dict metadata entry for a single dataset raises ``TypeError``."""
    expected_message = "Provided metadata for dataset a should be a dict but is list"
    provided_metadata = {"a": []}
    allowed_ids = ["a"]

    with pytest.raises(TypeError, match=expected_message):
        check_provided_metadata_dict(provided_metadata, allowed_ids)
def test_check_provided_metadata_dict_wrong_type():
    """A non-dict top-level metadata object raises ``TypeError``."""
    expected_message = "Provided metadata should be a dict but is list"
    provided_metadata = []
    allowed_ids = []

    with pytest.raises(TypeError, match=expected_message):
        check_provided_metadata_dict(provided_metadata, allowed_ids)
def build_cube_from_bag_internal(
    data: db.Bag,
    cube: Cube,
    store: StoreFactory,
    ktk_cube_dataset_ids: Optional[Iterable[str]],
    metadata: Optional[Dict[str, Dict[str, Any]]],
    overwrite: bool,
    partition_on: Optional[Dict[str, Iterable[str]]],
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    """
    Create dask computation graph that builds a cube with the data supplied from a dask bag.

    Parameters
    ----------
    data: dask.bag.Bag
        Bag containing dataframes
    cube:
        Cube specification.
    store:
        Store to which the data should be written to.
    ktk_cube_dataset_ids:
        Datasets that will be written, must be specified in advance. If left
        unprovided, it is assumed that only the seed dataset will be written.
    metadata:
        Metadata for every dataset.
    overwrite:
        If possibly existing datasets should be overwritten.
    partition_on:
        Optional partition-on attributes for datasets (dictionary mapping
        :term:`Dataset ID` -> columns).
    df_serializer:
        Optional Dataframe to Parquet serializer

    Returns
    -------
    metadata_dict: dask.bag.Bag
        A dask bag object containing the compute graph to build a cube
        returning the dict of dataset metadata objects. The bag has a single
        partition with a single element.
    """
    check_store_factory(store)

    # Without explicit IDs only the seed dataset is written.
    ktk_cube_dataset_ids = (
        [cube.seed_dataset]
        if ktk_cube_dataset_ids is None
        else sorted(ktk_cube_dataset_ids)
    )

    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)
    existing_datasets = discover_datasets_unchecked(cube.uuid_prefix, store)
    check_datasets_prebuild(ktk_cube_dataset_ids, cube, existing_datasets)
    prep_partition_on = prepare_ktk_partition_on(
        cube, ktk_cube_dataset_ids, partition_on
    )
    cube = ensure_valid_cube_indices(existing_datasets, cube)

    # Per-element pipeline: multiplex -> validate dataset IDs -> prepare.
    multiplexed = data.map(multiplex_user_input, cube=cube)
    validated = multiplexed.map(
        _check_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids
    )
    prepared = validated.map(
        _multiplex_prepare_data_for_ktk,
        cube=cube,
        existing_payload=set(),
        partition_on=prep_partition_on,
    )

    metadata_per_dataset = {
        dataset_id: prepare_ktk_metadata(cube, dataset_id, metadata)
        for dataset_id in ktk_cube_dataset_ids
    }
    stored = _store_bag_as_dataset_parallel(
        bag=prepared,
        store=store,
        cube=cube,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        metadata=metadata_per_dataset,
        overwrite=overwrite,
        update=False,
        existing_datasets=existing_datasets,
        df_serializer=df_serializer,
    )

    return stored.map(
        apply_postwrite_checks,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )
def append_to_cube_from_bag_internal(
    data: db.Bag,
    cube: Cube,
    store: StoreFactory,
    ktk_cube_dataset_ids: Optional[Iterable[str]],
    metadata: Optional[Dict[str, Dict[str, Any]]],
    remove_conditions=None,
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    """
    Append data to existing cube.

    For details on ``data`` and ``metadata``, see
    :func:`~kartothek.io.eager_cube.build_cube`.

    .. important::

        Physical partitions must be updated as a whole. If only single rows
        within a physical partition are updated, the old data is treated as
        "removed".

    Parameters
    ----------
    data: dask.bag.Bag
        Bag containing dataframes
    cube:
        Cube specification.
    store:
        Store to which the data should be written to.
    ktk_cube_dataset_ids:
        Datasets that will be written, must be specified in advance.
    metadata:
        Metadata for every dataset, optional. For every dataset, only given
        keys are updated/replaced. Deletion of metadata keys is not possible.
    remove_conditions:
        Conditions that select which partitions to remove.
    df_serializer:
        Optional Dataframe to Parquet serializer

    Returns
    -------
    metadata_dict: dask.bag.Bag
        A dask bag object containing the compute graph to append to the cube
        returning the dict of dataset metadata objects. The bag has a single
        partition with a single element.
    """
    check_store_factory(store)

    # ``None`` and empty iterables both collapse to an empty list.
    ktk_cube_dataset_ids = (
        sorted(ktk_cube_dataset_ids) if ktk_cube_dataset_ids else []
    )

    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)
    existing_datasets = discover_datasets(cube, store)
    cube = ensure_valid_cube_indices(existing_datasets, cube)

    # The existing payload is deliberately empty: no check against existing
    # payload is done here; ktk accounts for the compat check within a single
    # dataset.
    existing_payload: Set[str] = set()

    partition_on = {
        dataset_id: dataset.partition_keys
        for dataset_id, dataset in existing_datasets.items()
    }

    check_existing_datasets(
        existing_datasets=existing_datasets,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
    )

    delete_scopes = {}
    if remove_conditions is not None:
        remove_metapartitions = prepare_metapartitions_for_removal_action(
            cube, store, remove_conditions, ktk_cube_dataset_ids, existing_datasets
        )
        # Only the third element of each value (the delete scope) is needed.
        delete_scopes = {
            dataset_id: scope
            for dataset_id, (_, _, scope) in remove_metapartitions.items()
        }

    # Per-element pipeline: multiplex -> validate IDs -> fill IDs -> prepare.
    multiplexed = data.map(multiplex_user_input, cube=cube)
    validated = multiplexed.map(
        _check_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids
    )
    filled = validated.map(
        _fill_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids
    )
    prepared = filled.map(
        _multiplex_prepare_data_for_ktk,
        cube=cube,
        existing_payload=existing_payload,
        partition_on=partition_on,
    )

    metadata_per_dataset = {
        dataset_id: prepare_ktk_metadata(cube, dataset_id, metadata)
        for dataset_id in ktk_cube_dataset_ids
    }
    stored = _store_bag_as_dataset_parallel(
        bag=prepared,
        store=store,
        cube=cube,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        metadata=metadata_per_dataset,
        update=True,
        existing_datasets=existing_datasets,
        delete_scopes=delete_scopes,
        df_serializer=df_serializer,
    )

    return stored.map(
        apply_postwrite_checks,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )
def extend_cube_from_bag_internal(
    data: db.Bag,
    cube: Cube,
    store: KeyValueStore,
    ktk_cube_dataset_ids: Optional[Iterable[str]],
    metadata: Optional[Dict[str, Dict[str, Any]]],
    overwrite: bool,
    partition_on: Optional[Dict[str, Iterable[str]]],
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    """
    Create dask computation graph that extends a cube by the data supplied from a dask bag.

    For details on ``data`` and ``metadata``, see
    :func:`~kartothek.io.eager_cube.build_cube`.

    Parameters
    ----------
    data: dask.bag.Bag
        Bag containing dataframes (see
        :func:`~kartothek.io.eager_cube.build_cube` for possible format and
        types).
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store:
        Store to which the data should be written to.
    ktk_cube_dataset_ids:
        Datasets that will be written, must be specified in advance.
    metadata:
        Metadata for every dataset.
    overwrite:
        If possibly existing datasets should be overwritten.
    partition_on:
        Optional partition-on attributes for datasets (dictionary mapping
        :term:`Dataset ID` -> columns).
    df_serializer:
        Optional Dataframe to Parquet serializer

    Returns
    -------
    metadata_dict: dask.bag.Bag
        A dask bag object containing the compute graph to extend a cube
        returning the dict of dataset metadata objects. The bag has a single
        partition with a single element.
    """
    check_store_factory(store)

    # NOTE(review): the pre-extend check receives the IDs exactly as passed in
    # (possibly ``None`` or a one-shot iterable) before they are normalised
    # below -- confirm that ``check_datasets_preextend`` copes with that.
    check_datasets_preextend(ktk_cube_dataset_ids, cube)

    # ``None`` and empty iterables both collapse to an empty list.
    ktk_cube_dataset_ids = (
        sorted(ktk_cube_dataset_ids) if ktk_cube_dataset_ids else []
    )

    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)
    prep_partition_on = prepare_ktk_partition_on(
        cube, ktk_cube_dataset_ids, partition_on
    )

    existing_datasets = discover_datasets(cube, store)
    cube = ensure_valid_cube_indices(existing_datasets, cube)

    # When overwriting, datasets that are about to be replaced are left out of
    # the payload computation; otherwise all existing datasets are considered.
    existing_datasets_cut = existing_datasets
    if overwrite:
        existing_datasets_cut = {
            dataset_id: dataset
            for dataset_id, dataset in existing_datasets.items()
            if dataset_id not in ktk_cube_dataset_ids
        }
    existing_payload = get_cube_payload(existing_datasets_cut, cube)

    # Per-element pipeline: multiplex -> validate dataset IDs -> prepare.
    multiplexed = data.map(multiplex_user_input, cube=cube)
    validated = multiplexed.map(
        _check_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids
    )
    prepared = validated.map(
        _multiplex_prepare_data_for_ktk,
        cube=cube,
        existing_payload=existing_payload,
        partition_on=prep_partition_on,
    )

    metadata_per_dataset = {
        dataset_id: prepare_ktk_metadata(cube, dataset_id, metadata)
        for dataset_id in ktk_cube_dataset_ids
    }
    stored = _store_bag_as_dataset_parallel(
        bag=prepared,
        store=store,
        cube=cube,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        metadata=metadata_per_dataset,
        overwrite=overwrite,
        update=False,
        existing_datasets=existing_datasets,
        df_serializer=df_serializer,
    )

    return stored.map(
        apply_postwrite_checks,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )
def build_cube_from_dataframe(
    data: Union[dd.DataFrame, Dict[str, dd.DataFrame]],
    cube: Cube,
    store: StoreFactory,
    metadata: Optional[Dict[str, Dict[str, Any]]] = None,
    overwrite: bool = False,
    partition_on: Optional[Dict[str, Iterable[str]]] = None,
    shuffle: bool = False,
    num_buckets: int = 1,
    bucket_by: Optional[Iterable[str]] = None,
    df_serializer: Optional[ParquetSerializer] = None,
) -> Delayed:
    """
    Create dask computation graph that builds a cube with the data supplied from a dask dataframe.

    Parameters
    ----------
    data
        Data that should be written to the cube. If only a single dataframe is
        given, it is assumed to be the seed dataset.
    cube
        Cube specification.
    store
        Store to which the data should be written to.
    metadata
        Metadata for every dataset.
    overwrite
        If possibly existing datasets should be overwritten.
    partition_on
        Optional partition-on attributes for datasets (dictionary mapping
        :term:`Dataset ID` -> columns).
    shuffle
        Passed through unchanged to ``store_dataset_from_ddf``.
    num_buckets
        Passed through unchanged to ``store_dataset_from_ddf``.
    bucket_by
        Passed through unchanged to ``store_dataset_from_ddf``.
    df_serializer:
        Optional Dataframe to Parquet serializer

    Returns
    -------
    metadata_dict: dask.delayed.Delayed
        A dask delayed object containing the compute graph to build a cube
        returning the dict of dataset metadata objects.
    """
    check_store_factory(store)

    # A bare dataframe is treated as the seed dataset.
    frames = data if isinstance(data, dict) else {cube.seed_dataset: data}
    ktk_cube_dataset_ids = sorted(frames.keys())

    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)
    existing_datasets = discover_datasets_unchecked(cube.uuid_prefix, store)
    check_datasets_prebuild(ktk_cube_dataset_ids, cube, existing_datasets)

    partition_on_checked = prepare_ktk_partition_on(
        cube, ktk_cube_dataset_ids, partition_on
    )
    # Drop the raw argument so only the checked mapping can be used below.
    del partition_on

    graphs = {}
    for dataset_id, ddf in frames.items():
        dataset_partition_on = partition_on_checked[dataset_id]
        check_user_df(dataset_id, ddf, cube, set(), dataset_partition_on)

        # Secondary indices: index columns present in the frame, plus (seed
        # dataset only) the dimension columns whose index is not suppressed,
        # minus anything that is already a partition key.
        secondary_indices = set(cube.index_columns) & set(ddf.columns)
        if dataset_id == cube.seed_dataset:
            secondary_indices |= set(cube.dimension_columns) - cube.suppress_index_on
        secondary_indices -= set(dataset_partition_on)

        checked_ddf = ddf.map_partitions(
            assert_dimesion_index_cols_notnull,
            ktk_cube_dataset_id=dataset_id,
            cube=cube,
            partition_on=dataset_partition_on,
            meta=ddf._meta,
        )

        # Sort by the non-partition dimension columns available in the frame.
        non_partition_dimensions = set(cube.dimension_columns) - set(
            cube.partition_columns
        )
        sort_columns = sorted(non_partition_dimensions & set(checked_ddf.columns))

        graphs[dataset_id] = store_dataset_from_ddf(
            checked_ddf,
            dataset_uuid=cube.ktk_dataset_uuid(dataset_id),
            store=store,
            metadata=prepare_ktk_metadata(cube, dataset_id, metadata),
            partition_on=dataset_partition_on,
            secondary_indices=sorted(secondary_indices),
            sort_partitions_by=sort_columns,
            overwrite=overwrite,
            shuffle=shuffle,
            num_buckets=num_buckets,
            bucket_by=bucket_by,
            df_serializer=df_serializer,
        )

    postwrite_checks = dask.delayed(apply_postwrite_checks)
    return postwrite_checks(
        graphs, cube=cube, store=store, existing_datasets=existing_datasets
    )
def append_to_cube_from_bag_internal(data, cube, store, ktk_cube_dataset_ids, metadata):
    """
    Append data to existing cube.

    For details on ``data`` and ``metadata``, see :meth:`build_cube`.

    .. important::

        Physical partitions must be updated as a whole. If only single rows
        within a physical partition are updated, the old data is treated as
        "removed".

    .. hint::

        To have better control over the overwrite "mask" (i.e. which
        partitions are overwritten), you should use :meth:`remove_partitions`
        beforehand.

    Parameters
    ----------
    data: dask.Bag
        Bag containing dataframes
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store: Callable[[], simplekv.KeyValueStore]
        Store to which the data should be written to.
    ktk_cube_dataset_ids: Optional[Iterable[str]]
        Datasets that will be written, must be specified in advance. ``None``
        (or an empty iterable) is treated as "no datasets".
    metadata: Dict[str, Dict[str, Any]]
        Metadata for every dataset, optional. For every dataset, only given
        keys are updated/replaced. Deletion of metadata keys is not possible.

    Returns
    -------
    metadata_dict: dask.bag.Bag
        A dask bag object containing the compute graph to append to the cube
        returning the dict of dataset metadata objects. The bag has a single
        partition with a single element.
    """
    check_store_factory(store)

    # The parameter is documented as Optional, but ``sorted(None)`` raises a
    # ``TypeError``; normalise falsy input to an empty list first.
    if ktk_cube_dataset_ids:
        ktk_cube_dataset_ids = sorted(ktk_cube_dataset_ids)
    else:
        ktk_cube_dataset_ids = []

    metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)
    existing_datasets = discover_datasets(cube, store)

    # existing_payload is set to empty because we're not checking against any
    # existing payload. ktk will account for the compat check within 1 dataset.
    existing_payload = set()

    partition_on = {k: v.partition_keys for k, v in existing_datasets.items()}

    check_existing_datasets(
        existing_datasets=existing_datasets,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
    )

    # Per-element pipeline: multiplex -> validate dataset IDs -> prepare.
    data = (
        data.map(multiplex_user_input, cube=cube)
        .map(_check_dataset_ids, ktk_cube_dataset_ids=ktk_cube_dataset_ids)
        .map(
            _multiplex_prepare_data_for_ktk,
            cube=cube,
            existing_payload=existing_payload,
            partition_on=partition_on,
        )
    )

    data = _store_bag_as_dataset_parallel(
        bag=data,
        store=store,
        cube=cube,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        metadata={
            ktk_cube_dataset_id: prepare_ktk_metadata(
                cube, ktk_cube_dataset_id, metadata
            )
            for ktk_cube_dataset_id in ktk_cube_dataset_ids
        },
        update=True,
        existing_datasets=existing_datasets,
    )

    data = data.map(
        apply_postwrite_checks,
        cube=cube,
        store=store,
        existing_datasets=existing_datasets,
    )

    return data