def append_to_cube_from_dataframe(
    data: db.Bag,
    cube: Cube,
    store: KeyValueStore,
    metadata: Optional[Dict[str, Dict[str, Any]]] = None,
    df_serializer: Optional[ParquetSerializer] = None,
) -> Delayed:
    """
    Append data to existing cube.

    For details on ``data`` and ``metadata``, see :func:`~kartothek.io.eager_cube.build_cube`.

    .. important::

        Physical partitions must be updated as a whole. If only single rows within a physical partition are updated, the
        old data is treated as "removed".

    .. hint::

        To have better control over the overwrite "mask" (i.e. which partitions are overwritten), you should use
        :func:`~kartothek.io.eager_cube.remove_partitions` beforehand.

    Parameters
    ----------
    data: dask.bag.Bag
        Bag containing dataframes
    cube:
        Cube specification.
    store:
        Store to which the data should be written.
    metadata:
        Metadata for every dataset, optional. For every dataset, only given keys are updated/replaced. Deletion of
        metadata keys is not possible.
    df_serializer:
        Optional DataFrame-to-Parquet serializer.

    Returns
    -------
    metadata_dict: dask.delayed.Delayed
        A dask delayed object containing the compute graph that appends the data to the cube and, once computed,
        yields the dict of dataset metadata objects.
    """
    data, ktk_cube_dataset_ids = _ddfs_to_bag(data, cube)

    return (
        append_to_cube_from_bag_internal(
            data=data,
            cube=cube,
            store=store,
            ktk_cube_dataset_ids=ktk_cube_dataset_ids,
            metadata=metadata,
            df_serializer=df_serializer,
        )
        .map_partitions(_unpack_list, default=None)
        .to_delayed()[0]
    )
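
# --- Hedged usage sketch (not part of the original listing) ---
# Minimal end-to-end flow for append_to_cube_from_dataframe. Assumptions: the cube
# must already exist in the store (built here with kartothek.io.eager_cube.build_cube),
# ``data`` may be passed as a dict mapping ktk_cube dataset IDs to dask DataFrames
# (mirroring the bag-based variant), and ``store_factory``/``my_cube`` are
# illustrative names, not part of the library.
from functools import partial
from tempfile import mkdtemp

import dask.dataframe as dd
import pandas as pd
from storefact import get_store_from_url

from kartothek.core.cube.cube import Cube
from kartothek.io.eager_cube import build_cube

store_factory = partial(get_store_from_url, f"hfs://{mkdtemp()}")
cube = Cube(uuid_prefix="my_cube", dimension_columns=["x"], partition_columns=["p"])

# The cube must exist before data can be appended to it.
build_cube(
    data=pd.DataFrame({"x": [0, 1], "p": [0, 0], "v": [10, 20]}),
    cube=cube,
    store=store_factory,
)

new_data = dd.from_pandas(
    pd.DataFrame({"x": [2, 3], "p": [1, 1], "v": [30, 40]}), npartitions=1
)
delayed = append_to_cube_from_dataframe(
    data={cube.seed_dataset: new_data}, cube=cube, store=store_factory
)
delayed.compute()  # expected to yield the dict of dataset metadata objects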
Example #2
def update_cube_from_bag(
    data: db.Bag,
    cube: Cube,
    store: StoreFactory,
    remove_conditions,
    ktk_cube_dataset_ids: Optional[Iterable[str]],
    metadata: Optional[Dict[str, Dict[str, Any]]] = None,
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    """
    Remove partitions and append data to existing cube.

    For details on ``data`` and ``metadata``, see :func:`~kartothek.io.eager_cube.build_cube`.

    Only datasets in `ktk_cube_dataset_ids` will be affected.

    Parameters
    ----------
    data: dask.bag.Bag
        Bag containing dataframes
    cube:
        Cube specification.
    store:
        Store to which the data should be written.
    remove_conditions:
        Conditions that select the partitions to remove. Must be a condition that only uses
        partition columns.
    ktk_cube_dataset_ids:
        Datasets that will be written; must be specified in advance.
    metadata:
        Metadata for every dataset, optional. For every dataset, only given keys are updated/replaced. Deletion of
        metadata keys is not possible.
    df_serializer:
        Optional DataFrame-to-Parquet serializer.

    Returns
    -------
    metadata_dict: dask.bag.Bag
        A dask bag containing the compute graph that updates the cube (removing the selected partitions and
        appending the new data) and yields the dict of dataset metadata objects. The bag has a single partition
        with a single element.

    See Also
    --------
    :ref:`mutating_datasets`
    """
    return append_to_cube_from_bag_internal(
        data=data,
        cube=cube,
        store=store,
        remove_conditions=remove_conditions,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        metadata=metadata,
        df_serializer=df_serializer,
    )
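
# --- Hedged usage sketch (not part of the original listing) ---
# Replaces the physical partition p == 1 of the seed dataset in one shot.
# Assumptions: each bag element is a dict mapping ktk_cube dataset IDs to pandas
# DataFrames, ``remove_conditions`` is built from kartothek.core.cube.conditions.C,
# and ``cube``/``store_factory`` are the illustrative objects from the first sketch.
import dask.bag as db
import pandas as pd

from kartothek.core.cube.conditions import C

replacement = db.from_sequence(
    [{cube.seed_dataset: pd.DataFrame({"x": [2, 3], "p": [1, 1], "v": [31, 41]})}],
    partition_size=1,
)
bag = update_cube_from_bag(
    data=replacement,
    cube=cube,
    store=store_factory,
    remove_conditions=(C("p") == 1),
    ktk_cube_dataset_ids=[cube.seed_dataset],
)
bag.compute()  # single-element bag -> [{dataset_id: DatasetMetadata}]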
Example #3
def append_to_cube_from_bag(data,
                            cube,
                            store,
                            ktk_cube_dataset_ids,
                            metadata=None):
    """
    Append data to existing cube.

    For details on ``data`` and ``metadata``, see :meth:`build_cube`.

    .. important::

        Physical partitions must be updated as a whole. If only single rows within a physical partition are updated, the
        old data is treated as "removed".

    .. hint::

        To have better control over the overwrite "mask" (i.e. which partitions are overwritten), you should use
        :meth:`remove_partitions` beforehand or use :meth:`update_cube_from_bag` instead.

    Parameters
    ----------
    data: dask.bag.Bag
        Bag containing dataframes
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store: Callable[[], simplekv.KeyValueStore]
        Store to which the data should be written.
    ktk_cube_dataset_ids: Optional[Iterable[str]]
        Datasets that will be written; must be specified in advance.
    metadata: Optional[Dict[str, Dict[str, Any]]]
        Metadata for every dataset, optional. For every dataset, only given keys are updated/replaced. Deletion of
        metadata keys is not possible.

    Returns
    -------
    metadata_dict: dask.bag.Bag
        A dask bag containing the compute graph that appends the data to the cube and yields the dict of dataset
        metadata objects. The bag has a single partition with a single element.
    """
    return append_to_cube_from_bag_internal(
        data=data,
        cube=cube,
        store=store,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        metadata=metadata,
    )
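
# --- Hedged usage sketch (not part of the original listing) ---
# Appends a new physical partition (p == 2) to the seed dataset. Assumptions: the
# bag layout is the same dict-of-DataFrames layout as above, and ``cube`` and
# ``store_factory`` are the illustrative objects from the first sketch.
import dask.bag as db
import pandas as pd

new_rows = db.from_sequence(
    [{cube.seed_dataset: pd.DataFrame({"x": [4, 5], "p": [2, 2], "v": [50, 60]})}],
    partition_size=1,
)
bag = append_to_cube_from_bag(
    data=new_rows,
    cube=cube,
    store=store_factory,
    ktk_cube_dataset_ids=[cube.seed_dataset],
)
bag.compute()  # single-element bag -> [{dataset_id: DatasetMetadata}]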
Example #4
def update_cube_from_bag(data,
                         cube,
                         store,
                         remove_conditions,
                         ktk_cube_dataset_ids,
                         metadata=None):
    """
    Remove partitions and append data to existing cube.

    For details on ``data`` and ``metadata``, see :meth:`build_cube`.

    Only datasets in `ktk_cube_dataset_ids` will be affected.

    Parameters
    ----------
    data: dask.bag.Bag
        Bag containing dataframes
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store: Callable[[], simplekv.KeyValueStore]
        Store to which the data should be written.
    remove_conditions:
        Conditions that select the partitions to remove. Must be a condition that only uses
        partition columns.
    ktk_cube_dataset_ids: Optional[Iterable[str]]
        Datasets that will be written; must be specified in advance.
    metadata: Optional[Dict[str, Dict[str, Any]]]
        Metadata for every dataset, optional. For every dataset, only given keys are updated/replaced. Deletion of
        metadata keys is not possible.

    Returns
    -------
    metadata_dict: dask.bag.Bag
        A dask bag containing the compute graph that updates the cube (removing the selected partitions and
        appending the new data) and yields the dict of dataset metadata objects. The bag has a single partition
        with a single element.
    """
    return append_to_cube_from_bag_internal(
        data=data,
        cube=cube,
        store=store,
        remove_conditions=remove_conditions,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
        metadata=metadata,
    )
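
# --- Hedged usage sketch (not part of the original listing) ---
# Same call pattern as in Example #2, but removing a contiguous range of physical
# partitions by combining conditions with ``&`` (assumed to form a conjunction over
# partition columns only). ``replacement``, ``cube`` and ``store_factory`` are the
# illustrative objects from the earlier sketches.
from kartothek.core.cube.conditions import C

bag = update_cube_from_bag(
    data=replacement,
    cube=cube,
    store=store_factory,
    remove_conditions=(C("p") >= 1) & (C("p") <= 2),
    ktk_cube_dataset_ids=[cube.seed_dataset],
)
bag.compute()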