Example 1
def store_dataset_from_ddf(
    ddf: dd.DataFrame,
    store: StoreInput,
    dataset_uuid: str,
    table: str = SINGLE_TABLE,
    secondary_indices: Optional[List[str]] = None,
    shuffle: bool = False,
    repartition_ratio: Optional[SupportsFloat] = None,
    num_buckets: int = 1,
    sort_partitions_by: Optional[Union[List[str], str]] = None,
    delete_scope: Optional[Iterable[Mapping[str, str]]] = None,
    metadata: Optional[Mapping] = None,
    df_serializer: Optional[DataFrameSerializer] = None,
    metadata_merger: Optional[Callable] = None,
    metadata_version: int = DEFAULT_METADATA_VERSION,
    partition_on: Optional[List[str]] = None,
    bucket_by: Optional[Union[List[str], str]] = None,
    overwrite: bool = False,
):
    """
    Store a dataset from a dask.dataframe.
    """
    partition_on = normalize_arg("partition_on", partition_on)
    secondary_indices = normalize_arg("secondary_indices", secondary_indices)
    sort_partitions_by = normalize_arg("sort_partitions_by", sort_partitions_by)
    bucket_by = normalize_arg("bucket_by", bucket_by)
    store = normalize_arg("store", store)
    delete_scope = dask.delayed(normalize_arg)("delete_scope", delete_scope)

    if table is None:
        raise TypeError("The parameter `table` is not optional.")

    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid, store=store, factory=None, load_dataset_metadata=True
    )

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)
    mps = _write_dataframe_partitions(
        ddf=ddf,
        store=store,
        dataset_uuid=dataset_uuid,
        table=table,
        secondary_indices=secondary_indices,
        shuffle=shuffle,
        repartition_ratio=repartition_ratio,
        num_buckets=num_buckets,
        sort_partitions_by=sort_partitions_by,
        df_serializer=df_serializer,
        metadata_version=metadata_version,
        partition_on=partition_on,
        bucket_by=bucket_by,
    )
    return dask.delayed(store_dataset_from_partitions)(
        mps,
        store=ds_factory.store_factory if ds_factory else store,
        dataset_uuid=ds_factory.dataset_uuid if ds_factory else dataset_uuid,
        dataset_metadata=metadata,
        metadata_merger=metadata_merger,
    )
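
A minimal usage sketch for the function above, assuming a storefact file-system store under a placeholder path and a toy dask dataframe; the store URL, dataset UUID, and column names are illustrative only.

import dask.dataframe as dd
import pandas as pd
from functools import partial
from storefact import get_store_from_url

# Assumed store factory: a zero-argument, picklable callable returning a simplekv store.
store_factory = partial(get_store_from_url, "hfs:///tmp/ktk_ddf_example")

# Toy dask dataframe with a partition column and a payload column.
ddf = dd.from_pandas(
    pd.DataFrame({"part": [1, 1, 2], "value": [10.0, 20.0, 30.0]}),
    npartitions=2,
)

graph = store_dataset_from_ddf(
    ddf,
    store=store_factory,
    dataset_uuid="ddf_example_uuid",
    partition_on=["part"],
    secondary_indices=["value"],
)
dataset = graph.compute()  # triggers the delayed commit and returns the dataset metadata
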
Example 2
def store_dataframes_as_dataset__iter(
    df_generator,
    store,
    dataset_uuid=None,
    metadata=None,
    partition_on=None,
    df_serializer=None,
    overwrite=False,
    metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT,
    metadata_version=DEFAULT_METADATA_VERSION,
    secondary_indices=None,
):
    """
    Store pandas ``DataFrame`` objects iteratively as a partitioned dataset with multiple tables (files).

    Useful for datasets which do not fit into memory.

    Parameters
    ----------

    Returns
    -------
    dataset: kartothek.core.dataset.DatasetMetadata
        The stored dataset.

    """

    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    new_partitions = []
    for df in df_generator:
        mp = parse_input_to_metapartition(df,
                                          metadata_version=metadata_version)

        if partition_on:
            mp = mp.partition_on(partition_on)

        if secondary_indices:
            mp = mp.build_indices(secondary_indices)

        # Store the dataframe, which also removes the in-memory data from the `mp` metapartition
        mp = mp.store_dataframes(store=store,
                                 dataset_uuid=dataset_uuid,
                                 df_serializer=df_serializer)

        # Add `kartothek.io_components.metapartition.MetaPartition` object to list to track partitions
        new_partitions.append(mp)

    # Store metadata and return `kartothek.DatasetMetadata` object
    return store_dataset_from_partitions(
        partition_list=new_partitions,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_storage_format=metadata_storage_format,
    )
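
A usage sketch for the iterator variant, assuming the generator yields plain pandas DataFrames and a storefact store under a placeholder path; only the current chunk needs to fit into memory.

import pandas as pd
from storefact import get_store_from_url

# Assumed simplekv store backed by the local file system.
store = get_store_from_url("hfs:///tmp/ktk_iter_example")

def chunk_generator():
    # Yield one dataframe at a time; only the current chunk lives in memory.
    for i in range(3):
        yield pd.DataFrame({"part": [i, i], "value": [float(i), float(i) + 0.5]})

dataset = store_dataframes_as_dataset__iter(
    chunk_generator(),
    store=store,
    dataset_uuid="iter_example_uuid",
    partition_on=["part"],
    secondary_indices=["value"],
)
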
Example 3
def store_delayed_as_dataset(
    delayed_tasks: List[Delayed],
    store,
    dataset_uuid=None,
    metadata=None,
    df_serializer=None,
    overwrite=False,
    metadata_merger=None,
    metadata_version=naming.DEFAULT_METADATA_VERSION,
    partition_on=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
    table_name: str = SINGLE_TABLE,
    secondary_indices=None,
) -> Delayed:
    """
    Transform a list of dictionaries containing dataframes and store them as a
    kartothek dataset in the given store.

    Parameters
    ----------
    """
    store = lazy_store(store)
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    raise_if_indices_overlap(partition_on, secondary_indices)

    input_to_mps = partial(
        parse_input_to_metapartition,
        metadata_version=metadata_version,
        table_name=table_name,
    )
    mps = map_delayed(input_to_mps, delayed_tasks)

    if partition_on:
        mps = map_delayed(MetaPartition.partition_on, mps, partition_on=partition_on)

    if secondary_indices:
        mps = map_delayed(MetaPartition.build_indices, mps, columns=secondary_indices)

    mps = map_delayed(
        MetaPartition.store_dataframes,
        mps,
        store=store,
        df_serializer=df_serializer,
        dataset_uuid=dataset_uuid,
    )

    return delayed(store_dataset_from_partitions)(
        mps,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_merger=metadata_merger,
        metadata_storage_format=metadata_storage_format,
    )
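
A usage sketch for the delayed variant, assuming each dask.delayed task yields a plain DataFrame and a picklable store factory (placeholder path); computing the returned delayed object performs the commit.

import dask
import pandas as pd
from functools import partial
from storefact import get_store_from_url

# Assumed picklable store factory.
store_factory = partial(get_store_from_url, "hfs:///tmp/ktk_delayed_example")

# One delayed task per partition, each building a small dataframe when computed.
delayed_tasks = [
    dask.delayed(pd.DataFrame)({"part": [i], "value": [i * 10.0]}) for i in range(3)
]

graph = store_delayed_as_dataset(
    delayed_tasks,
    store=store_factory,
    dataset_uuid="delayed_example_uuid",
    partition_on=["part"],
)
dataset = graph.compute()  # returns the committed DatasetMetadata
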
Example 4
def create_empty_dataset_header(
    store,
    dataset_uuid,
    table_meta,
    partition_on=None,
    metadata=None,
    overwrite=False,
    metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT,
    metadata_version=DEFAULT_METADATA_VERSION,
):
    """
    Create a dataset header without any partitions. This may be used in combination
    with :func:`~kartothek.io.eager.write_single_partition` to create implicitly partitioned datasets.

    .. note::

        The created dataset will **always** have ``explicit_partitions==False``

    .. warning::

        This function should only be used in very rare occasions. Usually you're better off using
        full end-to-end pipelines.

    Parameters
    ----------
    """
    store = lazy_store(store)()
    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    for table, schema in table_meta.items():
        table_meta[table] = make_meta(schema,
                                      origin=table,
                                      partition_keys=partition_on)
        store_schema_metadata(
            schema=table_meta[table],
            dataset_uuid=dataset_uuid,
            store=store,
            table=table,
        )
    dataset_builder = DatasetMetadataBuilder(
        uuid=dataset_uuid,
        metadata_version=metadata_version,
        partition_keys=partition_on,
        explicit_partitions=False,
        table_meta=table_meta,
    )
    if metadata:
        for key, value in metadata.items():
            dataset_builder.add_metadata(key, value)
    if metadata_storage_format.lower() == "json":
        store.put(*dataset_builder.to_json())
    elif metadata_storage_format.lower() == "msgpack":
        store.put(*dataset_builder.to_msgpack())
    else:
        raise ValueError(
            "Unknown metadata storage format encountered: {}".format(
                metadata_storage_format))
    return dataset_builder.to_dataset()
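
A usage sketch, assuming the table schema can be described by an empty DataFrame that only defines dtypes (store path, table, and column names are placeholders); partitions would be appended later, e.g. via write_single_partition.

import pandas as pd
from storefact import get_store_from_url

# Assumed local store.
store = get_store_from_url("hfs:///tmp/ktk_header_example")

# Empty dataframe that only carries the schema (dtypes), no rows.
schema_df = pd.DataFrame(
    {"part": pd.Series(dtype="int64"), "value": pd.Series(dtype="float64")}
)

dataset = create_empty_dataset_header(
    store=store,
    dataset_uuid="header_example_uuid",
    table_meta={"table": schema_df},
    partition_on=["part"],
)
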
Example 5
def store_dataframes_as_dataset(
    store,
    dataset_uuid,
    dfs,
    metadata=None,
    partition_on=None,
    df_serializer=None,
    overwrite=False,
    metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT,
    metadata_version=DEFAULT_METADATA_VERSION,
):
    """
    Utility function to store a list of dataframes as a partitioned dataset with multiple tables (files).

    Useful for very small datasets where all data fits into memory.

    Parameters
    ----------
    dfs : dict of pd.DataFrame or pd.DataFrame
        The dataframe(s) to be stored. If only a single dataframe is passed, it will be stored as the `core` table.

    Returns
    -------
    The stored dataset

    """
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if isinstance(dfs, dict):
        dfs = {"data": [(table, df) for table, df in dfs.items()]}

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    mp = parse_input_to_metapartition(dfs, metadata_version)

    if partition_on:
        mp = MetaPartition.partition_on(mp, partition_on)

    mps = mp.store_dataframes(store=store,
                              dataset_uuid=dataset_uuid,
                              df_serializer=df_serializer)

    return store_dataset_from_partitions(
        partition_list=mps,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_storage_format=metadata_storage_format,
    )
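
A usage sketch for this older multi-table API, assuming a dict mapping table names to DataFrames and a local storefact store (placeholder path); per the docstring, a bare DataFrame would be stored as the `core` table.

import pandas as pd
from storefact import get_store_from_url

# Assumed local store; the table name "core" matches the docstring's default.
store = get_store_from_url("hfs:///tmp/ktk_eager_example")

df = pd.DataFrame({"part": [1, 1, 2], "value": [10.0, 20.0, 30.0]})

dataset = store_dataframes_as_dataset(
    store=store,
    dataset_uuid="eager_example_uuid",
    dfs={"core": df},
    partition_on=["part"],
)
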
Example 6
def _store_bag_as_dataset_parallel(
    bag: db.Bag,
    store: KeyValueStore,
    cube: Cube,
    ktk_cube_dataset_ids: Iterable[str],
    metadata: Optional[Dict[str, Dict[str, Any]]],
    existing_datasets,
    overwrite: bool = False,
    update: bool = False,
    delete_scopes=None,
    df_serializer: Optional[ParquetSerializer] = None,
) -> db.Bag:
    """
    Vendored, simplified and modified version of kartothek's ``store_bag_as_dataset``, which cannot easily be used to
    store datasets in parallel (e.g. from a dict).

    `delete_scopes` is a dictionary mapping each kartothek dataset id to the `delete_scope` of that dataset
    (see `update_dataset_from_partitions` for the definition of a single dataset's `delete_scope`).
    """
    if (not update) and (not overwrite):
        for ktk_cube_dataset_id in ktk_cube_dataset_ids:
            raise_if_dataset_exists(
                dataset_uuid=cube.ktk_dataset_uuid(ktk_cube_dataset_id),
                store=store)

    mps = bag.map(_multiplex_parse_input_to_metapartition)

    # prepare_data_for_ktk already runs `MetaPartition.partition_on` and `MetaPartition.build_indices`, so this is not
    # required here anymore

    mps = mps.map(_multiplex_store,
                  store=store,
                  cube=cube,
                  df_serializer=df_serializer)

    aggregate = partial(
        _multiplex_store_dataset_from_partitions_flat,
        cube=cube,
        existing_datasets=existing_datasets,
        metadata=metadata,
        store=store,
        update=update,
        delete_scopes=delete_scopes or {},
    )

    return mps.reduction(perpartition=list,
                         aggregate=aggregate,
                         split_every=False,
                         out_type=db.Bag)
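
The commit step relies on dask.bag's `Bag.reduction` with `perpartition=list` and `split_every=False`, so a single aggregate task receives every per-partition list and each dataset is committed exactly once. The sketch below illustrates that reduction pattern in isolation, with plain integers standing in for metapartitions.

import dask.bag as db

def commit_like_aggregate(per_partition_lists):
    # With split_every=False the aggregate receives one list per bag partition;
    # flattening them mirrors "collect all metapartitions, then commit once".
    return sorted(item for part in per_partition_lists for item in part)

bag = db.from_sequence(range(6), npartitions=3)
result = bag.reduction(
    perpartition=list,
    aggregate=commit_like_aggregate,
    split_every=False,
).compute()
# result == [0, 1, 2, 3, 4, 5]
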
Example 7
def _store_bag_as_dataset_parallel(
    bag,
    store,
    cube,
    ktk_cube_dataset_ids,
    metadata,
    existing_datasets,
    overwrite=False,
    update=False,
):
    """
    Vendored, simplified and modified version of kartothek's ``store_bag_as_dataset``, which cannot easily be used to
    store datasets in parallel (e.g. from a dict).
    """
    if (not update) and (not overwrite):
        for ktk_cube_dataset_id in ktk_cube_dataset_ids:
            raise_if_dataset_exists(
                dataset_uuid=cube.ktk_dataset_uuid(ktk_cube_dataset_id),
                store=store)

    mps = bag.map(_multiplex_parse_input_to_metapartition)

    # prepare_data_for_ktk already runs `MetaPartition.partition_on` and `MetaPartition.build_indices`, so this is not
    # required here anymore

    mps = mps.map(_multiplex_store, store=store, cube=cube)

    aggregate = partial(
        _multiplex_store_dataset_from_partitions_flat,
        cube=cube,
        existing_datasets=existing_datasets,
        metadata=metadata,
        store=store,
        update=update,
    )

    return mps.reduction(perpartition=list,
                         aggregate=aggregate,
                         split_every=False,
                         out_type=db.Bag)
Example 8
def store_delayed_as_dataset(
    delayed_tasks,
    store,
    dataset_uuid=None,
    metadata=None,
    df_serializer=None,
    overwrite=False,
    metadata_merger=None,
    metadata_version=naming.DEFAULT_METADATA_VERSION,
    partition_on=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
    secondary_indices=None,
):
    """
    Transform a list of dictionaries containing dataframes and store them as a
    kartothek dataset in the given store.

    Parameters
    ----------
    delayed_tasks: list of dask.delayed
        Every delayed object represents a partition and should be accepted by
        :func:`~kartothek.io_components.metapartition.parse_input_to_metapartition`


    Returns
    -------
    A dask.delayed dataset object.
    """
    _check_callable(store)
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    input_to_mps = partial(parse_input_to_metapartition,
                           metadata_version=metadata_version)
    mps = map_delayed(delayed_tasks, input_to_mps)

    if partition_on:
        mps = map_delayed(mps,
                          MetaPartition.partition_on,
                          partition_on=partition_on)

    if secondary_indices:
        mps = map_delayed(mps,
                          MetaPartition.build_indices,
                          columns=secondary_indices)

    mps = map_delayed(
        mps,
        MetaPartition.store_dataframes,
        store=store,
        df_serializer=df_serializer,
        dataset_uuid=dataset_uuid,
    )

    return delayed(store_dataset_from_partitions)(
        mps,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_merger=metadata_merger,
        metadata_storage_format=metadata_storage_format,
    )
Example 9
def store_bag_as_dataset(
    bag,
    store,
    dataset_uuid=None,
    metadata=None,
    df_serializer=None,
    overwrite=False,
    metadata_merger=None,
    metadata_version=naming.DEFAULT_METADATA_VERSION,
    partition_on=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
    secondary_indices=None,
):
    """
    Transform a dask.bag of dictionaries containing dataframes and store it as a
    kartothek dataset in the given store.

    This is the dask.bag-equivalent of
    :func:`~kartothek.io.dask.delayed.store_delayed_as_dataset`. See there
    for more detailed documentation on the different possible input types.

    Parameters
    ----------
    bag: dask.bag.Bag
        A dask bag containing dictionaries of dataframes or dataframes.

    """
    store = lazy_store(store)
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    raise_if_indices_overlap(partition_on, secondary_indices)

    input_to_mps = partial(parse_input_to_metapartition,
                           metadata_version=metadata_version)
    mps = bag.map(input_to_mps)

    if partition_on:
        mps = mps.map(MetaPartition.partition_on, partition_on=partition_on)

    if secondary_indices:
        mps = mps.map(MetaPartition.build_indices, columns=secondary_indices)

    mps = mps.map(
        MetaPartition.store_dataframes,
        store=store,
        df_serializer=df_serializer,
        dataset_uuid=dataset_uuid,
    )

    aggregate = partial(
        _store_dataset_from_partitions_flat,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_merger=metadata_merger,
        metadata_storage_format=metadata_storage_format,
    )

    return mps.reduction(perpartition=list,
                         aggregate=aggregate,
                         split_every=False)
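
A usage sketch for the bag variant, assuming one plain DataFrame per bag partition and a picklable store factory (placeholder path); the partitions are written in parallel and the single aggregate task performs the commit.

import dask.bag as db
import pandas as pd
from functools import partial
from storefact import get_store_from_url

# Assumed picklable store factory.
store_factory = partial(get_store_from_url, "hfs:///tmp/ktk_bag_example")

bag = db.from_sequence(
    [pd.DataFrame({"part": [i], "value": [i * 10.0]}) for i in range(3)],
    npartitions=3,
)

graph = store_bag_as_dataset(
    bag,
    store=store_factory,
    dataset_uuid="bag_example_uuid",
    partition_on=["part"],
)
dataset = graph.compute()  # the aggregate task returns the DatasetMetadata
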
Example 10
def test_raise_if_dataset_exists(store_factory, dataset_function):
    raise_if_dataset_exists(dataset_uuid="ThisDoesNotExist",
                            store=store_factory)
    with pytest.raises(RuntimeError):
        raise_if_dataset_exists(dataset_uuid=dataset_function.uuid,
                                store=store_factory)
Example 11
def store_dataset_from_ddf(
    ddf: dd.DataFrame,
    store: StoreInput,
    dataset_uuid: str,
    table: str = SINGLE_TABLE,
    secondary_indices: Optional[List[str]] = None,
    shuffle: bool = False,
    repartition_ratio: Optional[SupportsFloat] = None,
    num_buckets: int = 1,
    sort_partitions_by: Optional[Union[List[str], str]] = None,
    metadata: Optional[Mapping] = None,
    df_serializer: Optional[DataFrameSerializer] = None,
    metadata_merger: Optional[Callable] = None,
    metadata_version: int = DEFAULT_METADATA_VERSION,
    partition_on: Optional[List[str]] = None,
    bucket_by: Optional[Union[List[str], str]] = None,
    overwrite: bool = False,
):
    """
    Store a dataset from a dask.dataframe.
    """
    # normalization is done by the normalize_args decorator (not shown here), but mypy doesn't recognize this
    sort_partitions_by = cast(List[str], sort_partitions_by)
    secondary_indices = cast(List[str], secondary_indices)
    bucket_by = cast(List[str], bucket_by)
    partition_on = cast(List[str], partition_on)

    if table is None:
        raise TypeError("The parameter `table` is not optional.")

    ds_factory = _ensure_factory(dataset_uuid=dataset_uuid,
                                 store=store,
                                 factory=None)

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)
    mp_ser = _write_dataframe_partitions(
        ddf=ddf,
        store=ds_factory.store_factory,
        dataset_uuid=dataset_uuid,
        table=table,
        secondary_indices=secondary_indices,
        shuffle=shuffle,
        repartition_ratio=repartition_ratio,
        num_buckets=num_buckets,
        sort_partitions_by=sort_partitions_by,
        df_serializer=df_serializer,
        metadata_version=metadata_version,
        partition_on=partition_on,
        bucket_by=bucket_by,
    )
    return mp_ser.reduction(
        chunk=_id,
        aggregate=_commit_store_from_reduction,
        split_every=False,
        token="commit-dataset",
        meta=object,
        aggregate_kwargs={
            "store": ds_factory.store_factory,
            "dataset_uuid": ds_factory.dataset_uuid,
            "dataset_metadata": metadata,
            "metadata_merger": metadata_merger,
        },
    )