Example #1
def store_delayed_as_dataset(
    delayed_tasks: List[Delayed],
    store,
    dataset_uuid=None,
    metadata=None,
    df_serializer=None,
    overwrite=False,
    metadata_merger=None,
    metadata_version=naming.DEFAULT_METADATA_VERSION,
    partition_on=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
    table_name: str = SINGLE_TABLE,
    secondary_indices=None,
) -> Delayed:
    """
    Transform and store a list of dictionaries containing
    dataframes to a kartothek dataset in store.

    Parameters
    ----------
    """
    store = lazy_store(store)  # normalize the input into a store factory
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    # Fail early if the dataset already exists and overwrite was not requested.
    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    # partition_on and secondary_indices must not share any columns.
    raise_if_indices_overlap(partition_on, secondary_indices)

    # Parse each delayed input into a MetaPartition, the internal unit of work.
    input_to_mps = partial(
        parse_input_to_metapartition,
        metadata_version=metadata_version,
        table_name=table_name,
    )
    mps = map_delayed(input_to_mps, delayed_tasks)

    if partition_on:
        # Split every metapartition by the values of the partition columns.
        mps = map_delayed(MetaPartition.partition_on, mps, partition_on=partition_on)

    if secondary_indices:
        # Build secondary indices over the requested columns.
        mps = map_delayed(MetaPartition.build_indices, mps, columns=secondary_indices)

    # Persist the dataframes; this also drops them from the metapartitions.
    mps = map_delayed(
        MetaPartition.store_dataframes,
        mps,
        store=store,
        df_serializer=df_serializer,
        dataset_uuid=dataset_uuid,
    )

    # A single final task collects all metapartitions and commits the dataset metadata.
    return delayed(store_dataset_from_partitions)(
        mps,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_merger=metadata_merger,
        metadata_storage_format=metadata_storage_format,
    )
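
A minimal usage sketch for the function above (the store URL and dataset name are made up; the sketch assumes kartothek's public import path kartothek.io.dask.delayed and a storefact-backed filesystem store):

import dask
import pandas as pd
from functools import partial
from storefact import get_store_from_url

from kartothek.io.dask.delayed import store_delayed_as_dataset

# kartothek expects a store *factory* so that every worker can open its own store.
store_factory = partial(get_store_from_url, "hfs:///tmp/ktk_delayed_example")

# Each delayed task resolves to a dataframe (dictionaries of dataframes work too).
delayed_tasks = [
    dask.delayed(pd.DataFrame)({"A": [1, 2], "B": ["x", "y"]}),
    dask.delayed(pd.DataFrame)({"A": [3, 4], "B": ["x", "z"]}),
]

task = store_delayed_as_dataset(
    delayed_tasks,
    store=store_factory,
    dataset_uuid="delayed_example",
    partition_on=["B"],
)
dataset_metadata = task.compute()  # nothing is written until compute() runs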
Example #2
def store_bag_as_dataset(
    bag,
    store,
    dataset_uuid=None,
    metadata=None,
    df_serializer=None,
    overwrite=False,
    metadata_merger=None,
    metadata_version=naming.DEFAULT_METADATA_VERSION,
    partition_on=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
    secondary_indices=None,
):
    """
    Transform and store a dask.bag of dictionaries containing
    dataframes to a kartothek dataset in store.

    This is the dask.bag-equivalent of
    :func:`~kartothek.io.dask.delayed.store_delayed_as_dataset`. See there
    for more detailed documentation on the different possible input types.

    Parameters
    ----------
    bag: dask.bag.Bag
        A dask bag containing dictionaries of dataframes or dataframes.

    """
    store = lazy_store(store)
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    raise_if_indices_overlap(partition_on, secondary_indices)

    # Parse each bag element into a MetaPartition.
    input_to_mps = partial(
        parse_input_to_metapartition, metadata_version=metadata_version
    )
    mps = bag.map(input_to_mps)

    if partition_on:
        mps = mps.map(MetaPartition.partition_on, partition_on=partition_on)

    if secondary_indices:
        mps = mps.map(MetaPartition.build_indices, columns=secondary_indices)

    mps = mps.map(
        MetaPartition.store_dataframes,
        store=store,
        df_serializer=df_serializer,
        dataset_uuid=dataset_uuid,
    )

    # The final reduction gathers all metapartitions into one flat list and
    # commits the dataset metadata in a single task.
    aggregate = partial(
        _store_dataset_from_partitions_flat,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_merger=metadata_merger,
        metadata_storage_format=metadata_storage_format,
    )

    return mps.reduction(
        perpartition=list, aggregate=aggregate, split_every=False
    )
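
A matching sketch for the bag variant (again with a made-up store URL, assuming the public import path kartothek.io.dask.bag):

import dask.bag as db
import pandas as pd
from functools import partial
from storefact import get_store_from_url

from kartothek.io.dask.bag import store_bag_as_dataset

store_factory = partial(get_store_from_url, "hfs:///tmp/ktk_bag_example")

# Spread the input dataframes over two bag partitions.
bag = db.from_sequence(
    [
        pd.DataFrame({"A": [1, 2], "B": ["x", "y"]}),
        pd.DataFrame({"A": [3, 4], "B": ["x", "z"]}),
    ],
    npartitions=2,
)

task = store_bag_as_dataset(
    bag,
    store=store_factory,
    dataset_uuid="bag_example",
    secondary_indices=["B"],
)
dataset_metadata = task.compute()  # the reduction yields one DatasetMetadata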
Example #3
File: iter.py  Project: lr4d/kartothek
def store_dataframes_as_dataset__iter(
    df_generator,
    store,
    dataset_uuid=None,
    metadata=None,
    partition_on=None,
    df_serializer=None,
    overwrite=False,
    metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT,
    metadata_version=DEFAULT_METADATA_VERSION,
    secondary_indices=None,
):
    """
    Store `pd.DataFrame` s iteratively as a partitioned dataset with multiple tables (files).

    Useful for datasets which do not fit into memory.

    Parameters
    ----------

    Returns
    -------
    dataset: kartothek.core.dataset.DatasetMetadata
        The stored dataset.

    """

    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    raise_if_indices_overlap(partition_on, secondary_indices)

    new_partitions = []
    for df in df_generator:
        mp = parse_input_to_metapartition(df, metadata_version=metadata_version)

        if partition_on:
            mp = mp.partition_on(partition_on)

        if secondary_indices:
            mp = mp.build_indices(secondary_indices)

        # Store the dataframe; this also removes it from the `mp` metapartition,
        # freeing its memory before the next chunk is processed.
        mp = mp.store_dataframes(
            store=store, dataset_uuid=dataset_uuid, df_serializer=df_serializer
        )

        # Track the stored partition via its
        # `kartothek.io_components.metapartition.MetaPartition` object.
        new_partitions.append(mp)

    # Store metadata and return `kartothek.DatasetMetadata` object
    return store_dataset_from_partitions(
        partition_list=new_partitions,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_storage_format=metadata_storage_format,
    )
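
A sketch of the iterator variant (store URL made up; assuming the public import path kartothek.io.iter and that ``store`` accepts a store factory, as the other kartothek write paths do). Because each chunk is parsed, stored, and released before the next one is pulled from the generator, only a single chunk has to fit into memory:

import pandas as pd
from functools import partial
from storefact import get_store_from_url

from kartothek.io.iter import store_dataframes_as_dataset__iter

store_factory = partial(get_store_from_url, "hfs:///tmp/ktk_iter_example")

def chunk_generator():
    # Yield one small dataframe at a time instead of materializing them all.
    for start in range(0, 100, 10):
        yield pd.DataFrame({"A": list(range(start, start + 10))})

dataset_metadata = store_dataframes_as_dataset__iter(
    chunk_generator(),
    store=store_factory,
    dataset_uuid="iter_example",
)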