Example #1
File: read.py Project: seanahmad/kartothek
def dispatch_metapartitions(
    dataset_uuid,
    store,
    load_dataset_metadata=True,
    keep_indices=True,
    keep_table_meta=True,
    label_filter=None,
    concat_partitions_on_primary_index=False,
    predicates=None,
    dispatch_by=None,
) -> Union[Iterator[MetaPartition], Iterator[List[MetaPartition]]]:
    dataset_factory = DatasetFactory(
        dataset_uuid=dataset_uuid,
        store_factory=_make_callable(store),
        load_schema=True,
        load_all_indices=False,
        load_dataset_metadata=load_dataset_metadata,
    )

    return dispatch_metapartitions_from_factory(
        dataset_factory=dataset_factory,
        label_filter=label_filter,
        predicates=predicates,
        dispatch_by=dispatch_by,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
    )
Example #2
def delete_dataset(dataset_uuid=None, store=None, factory=None):
    """
    Parameters
    ----------
    """

    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        load_schema=False,
        store=_make_callable(store),
        factory=factory,
        load_dataset_metadata=False,
    )

    # Remove possibly unreferenced files
    garbage_collect_dataset(factory=ds_factory)

    # Delete indices first since they do not affect dataset integrity
    delete_indices(dataset_factory=ds_factory)

    for metapartition in dispatch_metapartitions_from_factory(ds_factory):
        metapartition = cast(MetaPartition, metapartition)
        metapartition.delete_from_store(dataset_uuid=dataset_uuid, store=store)

    # delete common metadata after partitions
    delete_common_metadata(dataset_factory=ds_factory)

    # Delete the top level metadata file
    delete_top_level_metadata(dataset_factory=ds_factory)
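
A minimal usage sketch of the teardown above, assuming a hypothetical dataset named "my_dataset"; the store URL mirrors the "hfs://my_store" placeholder used elsewhere on this page:

from functools import partial

import storefact
from kartothek.io.eager import delete_dataset

# Hypothetical store factory: a zero-argument callable returning the KeyValueStore.
store_factory = partial(storefact.get_store_from_url, "hfs://my_store")

# Removes indices, partition files, common metadata and finally the
# top-level metadata file, in the order shown above.
delete_dataset(dataset_uuid="my_dataset", store=store_factory)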
Example #3
def read_dataset_as_dataframes(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    predicates=None,
    factory=None,
):
    """
    Read a dataset as a list of dataframes.

    Every element of the list corresponds to a physical partition.

    Parameters
    ----------

    Returns
    -------
    List[Dict[str, pandas.DataFrame]]
        Returns a list with one element per partition. Each element maps the
        table name to that table's pandas.DataFrame.

    Examples
    --------
    Dataset in store contains two partitions with two files each

    .. code ::

        >>> import storefact
        >>> from kartothek.io.eager import read_dataset_as_dataframes

        >>> store = storefact.get_store_from_url('s3://bucket_with_dataset')

        >>> dfs = read_dataset_as_dataframes('dataset_uuid', store, 'core')

    """

    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=_make_callable(store),
        factory=factory,
        load_dataset_metadata=True,
    )

    mps = read_dataset_as_metapartitions(
        tables=tables,
        columns=columns,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        categoricals=categoricals,
        label_filter=label_filter,
        dates_as_object=dates_as_object,
        predicates=predicates,
        factory=ds_factory,
    )
    return [mp.data for mp in mps]
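
Since the function returns [mp.data for mp in mps], each list element is a mapping from table name to DataFrame for one physical partition (read_table in Example #8 indexes into it the same way). A short post-processing sketch; the dataset name, store URL, and the table name "core" are the placeholders from the docstring example:

import pandas as pd
import storefact
from kartothek.io.eager import read_dataset_as_dataframes

store = storefact.get_store_from_url("hfs://my_store")

# One element per physical partition, each mapping table name -> DataFrame.
partitions = read_dataset_as_dataframes("dataset_uuid", store, tables=["core"])

# Concatenate a single table across all partitions.
core_df = pd.concat(
    [partition_data["core"] for partition_data in partitions],
    ignore_index=True,
)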
Example #4
def dispatch_metapartitions(
    dataset_uuid: str,
    store: Union[KeyValueStore, Callable[[], KeyValueStore]],
    load_dataset_metadata: bool = True,
    keep_indices: bool = True,
    keep_table_meta: bool = True,
    label_filter: Optional[Callable] = None,
    concat_partitions_on_primary_index: bool = False,
    predicates: PredicatesType = None,
    dispatch_by: Optional[List[str]] = None,
) -> Union[Iterator[MetaPartition], Iterator[List[MetaPartition]]]:
    dataset_factory = DatasetFactory(
        dataset_uuid=dataset_uuid,
        store_factory=cast(Callable[[], KeyValueStore], _make_callable(store)),
        load_schema=True,
        load_all_indices=False,
        load_dataset_metadata=load_dataset_metadata,
    )

    return dispatch_metapartitions_from_factory(
        dataset_factory=dataset_factory,
        store=None,
        label_filter=label_filter,
        predicates=predicates,
        dispatch_by=dispatch_by,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
    )
Example #5
def create_empty_dataset_header(
    store,
    dataset_uuid,
    table_meta,
    partition_on=None,
    metadata=None,
    overwrite=False,
    metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT,
    metadata_version=DEFAULT_METADATA_VERSION,
):
    """
    Create a dataset header without any partitions. This may be used in combination
    with :func:`~kartothek.io.eager.write_single_partition` to create implicitly partitioned datasets.

    .. note::

        The created dataset will **always** have explicit_partitions==False

    .. warning::

        This function should only be used on very rare occasions. Usually you're better off using
        full end-to-end pipelines.

    Parameters
    ----------
    """
    store = _make_callable(store)()
    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    for table, schema in table_meta.items():
        table_meta[table] = make_meta(schema,
                                      origin=table,
                                      partition_keys=partition_on)
        store_schema_metadata(
            schema=table_meta[table],
            dataset_uuid=dataset_uuid,
            store=store,
            table=table,
        )
    dataset_builder = DatasetMetadataBuilder(
        uuid=dataset_uuid,
        metadata_version=metadata_version,
        partition_keys=partition_on,
        explicit_partitions=False,
        table_meta=table_meta,
    )
    if metadata:
        for key, value in metadata.items():
            dataset_builder.add_metadata(key, value)
    if metadata_storage_format.lower() == "json":
        store.put(*dataset_builder.to_json())
    elif metadata_storage_format.lower() == "msgpack":
        store.put(*dataset_builder.to_msgpack())
    else:
        raise ValueError(
            "Unknown metadata storage format encountered: {}".format(
                metadata_storage_format))
    return dataset_builder.to_dataset()
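
As the docstring notes, the empty header is meant to be paired with write_single_partition. A minimal sketch of that pairing, with the store URL, dataset name, and table name as illustrative assumptions; the example DataFrame doubles as the schema source because the implementation above normalizes table_meta values through make_meta:

from functools import partial

import pandas as pd
import storefact
from kartothek.io.eager import create_empty_dataset_header, write_single_partition

store_factory = partial(storefact.get_store_from_url, "hfs://my_store")
df = pd.DataFrame({"column": [1, 2]})

# 1. Write the dataset header and the per-table schemas up front.
create_empty_dataset_header(
    store=store_factory,
    dataset_uuid="my_dataset",
    table_meta={"table_1": df},
)

# 2. Partitions can now be written independently (and concurrently).
write_single_partition(
    store=store_factory,
    dataset_uuid="my_dataset",
    data={"data": {"table_1": df}},
)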
Example #6
def write_single_partition(
    store=None,
    dataset_uuid=None,
    data=None,
    metadata=None,
    df_serializer=None,
    overwrite=False,
    metadata_merger=None,
    metadata_version=DEFAULT_METADATA_VERSION,
    partition_on=None,
    factory=None,
    secondary_indices=None,
):
    """
    Write the parquet file(s) for a single partition. This will **not** update the dataset header and can therefore
    be used for highly concurrent dataset writes.

    For datasets with explicit partitions, the dataset header can be updated by calling
    :func:`kartothek.io.eager.commit_dataset` with the output of this function.

    .. note::

        It is highly recommended to use the full pipelines whenever possible. This functionality should be
        used with caution and should only be necessary in cases where traditional pipeline scheduling is not an
        option.

    .. note::

        This function requires an existing dataset metadata file and the schemas for the tables to be present.
        Either you have ensured that the dataset already exists through some other means or use
        :func:`create_empty_dataset_header` at the start of your computation to ensure the basic dataset
        metadata is there.

    Parameters
    ----------
    data: Dict
        The input is defined according to :func:`~kartothek.io_components.metapartition.parse_input_to_metapartition`

    Returns
    -------
    An empty :class:`~kartothek.io_components.metapartition.MetaPartition` referencing the new files
    """
    if metadata is not None:
        warnings.warn(
            "The keyword `metadata` has no use and will be removed in the next major release ",
            DeprecationWarning,
        )

    if overwrite is not False:
        warnings.warn(
            "The keyword `overwrite` has no use and will be removed in the next major release ",
            DeprecationWarning,
        )

    if metadata_merger is not None:
        warnings.warn(
            "The keyword `metadata_merger` has no use and will be removed in the next major release ",
            DeprecationWarning,
        )

    if data is None:
        raise TypeError("The parameter `data` is not optional")
    _, ds_metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=_make_callable(store),
        ds_factory=factory,
        default_metadata_version=metadata_version,
        partition_on=partition_on,
    )

    mp = parse_input_to_metapartition(obj=data,
                                      metadata_version=ds_metadata_version)
    if partition_on:
        mp = mp.partition_on(partition_on)

    if secondary_indices:
        mp = mp.build_indices(columns=secondary_indices)

    mp = mp.validate_schema_compatible(dataset_uuid=dataset_uuid, store=store)

    mp = mp.store_dataframes(store=store,
                             dataset_uuid=dataset_uuid,
                             df_serializer=df_serializer)
    return mp
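
The data argument follows parse_input_to_metapartition, and the returned MetaPartition is what commit_dataset (Example #7) consumes; until that commit, the written files are not referenced by the dataset header. A hedged write-then-commit sketch with placeholder names:

from functools import partial

import pandas as pd
import storefact
from kartothek.io.eager import commit_dataset, write_single_partition

store_factory = partial(storefact.get_store_from_url, "hfs://my_store")

# Write the files for one partition; the dataset header is not touched yet.
new_mp = write_single_partition(
    store=store_factory,
    dataset_uuid="my_dataset",
    data={"data": {"table_1": pd.DataFrame({"column": [3, 4]})}},
)

# Make the new files visible by committing the returned MetaPartition.
commit_dataset(
    store=store_factory,
    dataset_uuid="my_dataset",
    new_partitions=[new_mp],
)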
Example #7
def commit_dataset(
    store=None,
    dataset_uuid=None,
    new_partitions=NoDefault(),
    output_dataset_uuid=None,
    delete_scope=None,
    metadata=None,
    df_serializer=None,
    metadata_merger=None,
    default_metadata_version=DEFAULT_METADATA_VERSION,
    partition_on=None,
    factory=None,
    secondary_indices=None,
):
    """
    Update an existing dataset with new, already written partitions. This should be used in combination with
    :func:`~kartothek.io.eager.write_single_partition` and :func:`~kartothek.io.eager.create_empty_dataset_header`.

    .. note::

        It is highly recommended to use the full pipelines whenever possible. This functionality should be
        used with caution and should only be necessary in cases where traditional pipeline scheduling is not an
        option.

    Example:

        .. code::

            import storefact
            import pandas as pd
            from functools import partial
            from kartothek.io.eager import write_single_partition
            from kartothek.io.eager import commit_dataset

            store = partial(storefact.get_store_from_url, url="hfs://my_store")

            new_data={
                "data": {
                    "table_1": pd.DataFrame({'column': [1, 2]}),
                    "table_1": pd.DataFrame({'other_column': ['a', 'b']}),
                }
            }
            # The partition writing can be done concurrently and distributed if wanted.
            # Only the information about what partitions have been written is required for the commit.
            new_partitions = [
                write_single_partition(
                    store=store,
                    dataset_uuid='dataset_uuid',
                    data=new_data
                )
            ]

            new_dataset = commit_dataset(
                store=store,
                dataset_uuid='dataset_uuid',
                new_partitions=new_partitions
            )

    Parameters
    ----------
    new_partitions: List[kartothek.io_components.metapartition.MetaPartition]
        Input partitions to be committed.

    """
    if output_dataset_uuid is not None:
        warnings.warn(
            "The keyword `output_dataset_uuid` has no use and will be removed in the next major release ",
            DeprecationWarning,
        )

    if df_serializer is not None:
        warnings.warn(
            "The keyword `df_serializer` is deprecated and will be removed in the next major release.",
            DeprecationWarning,
        )

    if isinstance(new_partitions, NoDefault):
        raise TypeError("The parameter `new_partitions` is not optional")
    store = _make_callable(store)
    ds_factory, metadata_version, partition_on = validate_partition_keys(
        dataset_uuid=dataset_uuid,
        store=store,
        ds_factory=factory,
        default_metadata_version=default_metadata_version,
        partition_on=partition_on,
    )

    mps = parse_input_to_metapartition(new_partitions,
                                       metadata_version=metadata_version)

    if secondary_indices:
        mps = mps.build_indices(columns=secondary_indices)

    mps = [_maybe_infer_files_attribute(mp, dataset_uuid) for mp in mps]

    dmd = update_dataset_from_partitions(
        mps,
        store_factory=store,
        dataset_uuid=dataset_uuid,
        ds_factory=ds_factory,
        delete_scope=delete_scope,
        metadata=metadata,
        metadata_merger=metadata_merger,
    )
    return dmd
Example #8
def read_table(
    dataset_uuid=None,
    store=None,
    table=SINGLE_TABLE,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    predicates=None,
    factory=None,
):
    """
    A utility function to load a single table with multiple partitions as a single dataframe in one go.
    Mostly useful for smaller tables or datasets where all partitions fit into memory.

    The order of partitions is not guaranteed to be stable in the resulting dataframe.

    Parameters
    ----------

    Returns
    -------
    pandas.DataFrame
        Returns a pandas.DataFrame holding the data of the requested columns

    Examples
    --------
    Dataset in store contains two partitions with two files each

    .. code ::

        >>> import storefact
        >>> from kartothek.io.eager import read_table

        >>> store = storefact.get_store_from_url('s3://bucket_with_dataset')

        >>> df = read_table('dataset_uuid', store, 'core')

    """
    if concat_partitions_on_primary_index is not False:
        warnings.warn(
            "The keyword `concat_partitions_on_primary_index` is deprecated and will be removed in the next major release.",
            DeprecationWarning,
        )

    if not isinstance(table, str):
        raise TypeError("Argument `table` needs to be a string")

    columns = _check_compatible_list(table, columns, "columns")
    categoricals = _check_compatible_list(table, categoricals, "categoricals")

    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=_make_callable(store),
        factory=factory,
        load_dataset_metadata=False,
    )
    partitions = read_dataset_as_dataframes(
        tables=[table],
        columns=columns,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        categoricals=categoricals,
        label_filter=label_filter,
        dates_as_object=dates_as_object,
        predicates=predicates,
        factory=ds_factory,
    )

    empty_df = empty_dataframe_from_schema(
        schema=ds_factory.table_meta[table],
        columns=columns[table] if columns is not None else None,
    )
    dfs = [partition_data[table] for partition_data in partitions] + [empty_df]
    # requires metadata version 4; otherwise the types/columns cannot be constructed
    if categoricals:
        dfs = align_categories(dfs, categoricals[table])
    df = pd.concat(dfs, ignore_index=True, sort=False)

    # ensure column order
    if len(empty_df.columns) > 0:
        df = df.reindex(empty_df.columns, copy=False, axis=1)

    return df
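
Because read_table concatenates every matching partition into memory, it is usually narrowed with columns and predicates (kartothek predicates are lists of AND-conjunctions combined by OR). A sketch with placeholder dataset, store, table, and column names:

import storefact
from kartothek.io.eager import read_table

store = storefact.get_store_from_url("hfs://my_store")

df = read_table(
    "dataset_uuid",
    store,
    table="core",
    columns=["column_a", "column_b"],     # project only the required columns
    predicates=[[("column_a", ">", 1)]],  # [[(col, op, value), ...], ...]: inner lists ANDed, outer ORed
)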