Example #1
def _map_ktk_mps_to_groups(cube, datasets, label2gp):
    """
    Map Kartothek metapartitions to groups.

    Parameters
    ----------
    cube: Cube
        Cube specification.
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that are processed by the regrouper.
    label2gp: Dict[str, Dict[str, List[Tuple[int, int]]]]
        Maps "dataset ID -> (label -> list of (group ID, partition ID))".

    Returns
    -------
    groups: Dict[int, Dict[int, Dict[str, Tuple[kartothek.io_components.metapartition.MetaPartition, ...]]]]
        Maps "group ID -> (partition ID -> (dataset ID -> list of MetaPartitions))"
    """
    groups = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    for ktk_cube_dataset_id, ds in datasets.items():
        label2gp_sub = label2gp[ktk_cube_dataset_id]
        for mp in dispatch_metapartitions_from_factory(
            dataset_factory=metadata_factory_from_dataset(ds)
        ):
            # FIXME: can this be simplified?
            if mp.label not in label2gp_sub:
                # filtered out by pre-condition
                continue
            for group_id, partition_id in label2gp_sub[mp.label]:
                groups[group_id][partition_id][ktk_cube_dataset_id].append(mp)

    return groups
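
The nested mapping returned above is a defaultdict three levels deep. A minimal, dependency-free sketch of how a caller might flatten it, based only on the documented shape "group ID -> partition ID -> dataset ID -> list of MetaPartitions"; `flatten_groups` is a hypothetical helper, not part of kartothek.

def flatten_groups(groups):
    """Yield (group_id, partition_id, dataset_id, metapartitions) tuples from the
    nested mapping produced by _map_ktk_mps_to_groups."""
    for group_id, partitions in sorted(groups.items()):
        for partition_id, datasets in sorted(partitions.items()):
            for dataset_id, mps in datasets.items():
                yield group_id, partition_id, dataset_id, mps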
Example #2
def delete_dataset(dataset_uuid=None, store=None, factory=None):
    """
    Parameters
    ----------
    """

    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        load_schema=False,
        store=_make_callable(store),
        factory=factory,
        load_dataset_metadata=False,
    )

    # Remove possibly unreferenced files
    garbage_collect_dataset(factory=ds_factory)

    # Delete indices first since they do not affect dataset integrity
    delete_indices(dataset_factory=ds_factory)

    for metapartition in dispatch_metapartitions_from_factory(ds_factory):
        metapartition = cast(MetaPartition, metapartition)
        metapartition.delete_from_store(dataset_uuid=dataset_uuid, store=store)

    # delete common metadata after partitions
    delete_common_metadata(dataset_factory=ds_factory)

    # Delete the top level metadata file
    delete_top_level_metadata(dataset_factory=ds_factory)
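
A minimal usage sketch for the eager deletion shown above, assuming the function is exposed as `kartothek.io.eager.delete_dataset` and that a storefact-backed store factory is used; the store URL and dataset UUID are placeholders.

from functools import partial

from storefact import get_store_from_url  # assumption: storefact provides the store factory
from kartothek.io.eager import delete_dataset  # assumption: eager entry point for the function above

store_factory = partial(get_store_from_url, "hfs:///tmp/ktk_example")  # placeholder store URL
delete_dataset(dataset_uuid="example_dataset", store=store_factory)  # removes files, indices and metadata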
Example #3
def delete_dataset__delayed(dataset_uuid=None, store=None, factory=None):
    """
    Parameters
    ----------
    """
    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_schema=False,
        load_dataset_metadata=False,
    )

    gc = garbage_collect_dataset__delayed(factory=dataset_factory)

    mps = dispatch_metapartitions_from_factory(dataset_factory)

    delayed_dataset_uuid = delayed(_delete_all_additional_metadata)(
        dataset_factory=dataset_factory)

    mps = map_delayed(
        mps,
        MetaPartition.delete_from_store,
        store=store,
        dataset_uuid=delayed_dataset_uuid,
    )

    return delayed(_delete_tl_metadata)(dataset_factory, mps, gc)
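
Unlike the eager variant, the function above only assembles a task graph; nothing is deleted until the returned delayed object is computed. A usage sketch, assuming the function is importable from `kartothek.io.dask.delayed`; store URL and dataset UUID are placeholders.

from functools import partial

from storefact import get_store_from_url  # assumption: storefact provides the store factory
from kartothek.io.dask.delayed import delete_dataset__delayed  # assumption: delayed entry point

store_factory = partial(get_store_from_url, "hfs:///tmp/ktk_example")  # placeholder store URL
task = delete_dataset__delayed(dataset_uuid="example_dataset", store=store_factory)
task.compute()  # the actual deletion only happens when the graph is executed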
Example #4
def read_dataset_as_metapartitions__iterator(
    dataset_uuid=None,
    store=None,
    columns=None,
    predicate_pushdown_to_io=True,
    categoricals=None,
    dates_as_object: bool = True,
    predicates=None,
    factory=None,
    dispatch_by=None,
):
    """

    A Python iterator to retrieve a dataset from store where each
    partition is loaded as a :class:`~kartothek.io_components.metapartition.MetaPartition`.

    .. seealso::

        :func:`~kartothek.io_components.read.read_dataset_as_dataframes__iterator`

    Parameters
    ----------

    """

    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid, store=store, factory=factory,
    )

    store = ds_factory.store
    mps = dispatch_metapartitions_from_factory(
        ds_factory, predicates=predicates, dispatch_by=dispatch_by,
    )

    for mp in mps:
        if dispatch_by is not None:
            mp = MetaPartition.concat_metapartitions(
                [
                    mp_inner.load_dataframes(
                        store=store,
                        columns=columns,
                        categoricals=categoricals,
                        predicate_pushdown_to_io=predicate_pushdown_to_io,
                        predicates=predicates,
                    )
                    for mp_inner in mp
                ]
            )
        else:
            mp = cast(MetaPartition, mp)
            mp = mp.load_dataframes(
                store=store,
                columns=columns,
                categoricals=categoricals,
                predicate_pushdown_to_io=predicate_pushdown_to_io,
                dates_as_object=dates_as_object,
                predicates=predicates,
            )
        yield mp
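
A consumption sketch for the iterator above, assuming it is importable from `kartothek.io.iter`; the store URL, dataset UUID and the predicate column are placeholders, and the exact shape of `mp.data` depends on the kartothek version.

from functools import partial

from storefact import get_store_from_url  # assumption: storefact provides the store factory
from kartothek.io.iter import read_dataset_as_metapartitions__iterator  # assumption: iterator entry point

store_factory = partial(get_store_from_url, "hfs:///tmp/ktk_example")  # placeholder store URL
mp_iter = read_dataset_as_metapartitions__iterator(
    dataset_uuid="example_dataset",
    store=store_factory,
    predicates=[[("country", "==", "DE")]],  # placeholder predicate on a column of the dataset
)
for mp in mp_iter:
    print(mp.label)  # each item is a loaded MetaPartition; mp.data holds the loaded DataFrame(s)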
Example #5
File: bag.py Project: DD5HT/kartothek
def build_dataset_indices__bag(
    store, dataset_uuid, columns, partition_size=None, factory=None
):
    """
    Function which builds a :class:`~kartothek.core.index.ExplicitSecondaryIndex`.

    This function loads the dataset, computes the requested indices and writes
    the indices to the dataset. The dataset partitions themselves are not mutated.

    Parameters
    ----------
    partition_size: Optional[int]
        Dask bag partition size. Use larger numbers to decrease scheduler load and overhead; use smaller
        numbers for fine-grained scheduling and better resilience against worker errors.

    Returns
    -------
    A dask.delayed computation object.
    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    cols_to_load = {
        table: set(columns) & set(meta.names)
        for table, meta in ds_factory.table_meta.items()
    }
    cols_to_load = {table: cols for table, cols in cols_to_load.items() if cols}

    mps = dispatch_metapartitions_from_factory(ds_factory)

    return (
        db.from_sequence(seq=mps, partition_size=partition_size)
        .map(
            MetaPartition.load_dataframes,
            store=ds_factory.store_factory,
            tables=list(cols_to_load.keys()),
            columns=cols_to_load,
        )
        .map(MetaPartition.build_indices, columns=columns)
        .map(MetaPartition.remove_dataframes)
        .reduction(list, list, split_every=False, out_type=db.Bag)
        .flatten()
        .map_partitions(list)
        .map_partitions(
            update_indices_from_partitions, dataset_metadata_factory=ds_factory
        )
    )
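
A sketch of how the bag-backed index build above might be driven, assuming it is exposed in `kartothek.io.dask.bag`; the store URL, dataset UUID and index column are placeholders.

from functools import partial

from storefact import get_store_from_url  # assumption: storefact provides the store factory
from kartothek.io.dask.bag import build_dataset_indices__bag  # assumption: dask.bag entry point

store_factory = partial(get_store_from_url, "hfs:///tmp/ktk_example")  # placeholder store URL
graph = build_dataset_indices__bag(
    store=store_factory,
    dataset_uuid="example_dataset",
    columns=["country"],  # placeholder: column(s) to build a secondary index for
    partition_size=10,    # bundle 10 metapartitions per dask.bag partition
)
graph.compute()  # the indices are only written when the graph is executed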
Example #6
def build_dataset_indices__bag(
    store: Optional[StoreInput],
    dataset_uuid: Optional[str],
    columns: Sequence[str],
    partition_size: Optional[int] = None,
    factory: Optional[DatasetFactory] = None,
) -> Delayed:
    """
    Function which builds a :class:`~kartothek.core.index.ExplicitSecondaryIndex`.

    This function loads the dataset, computes the requested indices and writes
    the indices to the dataset. The dataset partitions themselves are not mutated.

    Parameters
    ----------

    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    cols_to_load = {
        table: set(columns) & set(meta.names)
        for table, meta in ds_factory.table_meta.items()
    }
    cols_to_load = {table: cols for table, cols in cols_to_load.items() if cols}

    mps = dispatch_metapartitions_from_factory(ds_factory)

    return (
        db.from_sequence(seq=mps, partition_size=partition_size)
        .map(
            MetaPartition.load_dataframes,
            store=ds_factory.store_factory,
            tables=list(cols_to_load.keys()),
            columns=cols_to_load,
        )
        .map(MetaPartition.build_indices, columns=columns)
        .map(MetaPartition.remove_dataframes)
        .reduction(list, list, split_every=False, out_type=db.Bag)
        .flatten()
        .map_partitions(list)
        .map_partitions(
            update_indices_from_partitions, dataset_metadata_factory=ds_factory
        )
    )
Example #7
File: stats.py Project: xhochy/kartothek
def get_metapartitions_for_stats(datasets):
    """
    Get all metapartitions that need to be scanned to gather cube stats.

    Parameters
    ----------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that are present.

    Returns
    -------
    metapartitions: Tuple[Tuple[str, Tuple[kartothek.io_components.metapartition.MetaPartition, ...]], ...]
        Pre-aligned metapartitions (by primary index / physical partitions) and the ktk_cube dataset ID belonging to them.
    """
    all_metapartitions = []
    for ktk_cube_dataset_id, ds in datasets.items():
        dataset_factory = metadata_factory_from_dataset(ds)
        for mp in dispatch_metapartitions_from_factory(
                dataset_factory=dataset_factory,
                dispatch_by=dataset_factory.partition_keys):
            all_metapartitions.append((ktk_cube_dataset_id, mp))
    return all_metapartitions
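
Because the helper above returns flat `(ktk_cube_dataset_id, metapartitions)` pairs, a stats collector typically regroups them per cube dataset first. A minimal, dependency-free sketch of that step; `group_by_cube_dataset` is a hypothetical helper, not part of kartothek.

from collections import defaultdict


def group_by_cube_dataset(all_metapartitions):
    """Collect the (dataset_id, metapartitions) pairs returned by
    get_metapartitions_for_stats into one list per ktk_cube dataset ID."""
    per_dataset = defaultdict(list)
    for ktk_cube_dataset_id, mps in all_metapartitions:
        per_dataset[ktk_cube_dataset_id].append(mps)
    return dict(per_dataset)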
Example #8
def build_dataset_indices(store, dataset_uuid, columns, factory=None):
    """
    Function which builds a :class:`~kartothek.core.index.ExplicitSecondaryIndex`.

    This function loads the dataset, computes the requested indices and writes
    the indices to the dataset. The dataset partitions themselves are not mutated.

    Parameters
    ----------
    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    cols_to_load = {
        table: set(columns) & set(meta.names)
        for table, meta in ds_factory.table_meta.items()
    }
    cols_to_load = {
        table: cols
        for table, cols in cols_to_load.items() if cols
    }

    new_partitions = []
    for mp in dispatch_metapartitions_from_factory(ds_factory):
        mp = mp.load_dataframes(
            store=ds_factory.store,
            tables=list(cols_to_load.keys()),
            columns=cols_to_load,
        )
        mp = mp.build_indices(columns=columns)
        mp = mp.remove_dataframes()  # Remove dataframe from memory
        new_partitions.append(mp)

    return update_indices_from_partitions(new_partitions,
                                          dataset_metadata_factory=ds_factory)
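
A usage sketch for the eager index build above, assuming it is exposed as `kartothek.io.eager.build_dataset_indices`; the store URL, dataset UUID and index column are placeholders.

from functools import partial

from storefact import get_store_from_url  # assumption: storefact provides the store factory
from kartothek.io.eager import build_dataset_indices  # assumption: eager entry point

store_factory = partial(get_store_from_url, "hfs:///tmp/ktk_example")  # placeholder store URL
build_dataset_indices(
    store=store_factory,
    dataset_uuid="example_dataset",
    columns=["country"],  # placeholder: column to build a secondary index for
)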
Example #9
def read_dataset_as_metapartitions_bag(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    load_dataset_metadata=False,
    predicates=None,
    factory=None,
    dispatch_by=None,
    partition_size=None,
    dispatch_metadata=True,
):
    """
    Retrieve dataset as `dask.bag.Bag` of `MetaPartition` objects.

    Parameters
    ----------

    Returns
    -------
    dask.bag.Bag:
        A dask.bag object containing the metapartitions.
    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=load_dataset_metadata,
    )

    if len(ds_factory.tables) > 1:
        warnings.warn(
            "Trying to read a dataset with multiple internal tables. This functionality will be removed in the next "
            "major release. If you require a multi tabled data format, we recommend to switch to the kartothek Cube "
            "functionality. "
            "https://kartothek.readthedocs.io/en/stable/guide/cube/kartothek_cubes.html",
            DeprecationWarning,
        )

    store = ds_factory.store_factory
    mps = dispatch_metapartitions_from_factory(
        dataset_factory=ds_factory,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        label_filter=label_filter,
        predicates=predicates,
        dispatch_by=dispatch_by,
        dispatch_metadata=dispatch_metadata,
    )
    mps = db.from_sequence(mps, partition_size=partition_size)

    if concat_partitions_on_primary_index or dispatch_by is not None:
        mps = mps.map(
            _load_and_concat_metapartitions_inner,
            store=store,
            tables=tables,
            columns=columns,
            categoricals=categoricals,
            predicate_pushdown_to_io=predicate_pushdown_to_io,
            dates_as_object=dates_as_object,
            predicates=predicates,
        )
    else:
        mps = mps.map(
            MetaPartition.load_dataframes,
            store=store,
            tables=tables,
            columns=columns,
            categoricals=categoricals,
            predicate_pushdown_to_io=predicate_pushdown_to_io,
            dates_as_object=dates_as_object,
            predicates=predicates,
        )

    categoricals_from_index = _maybe_get_categoricals_from_index(
        ds_factory, categoricals)

    if categoricals_from_index:
        func_dict = defaultdict(_identity)
        func_dict.update({
            table: partial(_cast_categorical_to_index_cat, categories=cats)
            for table, cats in categoricals_from_index.items()
        })
        mps = mps.map(MetaPartition.apply, func_dict, type_safe=True)
    return mps
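
A sketch of driving the bag reader above, assuming it is exposed in `kartothek.io.dask.bag`; the store URL, dataset UUID and partition size are placeholders.

from functools import partial

from storefact import get_store_from_url  # assumption: storefact provides the store factory
from kartothek.io.dask.bag import read_dataset_as_metapartitions_bag  # assumption: dask.bag entry point

store_factory = partial(get_store_from_url, "hfs:///tmp/ktk_example")  # placeholder store URL
bag = read_dataset_as_metapartitions_bag(
    dataset_uuid="example_dataset",
    store=store_factory,
    partition_size=5,  # placeholder: metapartitions per dask.bag partition
)
metapartitions = bag.compute()  # list of loaded MetaPartition objects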
Example #10
File: bag.py Project: DD5HT/kartothek
def read_dataset_as_metapartitions_bag(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    load_dataset_metadata=False,
    predicates=None,
    factory=None,
    dispatch_by=None,
    partition_size=None,
):
    """
    Retrieve dataset as `dask.bag` of `MetaPartition` objects.

    Parameters
    ----------

    Returns
    -------
    A dask.bag object containing the metapartitions.
    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=load_dataset_metadata,
    )
    store = ds_factory.store_factory
    mps = dispatch_metapartitions_from_factory(
        dataset_factory=ds_factory,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        label_filter=label_filter,
        predicates=predicates,
        dispatch_by=dispatch_by,
    )
    mps = db.from_sequence(mps, partition_size=partition_size)

    if concat_partitions_on_primary_index or dispatch_by:
        mps = mps.map(
            _load_and_concat_metapartitions_inner,
            store=store,
            tables=tables,
            columns=columns,
            categoricals=categoricals,
            predicate_pushdown_to_io=predicate_pushdown_to_io,
            dates_as_object=dates_as_object,
            predicates=predicates,
        )
    else:
        mps = mps.map(
            MetaPartition.load_dataframes,
            store=store,
            tables=tables,
            columns=columns,
            categoricals=categoricals,
            predicate_pushdown_to_io=predicate_pushdown_to_io,
            dates_as_object=dates_as_object,
            predicates=predicates,
        )

    categoricals_from_index = _maybe_get_categoricals_from_index(
        ds_factory, categoricals
    )

    if categoricals_from_index:
        func_dict = defaultdict(_identity)
        func_dict.update(
            {
                table: partial(_cast_categorical_to_index_cat, categories=cats)
                for table, cats in categoricals_from_index.items()
            }
        )
        mps = mps.map(MetaPartition.apply, func_dict, type_safe=True)
    return mps
Example #11
def read_dataset_as_delayed_metapartitions(
    dataset_uuid=None,
    store=None,
    columns=None,
    predicate_pushdown_to_io=True,
    categoricals: Optional[Sequence[str]] = None,
    dates_as_object: bool = True,
    predicates=None,
    factory=None,
    dispatch_by=None,
):
    """
    A collection of dask.delayed objects to retrieve a dataset from store where each
    partition is loaded as a :class:`~kartothek.io_components.metapartition.MetaPartition`.

    .. seealso::

        :func:`~kartothek.io.dask.read_dataset_as_delayed`

    Parameters
    ----------

    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid, store=store, factory=factory,
    )

    store = ds_factory.store_factory
    mps = dispatch_metapartitions_from_factory(
        dataset_factory=ds_factory, predicates=predicates, dispatch_by=dispatch_by,
    )

    if dispatch_by is not None:
        mps = _load_and_concat_metapartitions(
            mps,
            store=store,
            columns=columns,
            categoricals=categoricals,
            predicate_pushdown_to_io=predicate_pushdown_to_io,
            dates_as_object=dates_as_object,
            predicates=predicates,
        )
    else:
        mps = map_delayed(
            MetaPartition.load_dataframes,
            mps,
            store=store,
            columns=columns,
            categoricals=categoricals,
            predicate_pushdown_to_io=predicate_pushdown_to_io,
            dates_as_object=dates_as_object,
            predicates=predicates,
        )

    categoricals_from_index = _maybe_get_categoricals_from_index(
        ds_factory, categoricals
    )

    if categoricals_from_index:

        mps = map_delayed(
            partial(  # type: ignore
                MetaPartition.apply,
                func=partial(  # type: ignore
                    _cast_categorical_to_index_cat, categories=categoricals_from_index
                ),
                type_safe=True,
            ),
            mps,
        )

    return list(mps)
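
The function above returns a list of `dask.delayed` objects. A sketch of computing them, assuming the function is importable from `kartothek.io.dask.delayed`; the store URL, dataset UUID, column selection and `dispatch_by` column are placeholders.

from functools import partial

import dask
from storefact import get_store_from_url  # assumption: storefact provides the store factory
from kartothek.io.dask.delayed import read_dataset_as_delayed_metapartitions  # assumption: delayed entry point

store_factory = partial(get_store_from_url, "hfs:///tmp/ktk_example")  # placeholder store URL
delayed_mps = read_dataset_as_delayed_metapartitions(
    dataset_uuid="example_dataset",
    store=store_factory,
    columns=["country", "value"],  # placeholder column selection
    dispatch_by=["country"],       # placeholder: concatenate partitions per index value
)
metapartitions = dask.compute(*delayed_mps)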
Example #12
def read_dataset_as_delayed_metapartitions(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    load_dataset_metadata=False,
    predicates=None,
    factory=None,
    dispatch_by=None,
    dispatch_metadata=True,
):
    """
    A collection of dask.delayed objects to retrieve a dataset from store where each
    partition is loaded as a :class:`~kartothek.io_components.metapartition.MetaPartition`.

    .. seealso::

        :func:`~kartothek.io.dask.read_dataset_as_delayed`

    Parameters
    ----------

    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=load_dataset_metadata,
    )

    if len(ds_factory.tables) > 1:
        warnings.warn(
            "Trying to read a dataset with multiple internal tables. This functionality will be removed in the next "
            "major release. If you require a multi tabled data format, we recommend to switch to the kartothek Cube "
            "functionality. "
            "https://kartothek.readthedocs.io/en/stable/guide/cube/kartothek_cubes.html",
            DeprecationWarning,
        )

    store = ds_factory.store_factory
    mps = dispatch_metapartitions_from_factory(
        dataset_factory=ds_factory,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        label_filter=label_filter,
        predicates=predicates,
        dispatch_by=dispatch_by,
        dispatch_metadata=dispatch_metadata,
    )

    if concat_partitions_on_primary_index or dispatch_by is not None:
        mps = _load_and_concat_metapartitions(
            mps,
            store=store,
            tables=tables,
            columns=columns,
            categoricals=categoricals,
            predicate_pushdown_to_io=predicate_pushdown_to_io,
            dates_as_object=dates_as_object,
            predicates=predicates,
        )
    else:
        mps = map_delayed(
            MetaPartition.load_dataframes,
            mps,
            store=store,
            tables=tables,
            columns=columns,
            categoricals=categoricals,
            predicate_pushdown_to_io=predicate_pushdown_to_io,
            dates_as_object=dates_as_object,
            predicates=predicates,
        )

    categoricals_from_index = _maybe_get_categoricals_from_index(
        ds_factory, categoricals)

    if categoricals_from_index:
        func_dict = defaultdict(_identity)
        func_dict.update({
            table: partial(_cast_categorical_to_index_cat, categories=cats)
            for table, cats in categoricals_from_index.items()
        })
        mps = map_delayed(
            partial(MetaPartition.apply, func=func_dict, type_safe=True), mps)

    return list(mps)
Example #13
def read_dataset_as_metapartitions__iterator(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    load_dataset_metadata=False,
    predicates=None,
    factory=None,
):
    """

    A Python iterator to retrieve a dataset from store where each
    partition is loaded as a :class:`~kartothek.io_components.metapartition.MetaPartition`.

    .. seealso::

        :func:`~kartothek.io_components.read.read_dataset_as_dataframes__iterator`
    """

    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=load_dataset_metadata,
    )
    store = ds_factory.store
    mps = dispatch_metapartitions_from_factory(
        ds_factory,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        label_filter=label_filter,
        predicates=predicates,
    )

    for mp in mps:
        if concat_partitions_on_primary_index:
            mp = MetaPartition.concat_metapartitions([
                mp_inner.load_dataframes(
                    store=store,
                    tables=tables,
                    columns=columns,
                    categoricals=categoricals,
                    predicate_pushdown_to_io=predicate_pushdown_to_io,
                    predicates=predicates,
                ) for mp_inner in mp
            ])
        else:
            mp = mp.load_dataframes(
                store=store,
                tables=tables,
                columns=columns,
                categoricals=categoricals,
                predicate_pushdown_to_io=predicate_pushdown_to_io,
                dates_as_object=dates_as_object,
                predicates=predicates,
            )
        yield mp
Example #14
def read_dataset_as_metapartitions__iterator(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    load_dataset_metadata=False,
    predicates=None,
    factory=None,
    dispatch_by=None,
    dispatch_metadata=True,
):
    """

    A Python iterator to retrieve a dataset from store where each
    partition is loaded as a :class:`~kartothek.io_components.metapartition.MetaPartition`.

    .. seealso::

        :func:`~kartothek.io_components.read.read_dataset_as_dataframes__iterator`

    Parameters
    ----------

    """

    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=load_dataset_metadata,
    )

    if len(ds_factory.tables) > 1:
        warnings.warn(
            "Trying to read a dataset with multiple internal tables. This functionality will be removed in the next "
            "major release. If you require a multi tabled data format, we recommend to switch to the kartothek Cube "
            "functionality. "
            "https://kartothek.readthedocs.io/en/stable/guide/cube/kartothek_cubes.html",
            DeprecationWarning,
        )

    store = ds_factory.store
    mps = dispatch_metapartitions_from_factory(
        ds_factory,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        label_filter=label_filter,
        predicates=predicates,
        dispatch_by=dispatch_by,
        dispatch_metadata=dispatch_metadata,
    )

    for mp in mps:
        if concat_partitions_on_primary_index or dispatch_by is not None:
            mp = MetaPartition.concat_metapartitions(
                [
                    mp_inner.load_dataframes(
                        store=store,
                        tables=tables,
                        columns=columns,
                        categoricals=categoricals,
                        predicate_pushdown_to_io=predicate_pushdown_to_io,
                        predicates=predicates,
                    )
                    for mp_inner in mp
                ]
            )
        else:
            mp = cast(MetaPartition, mp)
            mp = mp.load_dataframes(
                store=store,
                tables=tables,
                columns=columns,
                categoricals=categoricals,
                predicate_pushdown_to_io=predicate_pushdown_to_io,
                dates_as_object=dates_as_object,
                predicates=predicates,
            )
        yield mp
Example #15
def read_dataset_as_delayed_metapartitions(
    dataset_uuid=None,
    store=None,
    tables=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    load_dataset_metadata=False,
    predicates=None,
    factory=None,
    dispatch_by=None,
):
    """
    A collection of dask.delayed objects to retrieve a dataset from store where each
    partition is loaded as a :class:`~kartothek.io_components.metapartition.MetaPartition`.

    .. seealso::

        :func:`~kartothek.io.dask.read_dataset_as_delayed`

    Parameters
    ----------

    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=load_dataset_metadata,
    )
    store = ds_factory.store_factory
    mps = dispatch_metapartitions_from_factory(
        dataset_factory=ds_factory,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        label_filter=label_filter,
        predicates=predicates,
        dispatch_by=dispatch_by,
    )

    if concat_partitions_on_primary_index or dispatch_by:
        mps = _load_and_concat_metapartitions(
            mps,
            store=store,
            tables=tables,
            columns=columns,
            categoricals=categoricals,
            predicate_pushdown_to_io=predicate_pushdown_to_io,
            dates_as_object=dates_as_object,
            predicates=predicates,
        )
    else:
        mps = map_delayed(
            mps,
            MetaPartition.load_dataframes,
            store=store,
            tables=tables,
            columns=columns,
            categoricals=categoricals,
            predicate_pushdown_to_io=predicate_pushdown_to_io,
            dates_as_object=dates_as_object,
            predicates=predicates,
        )

    categoricals_from_index = _maybe_get_categoricals_from_index(
        ds_factory, categoricals)

    if categoricals_from_index:
        func_dict = defaultdict(_identity)
        func_dict.update({
            table: partial(_cast_categorical_to_index_cat, categories=cats)
            for table, cats in categoricals_from_index.items()
        })
        mps = map_delayed(mps, MetaPartition.apply, func_dict, type_safe=True)

    return mps
Example #16
def collect_dataset_metadata(
    store: Optional[StoreInput] = None,
    dataset_uuid: Optional[str] = None,
    table_name: str = SINGLE_TABLE,
    predicates: Optional[PredicatesType] = None,
    frac: float = 1.0,
    factory: Optional[DatasetFactory] = None,
) -> dd.DataFrame:
    """
    Collect parquet metadata of the dataset. The `frac` parameter can be used to select a subset of the data.

    .. warning::
      If the partition sizes are unevenly distributed, e.g. if some partitions are much larger than others,
      the returned metadata is not a good approximation of the metadata for the whole dataset.
    .. warning::
      Using the `frac` parameter is not encouraged for a small number of total partitions.


    Parameters
    ----------
    predicates
      Kartothek predicates to apply filters on the data for which to gather statistics

      .. warning::
          Filtering will only be applied for predicates on indices.
          The evaluation of the predicates will therefore only return an approximate result.

    frac
      Fraction of the total number of partitions to use for gathering statistics. `frac == 1.0` will use all partitions.

    Returns
    -------
    dask.dataframe.DataFrame:
        A dask.DataFrame containing the following information about dataset statistics:
        * `partition_label`: File name of the parquet file, unique to each physical partition.
        * `row_group_id`: Index of the row groups within one parquet file.
        * `row_group_compressed_size`: Byte size of the data within one row group.
        * `row_group_uncompressed_size`: Byte size (uncompressed) of the data within one row group.
        * `number_rows_total`: Total number of rows in one parquet file.
        * `number_row_groups`: Number of row groups in one parquet file.
        * `serialized_size`: Serialized size of the parquet file.
        * `number_rows_per_row_group`: Number of rows per row group.

    Raises
    ------
    ValueError
      If no metadata could be retrieved, raise an error.

    """
    if not 0.0 < frac <= 1.0:
        raise ValueError(
            f"Invalid value for parameter `frac`: {frac}."
            "Please make sure to provide a value larger than 0.0 and smaller than or equal to 1.0 ."
        )
    dataset_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=store,
        factory=factory,
        load_dataset_metadata=False,
    )

    mps = list(
        dispatch_metapartitions_from_factory(dataset_factory, predicates=predicates)
    )
    if mps:
        random.shuffle(mps)
        # ensure that even with sampling at least one metapartition is returned
        cutoff_index = max(1, int(len(mps) * frac))
        mps = mps[:cutoff_index]
        ddf = dd.from_delayed(
            [
                dask.delayed(MetaPartition.get_parquet_metadata)(
                    mp, store=dataset_factory.store_factory, table_name=table_name
                )
                for mp in mps
            ],
            meta=_METADATA_SCHEMA,
        )
    else:
        df = pd.DataFrame(columns=_METADATA_SCHEMA.keys())
        df = df.astype(_METADATA_SCHEMA)
        ddf = dd.from_pandas(df, npartitions=1)

    return ddf
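
A sketch of collecting the parquet statistics described above, assuming the function is exposed as `kartothek.io.dask.dataframe.collect_dataset_metadata`; the store URL and dataset UUID are placeholders.

from functools import partial

from storefact import get_store_from_url  # assumption: storefact provides the store factory
from kartothek.io.dask.dataframe import collect_dataset_metadata  # assumption: dask.dataframe entry point

store_factory = partial(get_store_from_url, "hfs:///tmp/ktk_example")  # placeholder store URL
ddf = collect_dataset_metadata(
    store=store_factory,
    dataset_uuid="example_dataset",
    frac=0.2,  # sample roughly 20% of the partitions
)
stats = ddf.compute()  # pandas DataFrame with the row-group statistics listed above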
Example #17
def read_dataset_as_metapartitions_bag(
    dataset_uuid=None,
    store=None,
    columns=None,
    predicate_pushdown_to_io=True,
    categoricals=None,
    dates_as_object: bool = True,
    predicates=None,
    factory=None,
    dispatch_by=None,
    partition_size=None,
):
    """
    Retrieve dataset as `dask.bag.Bag` of `MetaPartition` objects.

    Parameters
    ----------

    Returns
    -------
    dask.bag.Bag:
        A dask.bag object containing the metapartitions.
    """
    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid, store=store, factory=factory,
    )

    store = ds_factory.store_factory
    mps = dispatch_metapartitions_from_factory(
        dataset_factory=ds_factory, predicates=predicates, dispatch_by=dispatch_by,
    )
    mp_bag = db.from_sequence(mps, partition_size=partition_size)

    if dispatch_by is not None:
        mp_bag = mp_bag.map(
            _load_and_concat_metapartitions_inner,
            store=store,
            columns=columns,
            categoricals=categoricals,
            predicate_pushdown_to_io=predicate_pushdown_to_io,
            dates_as_object=dates_as_object,
            predicates=predicates,
        )
    else:
        mp_bag = mp_bag.map(
            MetaPartition.load_dataframes,
            store=store,
            columns=columns,
            categoricals=categoricals,
            predicate_pushdown_to_io=predicate_pushdown_to_io,
            dates_as_object=dates_as_object,
            predicates=predicates,
        )

    categoricals_from_index = _maybe_get_categoricals_from_index(
        ds_factory, categoricals
    )

    if categoricals_from_index:

        mp_bag = mp_bag.map(
            MetaPartition.apply,
            func=partial(
                _cast_categorical_to_index_cat, categories=categoricals_from_index
            ),
            type_safe=True,
        )
    return mp_bag