Exemplo n.º 1
0
def _quick_concat_or_none(dfs, dimension_columns, partition_columns):
    dfs = list(dfs)
    if dfs:
        return quick_concat(
            dfs=dfs,
            dimension_columns=dimension_columns,
            partition_columns=partition_columns,
        )
    else:
        return None
Exemplo n.º 2
0
def query_cube(
    cube,
    store,
    conditions=None,
    datasets=None,
    dimension_columns=None,
    partition_by=None,
    payload_columns=None,
):
    """
    Query a cube and return the resulting data as a list of DataFrames.

    .. note::
        If ``partition_by`` is not given (the default), all loaded data is merged into a single partition. The
        result is then either a list with exactly one DataFrame or, if nothing non-empty was loaded (e.g. because
        of the provided conditions), an empty list.

    Parameters
    ----------
    cube: Cube
        Cube specification.
    store: simplekv.KeyValueStore
        KV store that preserves the cube.
    conditions: Union[None, Condition, Iterable[Condition], Conjunction]
        Conditions that should be applied, optional.
    datasets: Union[None, Iterable[str], Dict[str, kartothek.core.dataset.DatasetMetadata]]
        Datasets to query, must all be part of the cube. May be either the result of :meth:`discover_datasets`, a
        list of Ktk_cube dataset IDs or ``None`` (in which case auto-discovery will be used).
    dimension_columns: Union[None, str, Iterable[str]]
        Dimension columns of the query, may result in projection. If not provided, dimension columns from the cube
        specification will be used.
    partition_by: Union[None, str, Iterable[str]]
        Columns by which logical partitions should be formed. If not provided, a single partition will be
        generated.
    payload_columns: Union[None, str, Iterable[str]]
        Which columns apart from ``dimension_columns`` and ``partition_by`` should be returned.

    Returns
    -------
    dfs: List[pandas.DataFrame]
        List of non-empty DataFrames, ordered by ``partition_by``. Columns of the DataFrames are alphabetically
        ordered. Data types are provided on a best-effort basis (they are restored based on the preserved data, but
        may differ due to Pandas NULL-handling, e.g. integer columns may be floats).
    """
    # The second element of the plan is not needed here.
    intention, _, groups = plan_query(
        cube=cube,
        store=store,
        conditions=conditions,
        datasets=datasets,
        dimension_columns=dimension_columns,
        partition_by=partition_by,
        payload_columns=payload_columns,
    )

    # Load every group first, then drop the ones that turned out to be empty.
    loaded = [load_group(group=group, store=store, cube=cube) for group in groups]
    non_empty = [frame for frame in loaded if not frame.empty]

    # With explicit partitioning each group stays its own DataFrame; with no data there is nothing to merge.
    if intention.partition_by or not non_empty:
        return non_empty

    # No partitioning requested: collapse everything into one single-element result.
    merged = quick_concat(
        dfs=non_empty,
        dimension_columns=intention.dimension_columns,
        partition_columns=cube.partition_columns,
    )
    return [merged]