Example #1
def read_table(
    dataset_uuid: Optional[str] = None,
    store=None,
    table: Optional[str] = SINGLE_TABLE,
    columns: Optional[Dict[str, List[str]]] = None,
    concat_partitions_on_primary_index: bool = False,
    predicate_pushdown_to_io: bool = True,
    categoricals: Optional[Dict[str, List[str]]] = None,
    label_filter: Optional[Callable] = None,
    dates_as_object: bool = False,
    predicates: Optional[List[List[Tuple[str, str, Any]]]] = None,
    factory: Optional[DatasetFactory] = None,
) -> pd.DataFrame:
    """
    A utility function to load a single table with multiple partitions as a single dataframe in one go.
    Mostly useful for smaller tables or datasets where all partitions fit into memory.

    The order of partitions is not guaranteed to be stable in the resulting dataframe.

    Parameters
    ----------
    table: str
        The table to be loaded
    columns: Dict[str, List[str]]
        A mapping from table name to the list of columns to be loaded
    categoricals: Dict[str, List[str]]
        A mapping from table name to a list of column names which should be retrieved as `pandas.Categorical`

    Returns
    -------
    pandas.DataFrame
        Returns a pandas.DataFrame holding the data of the requested columns

    Examples
    --------
    Dataset in store contains two partitions with two files each

    .. code ::

        >>> import storefact
        >>> from kartothek.io.eager import read_table

        >>> store = storefact.get_store_from_url('s3://bucket_with_dataset')

        >>> df = read_table(dataset_uuid='dataset_uuid', store=store, table='core')

    """
    if concat_partitions_on_primary_index is not False:
        warnings.warn(
            "The keyword `concat_partitions_on_primary_index` is deprecated and will be removed in the next major release.",
            DeprecationWarning,
        )

    if not isinstance(table, str):
        raise TypeError("Argument `table` needs to be a string")

    columns = _check_compatible_list(table, columns, "columns")
    categoricals = _check_compatible_list(table, categoricals, "categoricals")

    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=_make_callable(store),
        factory=factory,
        load_dataset_metadata=False,
    )
    partitions = read_dataset_as_dataframes(
        tables=[table],
        columns=columns,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        categoricals=categoricals,
        label_filter=label_filter,
        dates_as_object=dates_as_object,
        predicates=predicates,
        factory=ds_factory,
    )

    empty_df = empty_dataframe_from_schema(
        schema=ds_factory.table_meta[table],
        columns=columns[table] if columns is not None else None,
    )
    dfs = [partition_data[table] for partition_data in partitions] + [empty_df]
    # The trailing empty dataframe acts as meta information: without it, the
    # result's types/columns could not be constructed when there are no partitions.
    if categoricals:
        dfs = align_categories(dfs, categoricals[table])
    df = pd.concat(dfs, ignore_index=True, sort=False)

    # ensure column order
    if len(empty_df.columns) > 0:
        df = df.reindex(empty_df.columns, copy=False, axis=1)

    return df
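
A minimal usage sketch for the function above, showing the `columns` mapping and the `predicates` format from the signature, assuming the usual kartothek convention that the outer list is OR-connected and each inner list is an AND of `(column, operator, value)` tuples. The store URL, dataset UUID, and column names are hypothetical:

import storefact
from kartothek.io.eager import read_table

# Hypothetical local filesystem store; any storefact-supported URL works.
store = storefact.get_store_from_url("hfs:///tmp/kartothek_example")

# Read two (hypothetical) columns of the "core" table, restricted to rows
# where country == "DE" and year >= 2020.
df = read_table(
    dataset_uuid="dataset_uuid",
    store=store,
    table="core",
    columns={"core": ["country", "value"]},
    predicates=[[("country", "==", "DE"), ("year", ">=", 2020)]],
)
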
Example #2
def plan_query(
    conditions,
    cube,
    datasets,
    dimension_columns,
    partition_by,
    payload_columns,
    store,
):
    """
    Plan cube query execution.

    .. important::
        If the intention does not contain a partition-by, this partitions by the cube partition columns to speed
        up the query on parallel backends. In that case, the backend must concatenate and check the resulting
        dataframes before passing them to the user.

    Parameters
    ----------
    conditions: Union[None, Condition, Iterable[Condition], Conjunction]
        Conditions that should be applied.
    cube: Cube
        Cube specification.
    datasets: Union[None, Iterable[str], Dict[str, kartothek.core.dataset.DatasetMetadata]]
        Datasets to query, must all be part of the cube.
    dimension_columns: Optional[Iterable[str]]
        Dimension columns of the query, may result in projection.
    partition_by: Optional[Iterable[str]]
        By which column logical partitions should be formed.
    payload_columns: Optional[Iterable[str]]
        Which columns apart from ``dimension_columns`` and ``partition_by`` should be returned.
    store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        Store to query from.

    Returns
    -------
    intent: QueryIntention
        Query intention.
    empty_df: pandas.DataFrame
        Empty DataFrame representing the output types.
    groups: Tuple[QueryGroup]
        Tuple of query groups. May be empty.
    """
    if callable(store):
        store = store()

    if not isinstance(datasets, dict):
        datasets = discover_datasets(
            cube=cube, store=store, filter_ktk_cube_dataset_ids=datasets
        )
    else:
        datasets = check_datasets(datasets, cube)

    datasets = {
        ktk_cube_dataset_id: ds.load_partition_indices()
        for ktk_cube_dataset_id, ds in datasets.items()
    }
    indexed_columns = _get_indexed_columns(datasets)

    intention = determine_intention(
        cube=cube,
        datasets=datasets,
        dimension_columns=dimension_columns,
        partition_by=partition_by,
        conditions=conditions,
        payload_columns=payload_columns,
        indexed_columns=indexed_columns,
    )

    datasets = _load_required_explicit_indices(datasets, intention, store)

    restrictive_dataset_ids = _determine_restrictive_dataset_ids(
        cube=cube, datasets=datasets, intention=intention
    )

    load_columns = _dermine_load_columns(
        cube=cube, datasets=datasets, intention=intention
    )

    datasets = _filter_relevant_datasets(datasets=datasets, load_columns=load_columns)

    empty_df = {
        ktk_cube_dataset_id: _reduce_empty_dtype_sizes(
            empty_dataframe_from_schema(
                schema=ds.table_meta[SINGLE_TABLE],
                columns=sorted(
                    get_dataset_columns(ds) & set(load_columns[ktk_cube_dataset_id])
                ),
            )
        )
        for ktk_cube_dataset_id, ds in datasets.items()
    }

    empty_df_single = empty_df[cube.seed_dataset].copy()
    for k, df in empty_df.items():
        if k == cube.seed_dataset:
            continue
        if empty_df_single is None:
            empty_df_single = df.copy()
        else:
            empty_df_single = empty_df_single.merge(df)
    empty_df_single = empty_df_single[list(intention.output_columns)]

    groups = regroup(
        intention,
        cube=cube,
        datasets=datasets,
        empty_df=empty_df,
        indexed_columns=indexed_columns,
        load_columns=load_columns,
        restrictive_dataset_ids=restrictive_dataset_ids,
    )
    return intention, empty_df_single, groups
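
A minimal sketch of how a backend might consume the planner's output; `cube` and `store` are assumed to exist, and only the documented return values (`intention`, `empty_df`, `groups`) are touched:

# Plan a query over all datasets of the cube, with no conditions or partition_by.
intention, empty_df, groups = plan_query(
    conditions=None,
    cube=cube,          # assumed Cube instance
    datasets=None,      # None => discover all datasets belonging to the cube
    dimension_columns=None,
    partition_by=None,
    payload_columns=None,
    store=store,        # assumed simplekv store (or a callable returning one)
)

print(intention.output_columns)  # columns of the final result dataframe
print(empty_df.dtypes)           # dtypes the result will have, even if empty
print(len(groups))               # number of query groups the backend must execute
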
Example #3
def test_schema_dataframe_roundtrip(index, df_all_types):
    df = pd.DataFrame(df_all_types, index=index)

    schema = make_meta(df, origin="1")
    actual_df = empty_dataframe_from_schema(schema, date_as_object=True)
    validate_compatible([schema, make_meta(actual_df, origin="2")])
Example #4
def test_empty_dataframe_from_schema_columns(df_all_types):
    schema = make_meta(df_all_types, origin="1")
    actual_df = empty_dataframe_from_schema(schema, ["uint64", "int64"])

    expected_df = df_all_types.loc[[], ["uint64", "int64"]]
    pdt.assert_frame_equal(actual_df, expected_df)
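
A self-contained sketch of what the two tests above exercise, using a tiny hand-built frame instead of the `df_all_types` fixture (column names are made up):

import pandas as pd

df = pd.DataFrame({"int64": [1, 2], "str": ["a", "b"]})

# Schema -> empty dataframe -> schema roundtrip, as in test_schema_dataframe_roundtrip.
schema = make_meta(df, origin="manual")
empty = empty_dataframe_from_schema(schema, date_as_object=True)
assert len(empty) == 0
assert set(empty.columns) == {"int64", "str"}
validate_compatible([schema, make_meta(empty, origin="roundtrip")])

# Column selection, as in test_empty_dataframe_from_schema_columns: only the
# requested columns come back, in the requested order.
subset = empty_dataframe_from_schema(schema, ["str", "int64"])
assert list(subset.columns) == ["str", "int64"]
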
Example #5
def read_table(
    dataset_uuid=None,
    store=None,
    table=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    predicates=None,
    factory=None,
):
    """
    A utility function to load a single table with multiple partitions as a single dataframe in one go.
    Mostly useful for smaller tables or datasets where all partitions fit into memory.

    The order of partitions is not guaranteed to be stable in the resulting dataframe.

    Parameters
    ----------
    table: str
        The table to be loaded
    columns: List[str]
        The columns to be loaded
    categoricals: List[str]
        A list of column names which should be retrieved as `pandas.Categorical`

    Returns
    -------
    pandas.DataFrame
        Returns a pandas.DataFrame holding the data of the requested columns

    Examples
    --------
    Dataset in store contains two partitions with two files each

    .. code ::

        >>> import storefact
        >>> from kartothek.io.eager import read_table

        >>> store = storefact.get_store_from_url('s3://bucket_with_dataset')

        >>> df = read_table(dataset_uuid='dataset_uuid', store=store, table='core')

    """

    if table is None:
        raise TypeError("Parameter `table` is not optional.")
    if not isinstance(table, str):
        raise TypeError("Argument `table` needs to be a string")

    columns = _check_compatible_list(table, columns, "columns")
    categoricals = _check_compatible_list(table, categoricals, "categoricals")

    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=_make_callable(store),
        factory=factory,
        load_dataset_metadata=False,
    )
    partitions = read_dataset_as_dataframes(
        tables=[table],
        columns=columns,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        categoricals=categoricals,
        label_filter=label_filter,
        dates_as_object=dates_as_object,
        predicates=predicates,
        factory=ds_factory,
    )

    empty_df = empty_dataframe_from_schema(
        schema=ds_factory.table_meta[table],
        columns=columns[table] if columns is not None else None,
    )
    dfs = [partition_data[table] for partition_data in partitions] + [empty_df]
    # The trailing empty dataframe acts as meta information: without it, the
    # result's types/columns could not be constructed when there are no partitions.
    if categoricals:
        dfs = align_categories(dfs, categoricals[table])
    df = pd.concat(dfs, ignore_index=True, sort=False)

    # ensure column order
    if len(empty_df.columns) > 0:
        df = df.reindex(empty_df.columns, copy=False, axis=1)

    return df
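
In this untyped variant the docstring documents `columns` and `categoricals` as plain lists, while the body indexes them per table; presumably `_check_compatible_list` wraps a plain list into a single-table mapping. A sketch under that assumption (store URL, dataset UUID, and column names are hypothetical):

import storefact
from kartothek.io.eager import read_table

store = storefact.get_store_from_url("hfs:///tmp/kartothek_example")  # hypothetical store

# Plain lists are scoped to the single requested table (assumption, see above);
# "country" comes back as a pandas.Categorical.
df = read_table(
    dataset_uuid="dataset_uuid",
    store=store,
    table="core",
    columns=["country", "value"],
    categoricals=["country"],
)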