Example #1
def apply_postwrite_checks(datasets, cube, store, existing_datasets):
    """
    Apply sanity checks that can only be done after Kartothek has written its datasets.

    Parameters
    ----------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that just got written.
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store: Union[Callable[[], simplekv.KeyValueStore], simplekv.KeyValueStore]
        KV store.
    existing_datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that were present before the write procedure started.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that just got written.

    Raises
    ------
    ValueError
        If a sanity check fails.
    """
    try:
        empty_datasets = {
            ktk_cube_dataset_id
            for ktk_cube_dataset_id, ds in datasets.items()
            if len(ds.partitions) == 0
        }

        if empty_datasets:
            raise ValueError(
                "Cannot write empty datasets: {empty_datasets}".format(
                    empty_datasets=", ".join(sorted(empty_datasets))))

        datasets_to_check = copy(existing_datasets)
        datasets_to_check.update(datasets)
        check_datasets(datasets_to_check, cube)
    except Exception as e:
        _rollback_transaction(existing_datasets=existing_datasets,
                              new_datasets=datasets,
                              store=store)

        raise MultiTableCommitAborted(
            "Post commit check failed. Operation rolled back.") from e

    return datasets
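The following self-contained sketch distills the same commit-check/rollback pattern into a toy example; every name in it (CommitAborted, check_no_empty, commit_with_checks) is an illustrative stand-in, not a kartothek API.

class CommitAborted(RuntimeError):
    """Stand-in for kartothek's MultiTableCommitAborted."""


def check_no_empty(datasets):
    # Mirrors the empty-dataset check above, with plain dicts of partition
    # lists standing in for DatasetMetadata objects.
    empty = sorted(k for k, parts in datasets.items() if not parts)
    if empty:
        raise ValueError("Cannot write empty datasets: " + ", ".join(empty))


def commit_with_checks(new_datasets, existing_datasets, rollback):
    try:
        check_no_empty({**existing_datasets, **new_datasets})
    except Exception as e:
        rollback(new_datasets)  # undo only what this write added
        raise CommitAborted(
            "Post commit check failed. Operation rolled back.") from e
    return new_datasets


written = {"seed": ["part-0"], "enrich": []}  # "enrich" was written empty
try:
    commit_with_checks(written, existing_datasets={},
                       rollback=lambda ds: print("rolling back", sorted(ds)))
except CommitAborted as exc:
    print(exc, "| caused by:", exc.__cause__)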
def discover_datasets(
    cube: Cube,
    store: StoreInput,
    filter_ktk_cube_dataset_ids: Optional[Union[str, Iterable[str]]] = None,
) -> Dict[str, DatasetMetadata]:
    """
    Get all known datasets that belong to a given cube.

    Parameters
    ----------
    cube
        Cube specification.
    store
        KV store.
    filter_ktk_cube_dataset_ids
        Optional selection of datasets to include.

    Returns
    -------
    datasets: Dict[str, DatasetMetadata]
        All discovered datasets.

    Raises
    ------
    ValueError
        In case no valid cube could be discovered.
    """
    filter_ktk_cube_dataset_ids = converter_str_set_optional(
        filter_ktk_cube_dataset_ids
    )
    result = discover_datasets_unchecked(
        cube.uuid_prefix, store, filter_ktk_cube_dataset_ids
    )
    if filter_ktk_cube_dataset_ids is not None:
        if isinstance(filter_ktk_cube_dataset_ids, str):
            filter_ktk_cube_dataset_ids = {filter_ktk_cube_dataset_ids}
        else:
            filter_ktk_cube_dataset_ids = set(filter_ktk_cube_dataset_ids)
        missing = filter_ktk_cube_dataset_ids - set(result.keys())
        if missing:
            raise ValueError(
                "Could not find the following requested datasets: {missing}".format(
                    missing=", ".join(sorted(missing))
                )
            )
    check_datasets(result, cube)

    return result
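A brief usage sketch follows; the in-memory store, the cube definition, and the dataset ids are assumptions for illustration and presume the cube has already been written to the store.

from simplekv.memory import DictStore

from kartothek.core.cube.cube import Cube

store = DictStore()  # illustrative in-memory simplekv store
cube = Cube(
    dimension_columns=["x"],
    partition_columns=["p"],
    uuid_prefix="my_cube",
)

# Raises ValueError if "seed" or "enrich" is missing or the cube is invalid.
datasets = discover_datasets(
    cube, store, filter_ktk_cube_dataset_ids={"seed", "enrich"})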
Example #3
def discover_datasets(cube, store, filter_ktk_cube_dataset_ids=None):
    """
    Get all known datasets that belong to a given cube.

    Parameters
    ----------
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store: Union[Callable[[], simplekv.KeyValueStore], simplekv.KeyValueStore]
        KV store.
    filter_ktk_cube_dataset_ids: Union[None, str, Iterable[str]]
        Optional selection of datasets to include.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        All discovered datasets.

    Raises
    ------
    ValueError
        In case no valid cube could be discovered.
    """
    filter_ktk_cube_dataset_ids = converter_str_set_optional(
        filter_ktk_cube_dataset_ids)
    result = discover_datasets_unchecked(cube.uuid_prefix, store,
                                         filter_ktk_cube_dataset_ids)
    if filter_ktk_cube_dataset_ids is not None:
        missing = filter_ktk_cube_dataset_ids - set(result.keys())
        if missing:
            raise ValueError(
                "Could not find the following requested datasets: {missing}".
                format(missing=", ".join(sorted(missing))))
    check_datasets(result, cube)

    return result
Example #4
def discover_cube(
    uuid_prefix: str,
    store: Union[Callable[[], KeyValueStore], KeyValueStore],
    filter_ktk_cube_dataset_ids: Optional[Union[str, Iterable[str]]] = None,
) -> Tuple[Cube, Dict[str, DatasetMetadata]]:
    """
    Recover cube information from store.

    Parameters
    ----------
    uuid_prefix
        Dataset UUID prefix.
    store
        KV store.
    filter_ktk_cube_dataset_ids
        Optional selection of datasets to include.

    Returns
    -------
    cube: Cube
        Cube specification.
    datasets: Dict[str, DatasetMetadata]
        All discovered datasets.
    """
    datasets = discover_datasets_unchecked(uuid_prefix, store,
                                           filter_ktk_cube_dataset_ids)

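    # A dataset is a seed candidate if it carries the current
    # KTK_CUBE_METADATA_KEY_IS_SEED flag or the legacy "klee_is_seed" flag.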
    seed_candidates = {
        ktk_cube_dataset_id
        for ktk_cube_dataset_id, ds in datasets.items()
        if ds.metadata.get(KTK_CUBE_METADATA_KEY_IS_SEED,
                           ds.metadata.get("klee_is_seed", False))
    }
    if len(seed_candidates) == 0:
        raise ValueError(
            'Could not find seed dataset for cube "{uuid_prefix}".'.format(
                uuid_prefix=uuid_prefix))
    elif len(seed_candidates) > 1:
        raise ValueError(
            'Found multiple possible seed datasets for cube "{uuid_prefix}": {seed_candidates}'
            .format(
                uuid_prefix=uuid_prefix,
                seed_candidates=", ".join(sorted(seed_candidates)),
            ))
    seed_dataset = list(seed_candidates)[0]

    seed_ds = datasets[seed_dataset]
    dimension_columns = seed_ds.metadata.get(
        KTK_CUBE_METADATA_DIMENSION_COLUMNS,
        seed_ds.metadata.get("klee_dimension_columns"),
    )
    if dimension_columns is None:
        raise ValueError(
            'Could not recover dimension columns from seed dataset ("{seed_dataset}") of cube "{uuid_prefix}".'
            .format(seed_dataset=seed_dataset, uuid_prefix=uuid_prefix))

    # datasets written with new kartothek versions (after merge of PR#7747)
    # always set KTK_CUBE_METADATA_PARTITION_COLUMNS and "klee_timestamp_column" in the metadata.
    # Older versions of ktk_cube do not write these; instead, these columns are inferred from
    # the actual partitioning: partition_columns are all but the last partition key
    #
    # TODO: once we're sure we have re-written all kartothek cubes, the code
    # in the branch `if partition_columns is None` below can be removed.
    #
    # read the now unused timestamp column just to make sure we can still read older cubes.
    #
    # TODO: once all cubes are re-created and don't use timestamp column anymore, remove the timestamp column handling
    #       entirely
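    # Illustrative example (assumed values, not from a real cube): a legacy seed
    # dataset partitioned by ["country", "city", "KLEE_TS"] is recovered as
    # partition_columns == ["country", "city"], timestamp_column == "KLEE_TS".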
    partition_columns = seed_ds.metadata.get(
        KTK_CUBE_METADATA_PARTITION_COLUMNS,
        seed_ds.metadata.get("klee_partition_columns"),
    )
    timestamp_column = seed_ds.metadata.get("klee_timestamp_column")

    if partition_columns is None:
        # infer the partition columns and timestamp column from the actual partitioning:
        partition_keys = seed_ds.partition_keys
        if len(partition_keys) == 0:
            raise ValueError(
                'Seed dataset ("{seed_dataset}") has no partition keys.'.format(
                    seed_dataset=seed_dataset))
        elif len(partition_keys) < 2:
            raise ValueError((
                'Seed dataset ("{seed_dataset}") has only a single partition key ({partition_key}) '
                "but should have at least 2.").format(
                    seed_dataset=seed_dataset,
                    partition_key=partition_keys[0]))
        partition_columns = partition_keys[:-1]
        timestamp_column = partition_keys[-1]

    index_columns = set()
    for ds in datasets.values():
        index_columns |= set(ds.indices.keys()) - (set(dimension_columns)
                                                   | set(partition_columns)
                                                   | {timestamp_column})

    # we only support the default timestamp column in the compat code
    if (timestamp_column is not None) and (timestamp_column != "KLEE_TS"):
        raise NotImplementedError(
            f"Can only read old cubes if the timestamp column is 'KLEE_TS', but '{timestamp_column}' was detected."
        )

    cube = Cube(
        uuid_prefix=uuid_prefix,
        dimension_columns=dimension_columns,
        partition_columns=partition_columns,
        index_columns=index_columns,
        seed_dataset=seed_dataset,
    )

    datasets = check_datasets(datasets, cube)
    return cube, datasets
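A hedged usage sketch; the in-memory store and the existence of a previously written cube with UUID prefix "my_cube" are assumptions.

from simplekv.memory import DictStore

store = DictStore()  # illustrative; in practice this already holds a cube
cube, datasets = discover_cube("my_cube", store)
print(cube.seed_dataset, cube.dimension_columns, sorted(datasets))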
def plan_query(
    conditions,
    cube,
    datasets,
    dimension_columns,
    partition_by,
    payload_columns,
    store,
):
    """
    Plan cube query execution.

    .. important::
        If the intention does not contain a partition-by, this function partitions by the cube partition columns to
        speed up the query on parallel backends. In that case, the backend must concat and check the resulting
        dataframes before passing them to the user.

    Parameters
    ----------
    conditions: Union[None, Condition, Iterable[Condition], Conjunction]
        Conditions that should be applied.
    cube: Cube
        Cube specification.
    datasets: Union[None, Iterable[str], Dict[str, kartothek.core.dataset.DatasetMetadata]]
        Datasets to query, must all be part of the cube.
    dimension_columns: Optional[Iterable[str]]
        Dimension columns of the query, may result in projection.
    partition_by: Optional[Iterable[str]]
        By which column logical partitions should be formed.
    payload_columns: Optional[Iterable[str]]
        Which columns apart from ``dimension_columns`` and ``partition_by`` should be returned.
    store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        Store to query from.

    Returns
    -------
    intent: QueryIntention
        Query intention.
    empty_df: pandas.DataFrame
        Empty DataFrame representing the output types.
    groups: Tuple[QueryGroup]
        Tuple of query groups. May be empty.
    """
    if callable(store):
        store = store()

    if not isinstance(datasets, dict):
        datasets = discover_datasets(cube=cube,
                                     store=store,
                                     filter_ktk_cube_dataset_ids=datasets)
    else:
        datasets = check_datasets(datasets, cube)

    datasets = {
        ktk_cube_dataset_id: ds.load_partition_indices()
        for ktk_cube_dataset_id, ds in datasets.items()
    }
    indexed_columns = _get_indexed_columns(datasets)

    intention = determine_intention(
        cube=cube,
        datasets=datasets,
        dimension_columns=dimension_columns,
        partition_by=partition_by,
        conditions=conditions,
        payload_columns=payload_columns,
        indexed_columns=indexed_columns,
    )

    datasets = _load_required_explicit_indices(datasets, intention, store)

    restrictive_dataset_ids = _determine_restrictive_dataset_ids(
        cube=cube, datasets=datasets, intention=intention)

    load_columns = _dermine_load_columns(cube=cube,
                                         datasets=datasets,
                                         intention=intention)

    datasets = _filter_relevant_datasets(datasets=datasets,
                                         load_columns=load_columns)

    empty_df = {
        ktk_cube_dataset_id: _reduce_empty_dtype_sizes(
            empty_dataframe_from_schema(
                schema=ds.schema,
                columns=sorted(
                    get_dataset_columns(ds)
                    & set(load_columns[ktk_cube_dataset_id])),
            ))
        for ktk_cube_dataset_id, ds in datasets.items()
    }

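    # Merge the per-dataset empty frames into a single empty frame that carries
    # the dtypes of all output columns of the query (seed dataset first).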
    empty_df_single = empty_df[cube.seed_dataset].copy()
    for k, df in empty_df.items():
        if k == cube.seed_dataset:
            continue
        if empty_df_single is None:
            empty_df_single = df.copy()
        else:
            empty_df_single = empty_df_single.merge(df)
    empty_df_single = empty_df_single[list(intention.output_columns)]

    groups = regroup(
        intention,
        cube=cube,
        datasets=datasets,
        empty_df=empty_df,
        indexed_columns=indexed_columns,
        load_columns=load_columns,
        restrictive_dataset_ids=restrictive_dataset_ids,
    )
    return intention, empty_df_single, groups
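Finally, a hedged sketch of how a minimal serial backend might drive this planner; `cube` and `store` are assumed to refer to an existing kartothek cube, and process_group is a placeholder because per-group loading is backend-specific.

intention, empty_df, groups = plan_query(
    conditions=None,
    cube=cube,
    datasets=None,  # let the planner discover all datasets of the cube
    dimension_columns=None,
    partition_by=None,
    payload_columns=None,
    store=store,
)

if not groups:
    dfs = [empty_df]  # nothing matched: keep the typed empty frame
else:
    dfs = [process_group(group) for group in groups]  # placeholder per-group load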