def determine_intention(
    cube,
    datasets,
    dimension_columns,
    partition_by,
    conditions,
    payload_columns,
    indexed_columns,
):
    """
    Determine and check user intention during the query process.

    Parameters
    ----------
    cube: Cube
        Cube specification.
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that are present.
    dimension_columns: Optional[Iterable[str]]
        Dimension columns of the query, may result in projection.
    partition_by: Optional[Iterable[str]]
        By which column logical partitions should be formed.
    conditions: Union[None, Condition, Iterable[Condition], Conjunction]
        Conditions that should be applied.
    payload_columns: Optional[Iterable[str]]
        Which columns apart from ``dimension_columns`` and ``partition_by`` should be
        returned from the query.
    indexed_columns: Dict[str, Set[str]]
        Indexed columns per ktk_cube dataset ID.

    Returns
    -------
    intention: QueryIntention
        Checked and filled-in intention of the user.
    """
    all_available_columns = set(
        itertools.chain.from_iterable(
            [get_dataset_columns(ds) for ds in datasets.values()]
        )
    )

    dimension_columns = _process_dimension_columns(
        dimension_columns=dimension_columns, cube=cube
    )
    partition_by = _process_partition_by(
        partition_by=partition_by,
        cube=cube,
        all_available_columns=all_available_columns,
        indexed_columns=indexed_columns,
    )

    conditions_pre, conditions_post = _process_conditions(
        conditions=conditions,
        cube=cube,
        datasets=datasets,
        all_available_columns=all_available_columns,
        indexed_columns=indexed_columns,
    )

    payload_columns = _process_payload(
        payload_columns=payload_columns,
        all_available_columns=all_available_columns,
        cube=cube,
    )
    output_columns = tuple(
        sorted(
            set(partition_by)
            | set(dimension_columns)
            | set(payload_columns)
            | set(cube.partition_columns)
        )
    )

    return QueryIntention(
        dimension_columns=dimension_columns,
        partition_by=partition_by,
        conditions_pre=conditions_pre,
        conditions_post=conditions_post,
        output_columns=output_columns,
    )
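
# A minimal usage sketch (not part of the module): it assumes a ``cube``, its
# ``datasets`` and the per-dataset ``indexed_columns`` mapping have already been
# loaded elsewhere, and shows how raw query arguments are resolved into a checked
# ``QueryIntention``. The column names are purely illustrative; the ``C`` condition
# builder is assumed to come from ``kartothek.core.cube.conditions``.
def _example_determine_intention(cube, datasets, indexed_columns):
    from kartothek.core.cube.conditions import C

    intention = determine_intention(
        cube=cube,
        datasets=datasets,
        dimension_columns=["x"],            # project onto a single dimension column
        partition_by=None,                  # no additional logical partitioning
        conditions=(C("country") == "DE"),  # single condition, normalized internally
        payload_columns=["value"],          # extra column to return
        indexed_columns=indexed_columns,
    )
    # ``output_columns`` is the sorted union of dimension, partition_by, payload
    # and physical partition columns.
    return intention.output_columns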
def _check_indices(datasets: Dict[str, DatasetMetadata], cube: Cube) -> None:
    """
    Check if required indices are present in the given datasets.

    For all datasets the primary indices must be equal to ``ds.partition_keys``. For the seed dataset, secondary
    indices for all dimension columns except ``cube.suppress_index_on`` are expected.

    Additional indices are accepted and will not be reported as an error.

    Parameters
    ----------
    datasets
        Datasets.
    cube
        Cube specification.

    Raises
    ------
    ValueError
        In case indices are broken.
    """
    for ktk_cube_dataset_id in sorted(datasets.keys()):
        ds = datasets[ktk_cube_dataset_id]
        primary_indices = ds.partition_keys
        columns = get_dataset_columns(ds)

        secondary_indices = set()
        any_indices = set(cube.index_columns) & columns

        if ktk_cube_dataset_id == cube.seed_dataset:
            secondary_indices |= set(cube.dimension_columns) - set(
                cube.suppress_index_on
            )

        for types_untyped, elements in (
            ((PartitionIndex,), primary_indices),
            ((ExplicitSecondaryIndex,), secondary_indices),
            ((ExplicitSecondaryIndex, PartitionIndex), any_indices),
        ):
            types = cast(Tuple[type, ...], types_untyped)
            tname = " or ".join(t.__name__ for t in types)

            # it seems that partition indices are not always present (e.g. for empty datasets), so add partition
            # keys to the set
            indices = cast(Dict[str, Union[IndexBase, str]], copy(ds.indices))
            if PartitionIndex in types:
                for pk in ds.partition_keys:
                    if pk not in indices:
                        indices[pk] = "dummy"

            for e in sorted(elements):
                if e not in indices:
                    raise ValueError(
                        '{tname} "{e}" is missing in dataset "{ktk_cube_dataset_id}".'.format(
                            tname=tname, e=e, ktk_cube_dataset_id=ktk_cube_dataset_id
                        )
                    )

                idx = indices[e]
                t2 = type(idx)
                tname2 = t2.__name__
                if (idx != "dummy") and (not isinstance(idx, types)):
                    raise ValueError(
                        '"{e}" in dataset "{ktk_cube_dataset_id}" is of type {tname2} but should be {tname}.'.format(
                            tname=tname,
                            tname2=tname2,
                            e=e,
                            ktk_cube_dataset_id=ktk_cube_dataset_id,
                        )
                    )
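
# A hedged illustration (function and dataset names are hypothetical): the check
# only inspects metadata, so if the seed dataset lacks an ExplicitSecondaryIndex
# for a dimension column that is not listed in ``cube.suppress_index_on``, a
# ValueError naming the expected index type is raised.
def _example_check_indices(cube, datasets):
    try:
        _check_indices(datasets, cube)
    except ValueError as exc:
        # e.g. 'ExplicitSecondaryIndex "x" is missing in dataset "seed".'
        return str(exc)
    return None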
def _process_conditions(
    conditions, cube, datasets, all_available_columns, indexed_columns
):
    """
    Process and check given query conditions.

    Parameters
    ----------
    conditions: Union[None, Condition, Iterable[Condition], Conjunction]
        Conditions that should be applied.
    cube: Cube
        Cube specification.
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that are present.
    all_available_columns: Set[str]
        All columns that are available for query.
    indexed_columns: Dict[str, Set[str]]
        Indexed columns per ktk_cube dataset ID.

    Returns
    -------
    conditions_pre: Dict[str, Conjunction]
        Conditions to be applied based on the index data alone.
    conditions_post: Dict[str, Conjunction]
        Conditions to be applied during the load process.

    Raises
    ------
    TypeError
        In case of a wrong type.
    """
    conditions = Conjunction(conditions)

    condition_columns = conditions.columns
    missing = condition_columns - all_available_columns
    if missing:
        raise ValueError(
            "Following condition columns are required but are missing from the cube: {missing}".format(
                missing=", ".join(sorted(missing))
            )
        )
    _test_condition_types(conditions, datasets)

    conditions_split = conditions.split_by_column()

    conditions_pre = {}
    for ktk_cube_dataset_id, ds in datasets.items():
        candidate_cols = indexed_columns[ktk_cube_dataset_id]
        if not candidate_cols:
            continue

        filtered = [
            conj for col, conj in conditions_split.items() if col in candidate_cols
        ]
        if not filtered:
            continue

        conditions_pre[ktk_cube_dataset_id] = reduce(Conjunction.from_two, filtered)

    conditions_post = {}
    for ktk_cube_dataset_id, ds in datasets.items():
        candidate_cols = (get_dataset_columns(ds) & condition_columns) - set(
            cube.partition_columns
        )
        if not candidate_cols:
            continue

        filtered = [
            conj for col, conj in conditions_split.items() if col in candidate_cols
        ]
        if not filtered:
            continue

        conditions_post[ktk_cube_dataset_id] = reduce(Conjunction.from_two, filtered)

    return conditions_pre, conditions_post
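
# A sketch of how conditions are routed (dataset and column names are made up, and
# the ``C`` builder from ``kartothek.core.cube.conditions`` is assumed): conditions
# on columns that a dataset indexes become "pre" conditions for that dataset
# (evaluated on index data alone), while conditions on non-partition columns that a
# dataset physically carries become "post" conditions applied while loading.
def _example_process_conditions(cube, datasets, all_available_columns, indexed_columns):
    from kartothek.core.cube.conditions import C

    # The docstring allows an iterable of conditions; it is wrapped into a Conjunction.
    conditions = [C("country") == "DE", C("value") > 0.0]
    conditions_pre, conditions_post = _process_conditions(
        conditions=conditions,
        cube=cube,
        datasets=datasets,
        all_available_columns=all_available_columns,
        indexed_columns=indexed_columns,
    )
    # E.g. the "country" condition lands in ``conditions_pre`` for every dataset that
    # lists "country" among its indexed columns, while the "value" condition lands in
    # ``conditions_post`` for the datasets whose payload contains "value".
    return conditions_pre, conditions_post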