def test_empty_real(self):
    conj = Conjunction([])
    assert conj.conditions == ()
    assert str(conj) == ""
    assert conj.columns == set()
    assert conj.predicate is None
    assert conj.split_by_column() == {}
def test_empty_pseudo(self):
    cond = InIntervalCondition("x")
    conj = Conjunction([cond])
    assert conj.conditions == (cond,)
    assert str(conj) == "(x.in_interval(None, None))"
    assert conj.columns == set()
    assert conj.predicate is None
    assert conj.split_by_column() == {}
def apply_condition_unsafe(df, cond):
    # For the sparse_outer testset, the test_df has the wrong datatype because we cannot encode missing integer data
    # in pandas.
    #
    # The condition will not be applicable to the DF because the DF has floats while conditions have ints. We fix
    # that by modifying the condition.
    #
    # In case there is no missing data because of the right conditions, kartothek will return integer data.
    # assert_frame_equal will then complain about this. So in case there is no missing data, let's recover the
    # correct dtype here.
    if not isinstance(cond, Conjunction):
        cond = Conjunction(cond)

    float_cols = {col for col in df.columns if df[col].dtype == float}

    # convert int conditions on float columns to float conditions
    cond2 = Conjunction([])
    for col, conj in cond.split_by_column().items():
        if col in float_cols:
            parts = []
            for part in conj.conditions:
                if isinstance(part, IsInCondition):
                    part = IsInCondition(
                        column=part.column, value=tuple(float(v) for v in part.value)
                    )
                elif isinstance(part, InIntervalCondition):
                    part = InIntervalCondition(
                        column=part.column,
                        start=float(part.start),
                        stop=float(part.stop),
                    )
                else:
                    part = part.__class__(column=part.column, value=float(part.value))
                parts.append(part)
            conj = Conjunction(parts)

        cond2 &= conj

    # apply conditions
    df = cond2.filter_df(df).reset_index(drop=True)

    # convert float columns without missing data back to int columns
    for col in df.columns:
        if df[col].notnull().all():
            dtype = df[col].dtype
            if dtype == np.float64:
                dtype = np.int64
            elif dtype == np.float32:
                dtype = np.int32
            elif dtype == np.float16:
                dtype = np.int16

            df[col] = df[col].astype(dtype)

    return df
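# Hedged sketch (illustrative only, not part of the original helpers): shows the round-trip that
# apply_condition_unsafe performs -- integer-typed condition bounds are relaxed to float so they match
# the float-typed test DataFrame, and fully populated float columns are cast back to integers after
# filtering. The column name "x" and the sample values are assumptions made up for this example.
def _apply_condition_unsafe_example():
    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"x": [1.0, 2.0, 3.0]})  # float column standing in for "int with missing data"
    cond = Conjunction([InIntervalCondition(column="x", start=1, stop=3)])  # int bounds, as a query would use

    result = apply_condition_unsafe(df, cond)

    # the bounds were converted to float before filtering; the filtered, gap-free column is cast back to int
    assert result["x"].dtype == np.int64
    return result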
def prepare_metapartitions_for_removal_action(
    cube, store, conditions, ktk_cube_dataset_ids, existing_datasets
):
    """
    Prepare MetaPartition to express removal of given data range from cube.

    The MetaPartition must still be written using ``mp.store_dataframes(...)`` and added to the Dataset using a
    kartothek update method.

    Parameters
    ----------
    cube: kartothek.core.cube.cube.Cube
        Cube spec.
    store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        Store.
    conditions: Union[None, Condition, Iterable[Condition], Conjunction]
        Conditions that should be applied, optional. Defaults to "entire cube".
    ktk_cube_dataset_ids: Optional[Union[Iterable[str], str]]
        Ktk_cube dataset IDs to apply the remove action to, optional. Defaults to "all".
    existing_datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Existing datasets.

    Returns
    -------
    metapartitions: Dict[str, Tuple[kartothek.core.dataset.DatasetMetadata, kartothek.io_components.metapartition.MetaPartition, List[Dict[str, Any]]]]
        MetaPartitions that should be written and used to update the kartothek datasets, as well as the
        ``delete_scope`` for kartothek.
    """
    conditions = Conjunction(conditions)
    conditions_split = conditions.split_by_column()
    if set(conditions_split.keys()) - set(cube.partition_columns):
        raise ValueError(
            "Can only remove partitions with conditions concerning the cube's physical partition columns."
        )

    ktk_cube_dataset_ids = converter_str_set_optional(ktk_cube_dataset_ids)
    if ktk_cube_dataset_ids is not None:
        unknown_dataset_ids = ktk_cube_dataset_ids - set(existing_datasets.keys())
        if unknown_dataset_ids:
            raise ValueError(
                "Unknown ktk_cube_dataset_ids: {}".format(
                    ", ".join(sorted(unknown_dataset_ids))
                )
            )
    else:
        ktk_cube_dataset_ids = set(existing_datasets.keys())

    metapartitions = {}
    for ktk_cube_dataset_id in ktk_cube_dataset_ids:
        ds = existing_datasets[ktk_cube_dataset_id]
        ds = ds.load_partition_indices()
        mp = _prepare_mp_empty(ds)

        if not ds.partition_keys:
            # no partition keys --> delete all
            delete_scope = [{}]
        else:
            df_partitions = get_partition_dataframe(dataset=ds, cube=cube)
            df_partitions = df_partitions.drop_duplicates()

            local_condition = reduce(
                lambda a, b: a & b,
                (
                    cond
                    for col, cond in conditions_split.items()
                    if col in df_partitions.columns
                ),
                Conjunction([]),
            )
            df_partitions = local_condition.filter_df(df_partitions)

            delete_scope = df_partitions.to_dict(orient="records")

        metapartitions[ktk_cube_dataset_id] = (ds, mp, delete_scope)

    return metapartitions
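# Hedged sketch (illustrative only, not part of the library): mirrors how the loop above turns a
# condition on a physical partition column into a kartothek ``delete_scope`` -- the distinct partition
# values are filtered with the conjunction and the surviving rows become the delete records. The
# column name "day" and the sample values are assumptions made up for this example.
def _delete_scope_sketch():
    import pandas as pd

    from kartothek.core.cube.conditions import C, Conjunction

    df_partitions = pd.DataFrame({"day": [1, 2, 3]}).drop_duplicates()
    local_condition = Conjunction([C("day") >= 2])

    # each remaining row describes one partition to delete, e.g. [{"day": 2}, {"day": 3}]
    return local_condition.filter_df(df_partitions).to_dict(orient="records")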
def _process_conditions(
    conditions, cube, datasets, all_available_columns, indexed_columns
):
    """
    Process and check given query conditions.

    Parameters
    ----------
    conditions: Union[None, Condition, Iterable[Condition], Conjunction]
        Conditions that should be applied.
    cube: Cube
        Cube specification.
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that are present.
    all_available_columns: Set[str]
        All columns that are available for query.
    indexed_columns: Dict[str, Set[str]]
        Indexed columns per ktk_cube dataset ID.

    Returns
    -------
    conditions_pre: Dict[str, kartothek.core.cube.conditions.Conjunction]
        Conditions to be applied based on the index data alone.
    conditions_post: Dict[str, kartothek.core.cube.conditions.Conjunction]
        Conditions to be applied during the load process.

    Raises
    ------
    TypeError: In case of a wrong type.
    """
    conditions = Conjunction(conditions)

    condition_columns = conditions.columns
    missing = condition_columns - all_available_columns
    if missing:
        raise ValueError(
            "Following condition columns are required but are missing from the cube: {missing}".format(
                missing=", ".join(sorted(missing))
            )
        )
    _test_condition_types(conditions, datasets)

    conditions_split = conditions.split_by_column()

    conditions_pre = {}
    for ktk_cube_dataset_id, ds in datasets.items():
        candidate_cols = indexed_columns[ktk_cube_dataset_id]
        if not candidate_cols:
            continue

        filtered = [
            conj for col, conj in conditions_split.items() if col in candidate_cols
        ]
        if not filtered:
            continue

        conditions_pre[ktk_cube_dataset_id] = reduce(Conjunction.from_two, filtered)

    conditions_post = {}
    for ktk_cube_dataset_id, ds in datasets.items():
        candidate_cols = (get_dataset_columns(ds) & condition_columns) - set(
            cube.partition_columns
        )
        if not candidate_cols:
            continue

        filtered = [
            conj for col, conj in conditions_split.items() if col in candidate_cols
        ]
        if not filtered:
            continue

        conditions_post[ktk_cube_dataset_id] = reduce(Conjunction.from_two, filtered)

    return conditions_pre, conditions_post
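# Hedged sketch (illustrative only, not part of the library): demonstrates the split/recombine pattern
# used above. ``split_by_column`` breaks the user conjunction into per-column conjunctions, and the
# conjunctions matching a dataset's candidate columns are merged back via ``Conjunction.from_two``.
# The column names "x" and "y" and the candidate set are assumptions made up for this example.
def _split_and_recombine_sketch():
    from functools import reduce

    from kartothek.core.cube.conditions import C, Conjunction

    conditions = Conjunction([C("x") > 0, C("y") == "a"])
    conditions_split = conditions.split_by_column()

    candidate_cols = {"x"}  # e.g. the indexed columns of one ktk_cube dataset
    filtered = [conj for col, conj in conditions_split.items() if col in candidate_cols]

    # a single Conjunction containing only the condition on "x"
    return reduce(Conjunction.from_two, filtered)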