def test_stresstest_index_select_row(driver, function_store):
    n_indices = 100
    n_rows = 1000

    data = {"x": np.arange(n_rows), "p": 0}
    for i in range(n_indices):
        data["i{}".format(i)] = np.arange(n_rows)
    df = pd.DataFrame(data)

    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        index_columns=["i{}".format(i) for i in range(n_indices)],
    )

    build_cube(data=df, cube=cube, store=function_store)

    conditions = Conjunction([(C("i{}".format(i)) == 0) for i in range(n_indices)])

    result = driver(
        cube=cube,
        store=function_store,
        conditions=conditions,
        payload_columns=["p", "x"],
    )

    assert len(result) == 1
    df_actual = result[0]
    df_expected = df.loc[df["x"] == 0].reindex(columns=["p", "x"])
    pdt.assert_frame_equal(df_actual, df_expected)
def _ask_conditions(conditions, all_columns, all_types):
    txt = prompt(
        message="Conditions: ",
        history=_history_conditions,
        default=str(conditions) if conditions is not None else "",
        completer=WordCompleter(sorted(all_columns)),
        validator=_ValidatorFromParse(
            partial(Conjunction.from_string, all_types=all_types)
        ),
    )
    return Conjunction.from_string(txt, all_types)
def _dermine_load_columns(cube, datasets, intention):
    """
    Determine which columns to load from the given datasets.

    Parameters
    ----------
    cube: Cube
        Cube specification.
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Available datasets.
    intention: kartothek.io_components.cube.query._intention.QueryIntention
        Query intention.

    Returns
    -------
    load_columns: Dict[str, Set[str]]
        Columns to load.
    """
    result = {}
    for ktk_cube_dataset_id, ds in datasets.items():
        is_seed = ktk_cube_dataset_id == cube.seed_dataset
        ds_cols = get_dataset_columns(ds)
        dimensionality = ds_cols & set(cube.dimension_columns)
        is_projection = not dimensionality.issubset(set(intention.dimension_columns))

        mask = (
            set(intention.output_columns)
            | set(intention.dimension_columns)
            | intention.conditions_post.get(ktk_cube_dataset_id, Conjunction([])).columns
        )

        if not is_seed:
            # optimize the load routine by restoring partition columns only for the seed dataset
            mask -= set(cube.partition_columns)

        candidates = ds_cols & mask
        payload = candidates - set(cube.partition_columns) - set(cube.dimension_columns)
        payload_requested = len(payload) > 0

        if is_seed or payload_requested:
            if is_projection and payload_requested:
                raise ValueError(
                    (
                        'Cannot project dataset "{ktk_cube_dataset_id}" with dimensionality [{dimensionality}] to '
                        "[{dimension_columns}] while keeping the following payload intact: {payload}"
                    ).format(
                        ktk_cube_dataset_id=ktk_cube_dataset_id,
                        dimensionality=", ".join(sorted(dimensionality)),
                        dimension_columns=", ".join(sorted(intention.dimension_columns)),
                        payload=", ".join(sorted(payload)),
                    )
                )

            result[ktk_cube_dataset_id] = candidates

    return result
def apply_condition_unsafe(df, cond):
    # For the sparse_outer testset, the test_df has the wrong datatype because we cannot encode missing integer data
    # in pandas.
    #
    # The condition will not be applicable to the DF because the DF has floats while conditions have ints. We fix
    # that by modifying the condition.
    #
    # In case there is no missing data because of the right conditions, kartothek will return integer data.
    # assert_frame_equal will then complain about this. So in case there is no missing data, let's recover the
    # correct dtype here.
    if not isinstance(cond, Conjunction):
        cond = Conjunction(cond)

    float_cols = {col for col in df.columns if df[col].dtype == float}

    # convert int conditions to float conditions
    cond2 = Conjunction([])
    for col, conj in cond.split_by_column().items():
        if col in float_cols:
            parts = []
            for part in conj.conditions:
                if isinstance(part, IsInCondition):
                    part = IsInCondition(
                        column=part.column, value=tuple(float(v) for v in part.value)
                    )
                elif isinstance(part, InIntervalCondition):
                    part = InIntervalCondition(
                        column=part.column,
                        start=float(part.start),
                        stop=float(part.stop),
                    )
                else:
                    part = part.__class__(column=part.column, value=float(part.value))
                parts.append(part)
            conj = Conjunction(parts)
        cond2 &= conj

    # apply conditions
    df = cond2.filter_df(df).reset_index(drop=True)

    # convert float columns back to int columns where no data is missing
    for col in df.columns:
        if df[col].notnull().all():
            dtype = df[col].dtype
            if dtype == np.float64:
                dtype = np.int64
            elif dtype == np.float32:
                dtype = np.int32
            elif dtype == np.float16:
                dtype = np.int16

            df[col] = df[col].astype(dtype)

    return df
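The helper above leans on the `Conjunction` API that the surrounding tests exercise: conditions built from `C(...)` comparisons, combined with `&`, split per column, and applied to a DataFrame. A minimal, self-contained sketch of that flow (assuming `C` and `Conjunction` are importable from `kartothek.core.cube.conditions`, as the docstrings in this module suggest):

```python
import pandas as pd

# Assumed import path, based on the docstring references above.
from kartothek.core.cube.conditions import C, Conjunction

# Build a conjunction from two simple conditions.
cond = (C("foö") == 42) & (C("bar") >= 2)

df = pd.DataFrame({"foö": [13, 42, 42, 100], "bar": [1, 2, 3, 4]})

# Rows where both conditions hold.
print(cond.filter_df(df))

# Per-column view, as used by apply_condition_unsafe above.
print(cond.split_by_column())  # {"foö": ..., "bar": ...}

# Predicate form suitable for predicate pushdown (operator strings assumed).
print(cond.predicate)  # e.g. [("foö", "==", 42), ("bar", ">=", 2)]
```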
def _determine_restrictive_dataset_ids(cube, datasets, intention):
    """
    Determine which datasets are restrictive.

    These are datasets that contain non-dimension and non-partition columns to which the user wishes to apply
    restrictions (via conditions or via partition-by).

    Parameters
    ----------
    cube: Cube
        Cube specification.
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Available datasets.
    intention: kartothek.io_components.cube.query._intention.QueryIntention
        Query intention.

    Returns
    -------
    restrictive_dataset_ids: Set[str]
        Set of restrictive datasets (by ktk_cube dataset ID).
    """
    result = set()
    for ktk_cube_dataset_id, dataset in datasets.items():
        if ktk_cube_dataset_id == cube.seed_dataset:
            continue

        mask = (
            set(intention.partition_by)
            | intention.conditions_pre.get(ktk_cube_dataset_id, Conjunction([])).columns
            | intention.conditions_post.get(ktk_cube_dataset_id, Conjunction([])).columns
        ) - (set(cube.dimension_columns) | set(cube.partition_columns))

        overlap = mask & get_dataset_columns(dataset)
        if overlap:
            result.add(ktk_cube_dataset_id)

    return result
class TestConjunction:
    def test_simple(self):
        col = C("foö")
        cond1 = col < 10
        cond2 = col > 0

        conj = cond1 & cond2
        assert isinstance(conj, Conjunction)
        assert conj.conditions == (cond1, cond2)
        assert str(conj) == "(foö < 10) & (foö > 0)"
        assert conj.columns == {"foö"}
        assert conj.predicate == [("foö", "<", 10), ("foö", ">", 0)]
        assert conj.split_by_column() == {"foö": conj}

    def test_nested_conj_cond(self):
        col = C("foö")
        cond1 = col < 10
        cond2 = col > 0
        cond3 = col != 10

        conj1 = cond1 & cond2
        conj2 = conj1 & cond3
        assert isinstance(conj2, Conjunction)
        assert conj2.conditions == (cond1, cond2, cond3)
        assert str(conj2) == "(foö < 10) & (foö > 0) & (foö != 10)"
        assert conj2.columns == {"foö"}
        assert conj2.predicate == [
            ("foö", "<", 10),
            ("foö", ">", 0),
            ("foö", "!=", 10),
        ]
        assert conj2.split_by_column() == {"foö": conj2}

    def test_nested_cond_conj(self):
        col = C("foö")
        cond1 = col < 10
        cond2 = col > 0
        cond3 = col != 10

        conj1 = cond2 & cond3
        conj2 = cond1 & conj1
        assert isinstance(conj2, Conjunction)
        assert conj2.conditions == (cond1, cond2, cond3)

    def test_nested_conj_conj(self):
        col = C("foö")
        cond1 = col < 10
        cond2 = col > 0
        cond3 = col != 10
        cond4 = col != 11

        conj1 = cond1 & cond2
        conj2 = cond3 & cond4
        conj3 = conj1 & conj2
        assert isinstance(conj3, Conjunction)
        assert conj3.conditions == (cond1, cond2, cond3, cond4)

    def test_fails_nocond(self):
        col = C("foö")
        cond1 = col < 10
        with pytest.raises(TypeError) as exc:
            cond1 & col
        assert str(exc.value) == "Can only build conjunction out of conditions."

    def test_multicol(self):
        col1 = C("foö")
        col2 = C("bar")
        cond1 = col1 < 10
        cond2 = col1 > 0
        cond3 = col2 != 10

        conj1 = cond1 & cond2
        conj2 = conj1 & cond3
        assert isinstance(conj2, Conjunction)
        assert conj2.conditions == (cond1, cond2, cond3)
        assert str(conj2) == "(foö < 10) & (foö > 0) & (bar != 10)"
        assert conj2.columns == {"foö", "bar"}
        assert conj2.predicate == [
            ("foö", "<", 10),
            ("foö", ">", 0),
            ("bar", "!=", 10),
        ]
        assert conj2.split_by_column() == {"foö": conj1, "bar": Conjunction([cond3])}

    def test_empty_real(self):
        conj = Conjunction([])
        assert conj.conditions == ()
        assert str(conj) == ""
        assert conj.columns == set()
        assert conj.predicate is None
        assert conj.split_by_column() == {}

    def test_empty_pseudo(self):
        cond = InIntervalCondition("x")
        conj = Conjunction([cond])
        assert conj.conditions == (cond,)
        assert str(conj) == "(x.in_interval(None, None))"
        assert conj.columns == set()
        assert conj.predicate is None
        assert conj.split_by_column() == {}

    def test_filter_df_some(self):
        cond = (C("foö") == 42) & (C("bar") == 2)
        df = pd.DataFrame({"foö": [13, 42, 42, 100], "bar": [1, 2, 3, 4], "z": 0.0})
        df_actual = cond.filter_df(df)
        df_expected = df.loc[(df["foö"] == 42) & (df["bar"] == 2)]
        pdt.assert_frame_equal(df_actual, df_expected)

    def test_filter_df_empty(self):
        cond = Conjunction([])
        df = pd.DataFrame({"foö": [13, 42, 42, 100], "bar": [1, 2, 3, 4], "z": 0.0})
        df_actual = cond.filter_df(df)
        pdt.assert_frame_equal(df_actual, df)

    def test_filter_df_nulls(self):
        cond = (C("foö") != 42.0) & (C("bar") != 2.0)
        df = pd.DataFrame(
            {"foö": [13, 42, np.nan, np.nan], "bar": [1, 2, 3, np.nan], "z": np.nan}
        )
        df_actual = cond.filter_df(df)
        df_expected = pd.DataFrame({"foö": [13.0], "bar": [1.0], "z": [np.nan]})
        pdt.assert_frame_equal(df_actual, df_expected)

    def test_hash(self):
        col = C("foö")
        cond1 = col < 10
        cond2 = col > 0
        cond3 = col != 10

        conj1a = cond1 & cond2
        conj1b = cond1 & cond2
        conj2 = cond1 & cond3

        assert hash(conj1a) == hash(conj1b)
        assert hash(conj1a) != hash(conj2)
    @pytest.mark.parametrize(
        "s,expected",
        [
            ("sö == a", Conjunction([C("sö") == "a"])),
            ("sö == a & iö < 10", Conjunction([C("sö") == "a", C("iö") < 10])),
            ("(sö == a) & (iö < 10)", Conjunction([C("sö") == "a", C("iö") < 10])),
            ("", Conjunction([])),
            (" ", Conjunction([])),
        ],
    )
    def test_from_string_ok(self, s, expected):
        all_types = {"sö": pa.string(), "bö": pa.bool_(), "iö": pa.int16()}
        actual = Conjunction.from_string(s, all_types)
        assert actual == expected

        s2 = str(actual)
        actual2 = Conjunction.from_string(s2, all_types)
        assert actual2 == actual

    @pytest.mark.parametrize(
        "obj,expected",
        [
            (
                # obj
                C("foö") > 1,
                # expected
                Conjunction([C("foö") > 1]),
            ),
            (
                # obj
                [C("foö") > 1],
                # expected
                Conjunction([C("foö") > 1]),
            ),
            (
                # obj
                [C("foö") > 1, C("bar") < 1],
                # expected
                Conjunction([C("foö") > 1, C("bar") < 1]),
            ),
            (
                # obj
                Conjunction([C("foö") > 1, C("bar") < 1]),
                # expected
                Conjunction([C("foö") > 1, C("bar") < 1]),
            ),
            (
                # obj
                None,
                # expected
                Conjunction([]),
            ),
        ],
    )
    def test_init_from_obj(self, obj, expected):
        actual = Conjunction(obj)
        assert actual == expected

    def test_fails(self):
        with pytest.raises(
            TypeError, match="Can only build conjunction out of conditions."
        ):
            Conjunction(1)

    def test_json_serialization_ok(self):
        conj = Conjunction(
            [
                EqualityCondition(column="foö", value=1.2),
                GreaterEqualCondition(column="foö", value=1.2),
                GreaterThanCondition(column="foö", value=1.2),
                InequalityCondition(column="foö", value=1.2),
                LessEqualCondition(column="foö", value=1.2),
                LessThanCondition(column="foö", value=1.2),
                InIntervalCondition(column="foö", start=1.2, stop=2.3),
                IsInCondition(column="foö", value=[1.2, 1.3]),
            ]
        )
        array_actual = conj.to_jsonarray()
        array_expected = [
            {"type": "EqualityCondition", "column": "foö", "value": 1.2},
            {"type": "GreaterEqualCondition", "column": "foö", "value": 1.2},
            {"type": "GreaterThanCondition", "column": "foö", "value": 1.2},
            {"type": "InequalityCondition", "column": "foö", "value": 1.2},
            {"type": "LessEqualCondition", "column": "foö", "value": 1.2},
            {"type": "LessThanCondition", "column": "foö", "value": 1.2},
            {"type": "InIntervalCondition", "column": "foö", "start": 1.2, "stop": 2.3},
            {"type": "IsInCondition", "column": "foö", "value": [1.2, 1.3]},
        ]
        assert array_actual == array_expected

        conj2 = Conjunction.from_jsonarray(array_actual)
        assert conj2 == conj

        # input not altered
        assert array_actual == array_expected

    @pytest.mark.parametrize(
        "array",
        [
            [{"type": "str"}],
            [{"type": "Condition"}],
            [{"type": "C"}],
            [{"type": "Conjunction"}],
            [{"type": "SimpleCondition"}],
            [{"type": "VirtualColumn"}],
            [{"type": "FooBar"}],
            [{"type": ""}],
            [{"type": " "}],
        ],
    )
    def test_json_serialization_fail_type(self, array):
        with pytest.raises(TypeError, match="Unknown condition class"):
            Conjunction.from_jsonarray(array)

    def test_json_serialization_fail_no_list(self):
        with pytest.raises(TypeError, match="jsonarray must be a list"):
            Conjunction.from_jsonarray({})

    def test_json_serialization_fail_no_cond_dict(self):
        with pytest.raises(TypeError, match="Condition in jsonarray must be a dict"):
            Conjunction.from_jsonarray([1])

    def test_json_serialization_fail_type_missing(self):
        with pytest.raises(ValueError, match="Missing type value for condition"):
            Conjunction.from_jsonarray([{}])
def prepare_metapartitions_for_removal_action(
    cube, store, conditions, ktk_cube_dataset_ids, existing_datasets
):
    """
    Prepare MetaPartition to express removal of given data range from cube.

    The MetaPartition must still be written using ``mp.store_dataframes(...)`` and added to the Dataset using a
    kartothek update method.

    Parameters
    ----------
    cube: kartothek.core.cube.cube.Cube
        Cube spec.
    store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        Store.
    conditions: Union[None, Condition, Iterable[Condition], Conjunction]
        Conditions that should be applied, optional. Defaults to "entire cube".
    ktk_cube_dataset_ids: Optional[Union[Iterable[str], str]]
        Ktk_cube dataset IDs to apply the remove action to, optional. Defaults to "all".
    existing_datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Existing datasets.

    Returns
    -------
    metapartitions: Dict[str, Tuple[kartothek.core.dataset.DatasetMetadata, kartothek.io_components.metapartition.MetaPartition, List[Dict[str, Any]]]]
        MetaPartitions that should be written and updated to the kartothek datasets as well as the ``delete_scope``
        for kartothek.
    """
    conditions = Conjunction(conditions)
    conditions_split = conditions.split_by_column()
    if set(conditions_split.keys()) - set(cube.partition_columns):
        raise ValueError(
            "Can only remove partitions with conditions concerning cubes physical partition columns."
        )

    ktk_cube_dataset_ids = converter_str_set_optional(ktk_cube_dataset_ids)
    if ktk_cube_dataset_ids is not None:
        unknown_dataset_ids = ktk_cube_dataset_ids - set(existing_datasets.keys())
        if unknown_dataset_ids:
            raise ValueError(
                "Unknown ktk_cube_dataset_ids: {}".format(
                    ", ".join(sorted(unknown_dataset_ids))
                )
            )
    else:
        ktk_cube_dataset_ids = set(existing_datasets.keys())

    metapartitions = {}
    for ktk_cube_dataset_id in ktk_cube_dataset_ids:
        ds = existing_datasets[ktk_cube_dataset_id]
        ds = ds.load_partition_indices()
        mp = _prepare_mp_empty(ds)

        if not ds.partition_keys:
            # no partition keys --> delete all
            delete_scope = [{}]
        else:
            df_partitions = get_partition_dataframe(dataset=ds, cube=cube)
            df_partitions = df_partitions.drop_duplicates()

            local_condition = reduce(
                lambda a, b: a & b,
                (
                    cond
                    for col, cond in conditions_split.items()
                    if col in df_partitions.columns
                ),
                Conjunction([]),
            )
            df_partitions = local_condition.filter_df(df_partitions)

            delete_scope = df_partitions.to_dict(orient="records")

        metapartitions[ktk_cube_dataset_id] = (ds, mp, delete_scope)

    return metapartitions
def _process_conditions(
    conditions, cube, datasets, all_available_columns, indexed_columns
):
    """
    Process and check given query conditions.

    Parameters
    ----------
    conditions: Union[None, Condition, Iterable[Condition], Conjunction]
        Conditions that should be applied.
    cube: Cube
        Cube specification.
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that are present.
    all_available_columns: Set[str]
        All columns that are available for query.
    indexed_columns: Dict[str, Set[str]]
        Indexed columns per ktk_cube dataset ID.

    Returns
    -------
    conditions_pre: Dict[str, kartothek.core.cube.conditions.Conjunction]
        Conditions to be applied based on the index data alone.
    conditions_post: Dict[str, kartothek.core.cube.conditions.Conjunction]
        Conditions to be applied during the load process.

    Raises
    ------
    TypeError: In case of a wrong type.
    """
    conditions = Conjunction(conditions)

    condition_columns = conditions.columns
    missing = condition_columns - all_available_columns
    if missing:
        raise ValueError(
            "Following condition columns are required but are missing from the cube: {missing}".format(
                missing=", ".join(sorted(missing))
            )
        )
    _test_condition_types(conditions, datasets)

    conditions_split = conditions.split_by_column()

    conditions_pre = {}
    for ktk_cube_dataset_id, ds in datasets.items():
        candidate_cols = indexed_columns[ktk_cube_dataset_id]
        if not candidate_cols:
            continue

        filtered = [
            conj for col, conj in conditions_split.items() if col in candidate_cols
        ]
        if not filtered:
            continue

        conditions_pre[ktk_cube_dataset_id] = reduce(Conjunction.from_two, filtered)

    conditions_post = {}
    for ktk_cube_dataset_id, ds in datasets.items():
        candidate_cols = (get_dataset_columns(ds) & condition_columns) - set(
            cube.partition_columns
        )
        if not candidate_cols:
            continue

        filtered = [
            conj for col, conj in conditions_split.items() if col in candidate_cols
        ]
        if not filtered:
            continue

        conditions_post[ktk_cube_dataset_id] = reduce(Conjunction.from_two, filtered)

    return conditions_pre, conditions_post
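The recombination step above reduces a list of per-column conjunctions back into a single `Conjunction` per dataset. A small sketch of that pattern, with hypothetical column names and assuming `Conjunction.from_two` merges two conjunctions (which is what its use with `reduce` implies):

```python
from functools import reduce

from kartothek.core.cube.conditions import C, Conjunction

conditions = Conjunction([C("p") == 1, C("x") > 0, C("y") <= 5])
conditions_split = conditions.split_by_column()  # {"p": ..., "x": ..., "y": ...}

# Hypothetical set of columns present in one dataset.
candidate_cols = {"p", "x"}

# Keep only the per-column conjunctions relevant to this dataset and merge them.
filtered = [conj for col, conj in conditions_split.items() if col in candidate_cols]
combined = reduce(Conjunction.from_two, filtered)
print(combined)  # e.g. "(p == 1) & (x > 0)"
```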
C("v1") >= 7, C("v1") >= 10000, C("v2") >= 7, C("v3") >= 3, C("i1") >= 7, C("i1") >= 10000, C("i2") >= 7, C("i2") != 0, C("i3") >= 3, C("p") >= 1, C("q") >= 1, C("x") >= 1, C("y") >= 1, (C("x") == 3) & (C("y") == 3), (C("i1") > 0) & (C("i2") > 0), Conjunction([]), ], ) def test_condition(driver, module_store, test_cube, test_df, cond): result = driver(cube=test_cube, store=module_store, conditions=cond) df_expected = apply_condition_unsafe(test_df, cond) if df_expected.empty: assert len(result) == 0 else: assert len(result) == 1 df_actual = result[0] pdt.assert_frame_equal(df_actual, df_expected)
def _create_aligned_partition_df(
    datasets, cube, intention, indexed_columns, restrictive_dataset_ids
):
    """
    Create DataFrame w/ aligned partitions.

    The output will have a single row per partition that shares the same physical partition and the same
    partition-by attributes. For this, the following columns are present:

    - ``'__ktk_cube_labels_<ktk_cube dataset ID>'``: a column per dataset w/ either NULL or a list of labels that
      belong to the partition entry
    - physical partition columns
    - additional partition-by columns

    Parameters
    ----------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that are processed by the regrouper.
    cube: Cube
        Cube specification.
    intention: kartothek.io_components.cube.query._intention.QueryIntention
        Query intention.
    indexed_columns: Dict[str, Set[str]]
        Indexed columns per ktk_cube dataset ID.
    restrictive_dataset_ids: Set[str]
        Datasets (by Ktk_cube dataset ID) that are restrictive during the join process.

    Returns
    -------
    df_aligned: pandas.DataFrame
        Aligned partitions-DF.
    """
    # Stage 1: Partition DataFrames per Dataset.
    #
    # These DataFrames are classified in 3 categories:
    # - seed:     seed dataset
    # - restrict: conditions are applied (therefore data must be present)
    # - other:    not a seed and w/o any condition
    df_seed = None
    dfs_restrict = []
    dfs_other = []

    for ktk_cube_dataset_id, ds in datasets.items():
        preconditions = intention.conditions_pre.get(ktk_cube_dataset_id, Conjunction([]))
        local_partition_by = sorted(
            indexed_columns[ktk_cube_dataset_id] & set(intention.partition_by)
        )
        df = _create_dataset_df(
            preconditions=preconditions,
            ktk_cube_dataset_id=ktk_cube_dataset_id,
            ds=ds,
            cube=cube,
            local_partition_by=local_partition_by,
        )

        # categorize
        if ktk_cube_dataset_id == cube.seed_dataset:
            assert df_seed is None
            df_seed = df
        elif ktk_cube_dataset_id in restrictive_dataset_ids:
            dfs_restrict.append(df)
        else:
            dfs_other.append(df)

    # Stage 2: Alignment
    #
    # Partition DataFrames are aligned based on Cube.partition_columns and their category.
    assert df_seed is not None
    df_aligned = df_seed
    for df_join in dfs_restrict:
        df_aligned = merge_dataframes_robust(df_aligned, df_join, how="inner")
    for df_join in dfs_other:
        df_aligned = merge_dataframes_robust(df_aligned, df_join, how="left")

    return df_aligned
def _regroup(df_aligned, intention, indexed_columns, datasets, cube):
    """
    Based on partition_by, form query groups.

    .. important::
        If the intention does not contain a partition-by, partition by the cube partition columns to speed up the
        query on parallel backends. In that case, the backend must concat and check the resulting dataframes
        before passing them to the user.

    Parameters
    ----------
    df_aligned: pandas.DataFrame
        Aligned DataFrame, taken from :meth:`_create_aligned_partition_df`.
    intention: kartothek.io_components.cube.query._intention.QueryIntention
        Query intention.
    indexed_columns: Dict[str, Set[str]]
        Indexed columns per ktk_cube dataset ID.
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that are processed by the regrouper.
    cube: Cube
        Cube specification.

    Returns
    -------
    label2gp: Dict[str, Dict[str, Tuple[int, int]]]
        Maps "dataset ID -> (label -> (group ID, partition ID))".
    group2cond: Dict[int, kartothek.core.cube.conditions.Conjunction]
        Condition per group.
    """
    partition_by = intention.partition_by
    if not partition_by:
        # special code to speed up the query
        partition_by = cube.partition_columns

    label2gp = defaultdict(lambda: defaultdict(list))
    group2cond = {}

    # figure out which datasets are affected by which additional condition
    extra_conditions_target = {}
    for ktk_cube_dataset_id, cols in indexed_columns.items():
        if ktk_cube_dataset_id not in datasets:
            # may be irrelevant
            continue
        for col in cols & set(partition_by):
            extra_conditions_target[col] = ktk_cube_dataset_id

    # generate groups
    for g, df_g in df_aligned.groupby(list(partition_by), sort=True):
        gid = g
        if len(partition_by) == 1:
            g = (g,)

        conditions_g = copy(intention.conditions_post)
        for g_part, col in zip(g, partition_by):
            if col in cube.partition_columns:
                # we do not need predicate pushdown for physical partition columns
                continue

            ktk_cube_dataset_id = extra_conditions_target[col]
            conditions_g[ktk_cube_dataset_id] = conditions_g.get(
                ktk_cube_dataset_id, Conjunction([])
            ) & (C(col) == g_part)

        _aligned_df_to_label2gp(df_g, datasets, gid, label2gp)
        group2cond[gid] = conditions_g

    return label2gp, group2cond