Example No. 1
    def test_empty_real(self):
        conj = Conjunction([])
        assert conj.conditions == ()
        assert str(conj) == ""
        assert conj.columns == set()
        assert conj.predicate is None
        assert conj.split_by_column() == {}
Example No. 2
    def test_empty_pseudo(self):
        cond = InIntervalCondition("x")
        conj = Conjunction([cond])
        assert conj.conditions == (cond, )
        assert str(conj) == "(x.in_interval(None, None))"
        assert conj.columns == set()
        assert conj.predicate is None
        assert conj.split_by_column() == {}
Example No. 3
    def test_from_string_ok(self, s, expected):
        all_types = {"sö": pa.string(), "bö": pa.bool_(), "iö": pa.int16()}
        actual = Conjunction.from_string(s, all_types)
        assert actual == expected

        s2 = str(actual)
        actual2 = Conjunction.from_string(s2, all_types)
        assert actual2 == actual
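
The round trip above works because str(conjunction) emits a form that Conjunction.from_string accepts again. A minimal sketch of that behaviour, assuming the same kartothek import path the tests use and a purely illustrative schema (column names and condition string are made up):

import pyarrow as pa

from kartothek.core.cube.conditions import Conjunction

# hypothetical column -> pyarrow type mapping used by the parser
all_types = {"country": pa.string(), "age": pa.int16()}

# parse a condition string into a Conjunction ...
conj = Conjunction.from_string("country == DE & age < 10", all_types)

# ... and round-trip it through its string representation
assert Conjunction.from_string(str(conj), all_types) == conj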
Example No. 4
    def test_filter_df_empty(self):
        cond = Conjunction([])
        df = pd.DataFrame({
            "foö": [13, 42, 42, 100],
            "bar": [1, 2, 3, 4],
            "z": 0.0
        })
        df_actual = cond.filter_df(df)
        pdt.assert_frame_equal(df_actual, df)
Example No. 5
def test_stresstest_index_select_row(driver, function_store):
    n_indices = 100
    n_rows = 1000

    data = {"x": np.arange(n_rows), "p": 0}
    for i in range(n_indices):
        data["i{}".format(i)] = np.arange(n_rows)
    df = pd.DataFrame(data)

    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        index_columns=["i{}".format(i) for i in range(n_indices)],
    )

    build_cube(data=df, cube=cube, store=function_store)

    conditions = Conjunction([(C("i{}".format(i)) == 0) for i in range(n_indices)])

    result = driver(
        cube=cube,
        store=function_store,
        conditions=conditions,
        payload_columns=["p", "x"],
    )
    assert len(result) == 1
    df_actual = result[0]
    df_expected = df.loc[df["x"] == 0].reindex(columns=["p", "x"])
    pdt.assert_frame_equal(df_actual, df_expected)
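
The stress test builds its conjunction from a list comprehension; Conjunction also accepts a single condition, another Conjunction, or None (see the test_init_from_obj parametrization further down). A small sketch of the same pattern with three hypothetical index columns instead of 100:

from kartothek.core.cube.conditions import C, Conjunction

# three hypothetical index columns instead of the 100 used above
conditions = Conjunction([C("i{}".format(i)) == 0 for i in range(3)])

assert len(conditions.conditions) == 3
assert conditions.columns == {"i0", "i1", "i2"}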
Example No. 6
def _ask_conditions(conditions, all_columns, all_types):
    txt = prompt(
        message="Conditions: ",
        history=_history_conditions,
        default=str(conditions) if conditions is not None else "",
        completer=WordCompleter(sorted(all_columns)),
        validator=_ValidatorFromParse(
            partial(Conjunction.from_string, all_types=all_types)),
    )
    return Conjunction.from_string(txt, all_types)
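
Both the prompt validator and the final parse go through the same callable, built with functools.partial around Conjunction.from_string. A sketch of that callback with a made-up schema; the broad exception handler is an assumption, since the concrete parse-error type is not visible in this snippet:

from functools import partial

import pyarrow as pa

from kartothek.core.cube.conditions import Conjunction

all_types = {"country": pa.string(), "age": pa.int16()}  # hypothetical schema
parse = partial(Conjunction.from_string, all_types=all_types)

def is_valid(txt):
    # accept the prompt input only if it parses into a Conjunction
    try:
        parse(txt)
        return True
    except Exception:  # the exact parse-error type is an implementation detail
        return False

assert is_valid("age < 10")
assert not is_valid("age <")  # presumably rejected: incomplete condition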
Example No. 7
def _dermine_load_columns(cube, datasets, intention):
    """
    Determine which columns to load from given datasets.

    Parameters
    ----------
    cube: Cube
        Cube specification.
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Available datasets.
    intention: kartothek.io_components.cube.query._intention.QueryIntention
        Query intention.

    Returns
    -------
    load_columns: Dict[str, Set[str]]
        Columns to load.
    """
    result = {}
    for ktk_cube_dataset_id, ds in datasets.items():
        is_seed = ktk_cube_dataset_id == cube.seed_dataset
        ds_cols = get_dataset_columns(ds)
        dimensionality = ds_cols & set(cube.dimension_columns)
        is_projection = not dimensionality.issubset(
            set(intention.dimension_columns))

        mask = (set(intention.output_columns)
                | set(intention.dimension_columns)
                | intention.conditions_post.get(ktk_cube_dataset_id,
                                                Conjunction([])).columns)
        if not is_seed:
            # optimize the load routine by only restoring partition columns for the seed dataset
            mask -= set(cube.partition_columns)

        candidates = ds_cols & mask
        payload = candidates - set(cube.partition_columns) - set(
            cube.dimension_columns)
        payload_requested = len(payload) > 0

        if is_seed or payload_requested:
            if is_projection and payload_requested:
                raise ValueError((
                    'Cannot project dataset "{ktk_cube_dataset_id}" with dimensionality [{dimensionality}] to '
                    "[{dimension_columns}] while keeping the following payload intact: {payload}"
                ).format(
                    ktk_cube_dataset_id=ktk_cube_dataset_id,
                    dimensionality=", ".join(sorted(dimensionality)),
                    dimension_columns=", ".join(
                        sorted(intention.dimension_columns)),
                    payload=", ".join(sorted(payload)),
                ))

            result[ktk_cube_dataset_id] = candidates
    return result
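
Stripped of the dataset handling, the column selection above is plain set arithmetic. A toy walk-through for a single non-seed dataset with made-up column names:

# hypothetical inputs for one non-seed dataset
ds_cols = {"x", "p", "i0", "v1"}     # columns stored in the dataset
output_columns = {"x", "v1"}         # columns requested by the query
dimension_columns = {"x"}            # cube dimension columns
partition_columns = {"p"}            # cube partition columns
post_condition_columns = set()       # columns referenced by post-load conditions

mask = output_columns | dimension_columns | post_condition_columns
mask -= partition_columns            # non-seed datasets do not restore partition columns

candidates = ds_cols & mask
payload = candidates - partition_columns - dimension_columns
assert candidates == {"x", "v1"}
assert payload == {"v1"}             # payload requested, so the dataset is loaded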
Example No. 8
def apply_condition_unsafe(df, cond):
    # For the sparse_outer testset, the test_df has the wrong datatype because we cannot encode missing integer data in
    # pandas.
    #
    # The condition will not be applicable to the DF because the DF has floats while the conditions have ints. We fix
    # that by modifying the condition.
    #
    # If the conditions happen to leave no missing data, kartothek will return integer data, and assert_frame_equal
    # would then complain about the dtype mismatch. So in case there is no missing data, let's recover the correct
    # dtype here.

    if not isinstance(cond, Conjunction):
        cond = Conjunction(cond)

    float_cols = {col for col in df.columns if df[col].dtype == float}

    # convert int to float conditions
    cond2 = Conjunction([])
    for col, conj in cond.split_by_column().items():
        if col in float_cols:
            parts = []
            for part in conj.conditions:
                if isinstance(part, IsInCondition):
                    part = IsInCondition(column=part.column,
                                         value=tuple(
                                             (float(v) for v in part.value)))
                elif isinstance(part, InIntervalCondition):
                    part = InIntervalCondition(
                        column=part.column,
                        start=float(part.start),
                        stop=float(part.stop),
                    )
                else:
                    part = part.__class__(column=part.column,
                                          value=float(part.value))
                parts.append(part)
            conj = Conjunction(parts)
        cond2 &= conj

    # apply conditions
    df = cond2.filter_df(df).reset_index(drop=True)

    # convert float columns to int columns
    for col in df.columns:
        if df[col].notnull().all():
            dtype = df[col].dtype
            if dtype == np.float64:
                dtype = np.int64
            elif dtype == np.float32:
                dtype = np.int32
            elif dtype == np.float16:
                dtype = np.int16

            df[col] = df[col].astype(dtype)

    return df
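
The comment at the top of this helper boils down to a pandas dtype rule: a plain integer column cannot hold missing values, so it is upcast to float. A minimal illustration:

import numpy as np
import pandas as pd

# an integer column with a missing value is silently upcast to float64 ...
assert pd.Series([1, 2, np.nan]).dtype == np.float64

# ... while a complete integer column keeps its integer dtype, which is what
# kartothek returns when the conditions leave no missing data
assert pd.Series([1, 2, 3]).dtype == np.int64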
Example No. 9
def _determine_restrictive_dataset_ids(cube, datasets, intention):
    """
    Determine which datasets are restrictive.

    These are datasets which contain non-dimension and non-partition columns to which the user wishes to apply
    restrictions (via conditions or via partition-by).

    Parameters
    ----------
    cube: Cube
        Cube specification.
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Available datasets.
    intention: kartothek.io_components.cube.query._intention.QueryIntention
        Query intention.

    Returns
    -------
    restrictive_dataset_ids: Set[str]
        Set of restrictive datasets (by Ktk_cube dataset ID).
    """
    result = set()
    for ktk_cube_dataset_id, dataset in datasets.items():
        if ktk_cube_dataset_id == cube.seed_dataset:
            continue

        mask = (set(intention.partition_by)
                | intention.conditions_pre.get(ktk_cube_dataset_id,
                                               Conjunction([])).columns
                | intention.conditions_post.get(
                    ktk_cube_dataset_id, Conjunction([])).columns) - (set(
                        cube.dimension_columns) | set(cube.partition_columns))
        overlap = mask & get_dataset_columns(dataset)
        if overlap:
            result.add(ktk_cube_dataset_id)

    return result
Example No. 10
    def test_multicol(self):
        col1 = C("foö")
        col2 = C("bar")
        cond1 = col1 < 10
        cond2 = col1 > 0
        cond3 = col2 != 10
        conj1 = cond1 & cond2
        conj2 = conj1 & cond3
        assert isinstance(conj2, Conjunction)
        assert conj2.conditions == (cond1, cond2, cond3)
        assert str(conj2) == "(foö < 10) & (foö > 0) & (bar != 10)"
        assert conj2.columns == {"foö", "bar"}
        assert conj2.predicate == [
            ("foö", "<", 10),
            ("foö", ">", 0),
            ("bar", "!=", 10),
        ]
        assert conj2.split_by_column() == {
            "foö": conj1,
            "bar": Conjunction([cond3])
        }
Example No. 11
    def test_fails(self):
        with pytest.raises(
                TypeError,
                match="Can only build conjunction out of conditions."):
            Conjunction(1)
Example No. 12
class TestConjunction:
    def test_simple(self):
        col = C("foö")
        cond1 = col < 10
        cond2 = col > 0
        conj = cond1 & cond2
        assert isinstance(conj, Conjunction)
        assert conj.conditions == (cond1, cond2)
        assert str(conj) == "(foö < 10) & (foö > 0)"
        assert conj.columns == {"foö"}
        assert conj.predicate == [("foö", "<", 10), ("foö", ">", 0)]
        assert conj.split_by_column() == {"foö": conj}

    def test_nested_conj_cond(self):
        col = C("foö")
        cond1 = col < 10
        cond2 = col > 0
        cond3 = col != 10
        conj1 = cond1 & cond2
        conj2 = conj1 & cond3
        assert isinstance(conj2, Conjunction)
        assert conj2.conditions == (cond1, cond2, cond3)
        assert str(conj2) == "(foö < 10) & (foö > 0) & (foö != 10)"
        assert conj2.columns == {"foö"}
        assert conj2.predicate == [
            ("foö", "<", 10),
            ("foö", ">", 0),
            ("foö", "!=", 10),
        ]
        assert conj2.split_by_column() == {"foö": conj2}

    def test_nested_cond_conj(self):
        col = C("foö")
        cond1 = col < 10
        cond2 = col > 0
        cond3 = col != 10
        conj1 = cond2 & cond3
        conj2 = cond1 & conj1
        assert isinstance(conj2, Conjunction)
        assert conj2.conditions == (cond1, cond2, cond3)

    def test_nested_conj_conj(self):
        col = C("foö")
        cond1 = col < 10
        cond2 = col > 0
        cond3 = col != 10
        cond4 = col != 11
        conj1 = cond1 & cond2
        conj2 = cond3 & cond4
        conj3 = conj1 & conj2
        assert isinstance(conj3, Conjunction)
        assert conj3.conditions == (cond1, cond2, cond3, cond4)

    def test_fails_nocond(self):
        col = C("foö")
        cond1 = col < 10
        with pytest.raises(TypeError) as exc:
            cond1 & col
        assert str(
            exc.value) == "Can only build conjunction out of conditions."

    def test_multicol(self):
        col1 = C("foö")
        col2 = C("bar")
        cond1 = col1 < 10
        cond2 = col1 > 0
        cond3 = col2 != 10
        conj1 = cond1 & cond2
        conj2 = conj1 & cond3
        assert isinstance(conj2, Conjunction)
        assert conj2.conditions == (cond1, cond2, cond3)
        assert str(conj2) == "(foö < 10) & (foö > 0) & (bar != 10)"
        assert conj2.columns == {"foö", "bar"}
        assert conj2.predicate == [
            ("foö", "<", 10),
            ("foö", ">", 0),
            ("bar", "!=", 10),
        ]
        assert conj2.split_by_column() == {
            "foö": conj1,
            "bar": Conjunction([cond3])
        }

    def test_empty_real(self):
        conj = Conjunction([])
        assert conj.conditions == ()
        assert str(conj) == ""
        assert conj.columns == set()
        assert conj.predicate is None
        assert conj.split_by_column() == {}

    def test_empty_pseudo(self):
        cond = InIntervalCondition("x")
        conj = Conjunction([cond])
        assert conj.conditions == (cond, )
        assert str(conj) == "(x.in_interval(None, None))"
        assert conj.columns == set()
        assert conj.predicate is None
        assert conj.split_by_column() == {}

    def test_filter_df_some(self):
        cond = (C("foö") == 42) & (C("bar") == 2)
        df = pd.DataFrame({
            "foö": [13, 42, 42, 100],
            "bar": [1, 2, 3, 4],
            "z": 0.0
        })
        df_actual = cond.filter_df(df)
        df_expected = df.loc[(df["foö"] == 42) & (df["bar"] == 2)]
        pdt.assert_frame_equal(df_actual, df_expected)

    def test_filter_df_empty(self):
        cond = Conjunction([])
        df = pd.DataFrame({
            "foö": [13, 42, 42, 100],
            "bar": [1, 2, 3, 4],
            "z": 0.0
        })
        df_actual = cond.filter_df(df)
        pdt.assert_frame_equal(df_actual, df)

    def test_filter_df_nulls(self):
        cond = (C("foö") != 42.0) & (C("bar") != 2.0)
        df = pd.DataFrame({
            "foö": [13, 42, np.nan, np.nan],
            "bar": [1, 2, 3, np.nan],
            "z": np.nan
        })
        df_actual = cond.filter_df(df)
        df_expected = pd.DataFrame({
            "foö": [13.0],
            "bar": [1.0],
            "z": [np.nan]
        })
        pdt.assert_frame_equal(df_actual, df_expected)

    def test_hash(self):
        col = C("foö")
        cond1 = col < 10
        cond2 = col > 0
        cond3 = col != 10
        conj1a = cond1 & cond2
        conj1b = cond1 & cond2
        conj2 = cond1 & cond3
        assert hash(conj1a) == hash(conj1b)
        assert hash(conj1a) != hash(conj2)

    @pytest.mark.parametrize(
        "s,expected",
        [
            ("sö == a", Conjunction([C("sö") == "a"])),
            ("sö == a & iö < 10", Conjunction([C("sö") == "a",
                                               C("iö") < 10])),
            ("(sö == a) & (iö < 10)",
             Conjunction([C("sö") == "a", C("iö") < 10])),
            ("", Conjunction([])),
            ("  ", Conjunction([])),
        ],
    )
    def test_from_string_ok(self, s, expected):
        all_types = {"sö": pa.string(), "bö": pa.bool_(), "iö": pa.int16()}
        actual = Conjunction.from_string(s, all_types)
        assert actual == expected

        s2 = str(actual)
        actual2 = Conjunction.from_string(s2, all_types)
        assert actual2 == actual

    @pytest.mark.parametrize(
        "obj,expected",
        [
            (
                # obj
                C("foö") > 1,
                # expected
                Conjunction([C("foö") > 1]),
            ),
            (
                # obj
                [C("foö") > 1],
                # expected
                Conjunction([C("foö") > 1]),
            ),
            (
                # obj
                [C("foö") > 1, C("bar") < 1],
                # expected
                Conjunction([C("foö") > 1, C("bar") < 1]),
            ),
            (
                # obj
                Conjunction([C("foö") > 1, C("bar") < 1]),
                # expected
                Conjunction([C("foö") > 1, C("bar") < 1]),
            ),
            (
                # obj
                None,
                # expected
                Conjunction([]),
            ),
        ],
    )
    def test_init_from_obj(self, obj, expected):
        actual = Conjunction(obj)
        assert actual == expected

    def test_fails(self):
        with pytest.raises(
                TypeError,
                match="Can only build conjunction out of conditions."):
            Conjunction(1)

    def test_json_serialization_ok(self):
        conj = Conjunction([
            EqualityCondition(column="foö", value=1.2),
            GreaterEqualCondition(column="foö", value=1.2),
            GreaterThanCondition(column="foö", value=1.2),
            InequalityCondition(column="foö", value=1.2),
            LessEqualCondition(column="foö", value=1.2),
            LessThanCondition(column="foö", value=1.2),
            InIntervalCondition(column="foö", start=1.2, stop=2.3),
            IsInCondition(column="foö", value=[1.2, 1.3]),
        ])

        array_actual = conj.to_jsonarray()
        array_expected = [
            {
                "type": "EqualityCondition",
                "column": "foö",
                "value": 1.2
            },
            {
                "type": "GreaterEqualCondition",
                "column": "foö",
                "value": 1.2
            },
            {
                "type": "GreaterThanCondition",
                "column": "foö",
                "value": 1.2
            },
            {
                "type": "InequalityCondition",
                "column": "foö",
                "value": 1.2
            },
            {
                "type": "LessEqualCondition",
                "column": "foö",
                "value": 1.2
            },
            {
                "type": "LessThanCondition",
                "column": "foö",
                "value": 1.2
            },
            {
                "type": "InIntervalCondition",
                "column": "foö",
                "start": 1.2,
                "stop": 2.3
            },
            {
                "type": "IsInCondition",
                "column": "foö",
                "value": [1.2, 1.3]
            },
        ]
        assert array_actual == array_expected

        conj2 = Conjunction.from_jsonarray(array_actual)
        assert conj2 == conj

        # input not altered
        assert array_actual == array_expected

    @pytest.mark.parametrize(
        "array",
        [
            [{
                "type": "str"
            }],
            [{
                "type": "Condition"
            }],
            [{
                "type": "C"
            }],
            [{
                "type": "Conjunction"
            }],
            [{
                "type": "SimpleCondition"
            }],
            [{
                "type": "VirtualColumn"
            }],
            [{
                "type": "FooBar"
            }],
            [{
                "type": ""
            }],
            [{
                "type": " "
            }],
        ],
    )
    def test_json_serialization_fail_type(self, array):
        with pytest.raises(TypeError, match="Unknown condition class"):
            Conjunction.from_jsonarray(array)

    def test_json_serialization_fail_no_list(self):
        with pytest.raises(TypeError, match="jsonarray must be a list"):
            Conjunction.from_jsonarray({})

    def test_json_serialization_fail_no_cond_dict(self):
        with pytest.raises(TypeError,
                           match="Condition in jsonarray must be a dict"):
            Conjunction.from_jsonarray([1])

    def test_json_serialization_fail_type_missing(self):
        with pytest.raises(ValueError,
                           match="Missing type value for condition"):
            Conjunction.from_jsonarray([{}])
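
Taken together, the test class covers most of the public Conjunction surface: building via &, columns, predicate, split_by_column, filter_df, from_string, and the JSON round trip. A condensed usage sketch, assuming the same kartothek imports the tests use (column names and values are illustrative only):

import pandas as pd

from kartothek.core.cube.conditions import C, Conjunction

conj = (C("foo") > 0) & (C("foo") < 10) & (C("bar") != 3)

assert conj.columns == {"foo", "bar"}
assert conj.predicate == [("foo", ">", 0), ("foo", "<", 10), ("bar", "!=", 3)]
assert set(conj.split_by_column()) == {"foo", "bar"}

# filter a DataFrame with the conjunction; only the middle row survives
df = pd.DataFrame({"foo": [1, 5, 20], "bar": [3, 4, 5]})
assert list(conj.filter_df(df)["foo"]) == [5]

# serialize to a JSON-compatible list of dicts and back
assert Conjunction.from_jsonarray(conj.to_jsonarray()) == conj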
Example No. 13
def prepare_metapartitions_for_removal_action(cube, store, conditions,
                                              ktk_cube_dataset_ids,
                                              existing_datasets):
    """
    Prepare MetaPartition to express removal of given data range from cube.

    The MetaPartition must still be written using ``mp.store_dataframes(...)`` and added to the Dataset using a
    kartothek update method.

    Parameters
    ----------
    cube: kartothek.core.cube.cube.Cube
        Cube spec.
    store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        Store.
    conditions: Union[None, Condition, Iterable[Condition], Conjunction]
        Conditions that should be applied, optional. Defaults to "entire cube".
    ktk_cube_dataset_ids: Optional[Union[Iterable[str], str]]
        Ktk_cube dataset IDs to apply the remove action to, optional. Default to "all".
    existing_datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Existing datasets.

    Returns
    -------
    metapartitions: Dict[str, Tuple[kartothek.core.dataset.DatasetMetadata,
            kartothek.io_components.metapartition.MetaPartition, List[Dict[str, Any]]]]
        MetaPartitions that should be written and added to the kartothek datasets, as well as the ``delete_scope`` for
        kartothek.
    """
    conditions = Conjunction(conditions)
    conditions_split = conditions.split_by_column()
    if set(conditions_split.keys()) - set(cube.partition_columns):
        raise ValueError(
            "Can only remove partitions with conditions concerning cubes physical partition columns."
        )

    ktk_cube_dataset_ids = converter_str_set_optional(ktk_cube_dataset_ids)
    if ktk_cube_dataset_ids is not None:
        unknown_dataset_ids = ktk_cube_dataset_ids - set(
            existing_datasets.keys())
        if unknown_dataset_ids:
            raise ValueError("Unknown ktk_cube_dataset_ids: {}".format(
                ", ".join(sorted(unknown_dataset_ids))))
    else:
        ktk_cube_dataset_ids = set(existing_datasets.keys())

    metapartitions = {}
    for ktk_cube_dataset_id in ktk_cube_dataset_ids:
        ds = existing_datasets[ktk_cube_dataset_id]
        ds = ds.load_partition_indices()
        mp = _prepare_mp_empty(ds)

        if not ds.partition_keys:
            # no partition keys --> delete all
            delete_scope = [{}]
        else:

            df_partitions = get_partition_dataframe(dataset=ds, cube=cube)
            df_partitions = df_partitions.drop_duplicates()
            local_condition = reduce(
                lambda a, b: a & b,
                (cond for col, cond in conditions_split.items()
                 if col in df_partitions.columns),
                Conjunction([]),
            )
            df_partitions = local_condition.filter_df(df_partitions)

            delete_scope = df_partitions.to_dict(orient="records")

        metapartitions[ktk_cube_dataset_id] = (ds, mp, delete_scope)

    return metapartitions
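
The delete-scope computation above folds together only those per-column sub-conjunctions whose column is actually present in the partition DataFrame. A small sketch of that reduce pattern with hypothetical partition columns:

from functools import reduce

from kartothek.core.cube.conditions import C, Conjunction

# hypothetical conditions on physical partition columns
conditions = Conjunction([C("p") == 1, C("q") >= 2])
conditions_split = conditions.split_by_column()   # {"p": ..., "q": ...}

# pretend only "p" is present in this dataset's partition DataFrame
present_columns = {"p"}
local_condition = reduce(
    lambda a, b: a & b,
    (cond for col, cond in conditions_split.items() if col in present_columns),
    Conjunction([]),
)
assert local_condition.columns == {"p"}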
Example No. 14
def _process_conditions(
    conditions, cube, datasets, all_available_columns, indexed_columns
):
    """
    Process and check given query conditions.

    Parameters
    ----------
    conditions: Union[None, Condition, Iterable[Condition], Conjunction]
        Conditions that should be applied.
    cube: Cube
        Cube specification.
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that are present.
    all_available_columns: Set[str]
        All columns that are available for query.
    indexed_columns: Dict[str, Set[str]]
        Indexed columns per ktk_cube dataset ID.

    Returns
    -------
    conditions_pre: Dict[str, kartothek.core.cube.conditions.Conjunction]
        Conditions to be applied based on the index data alone.
    conditions_post: Dict[str, kartothek.core.cube.conditions.Conjunction]
        Conditions to be applied during the load process.

    Raises
    -------
    TypeError: In case of a wrong type.
    """
    conditions = Conjunction(conditions)

    condition_columns = conditions.columns
    missing = condition_columns - all_available_columns
    if missing:
        raise ValueError(
            "Following condition columns are required but are missing from the cube: {missing}".format(
                missing=", ".join(sorted(missing))
            )
        )
    _test_condition_types(conditions, datasets)

    conditions_split = conditions.split_by_column()

    conditions_pre = {}
    for ktk_cube_dataset_id, ds in datasets.items():
        candidate_cols = indexed_columns[ktk_cube_dataset_id]
        if not candidate_cols:
            continue

        filtered = [
            conj for col, conj in conditions_split.items() if col in candidate_cols
        ]
        if not filtered:
            continue

        conditions_pre[ktk_cube_dataset_id] = reduce(Conjunction.from_two, filtered)

    conditions_post = {}
    for ktk_cube_dataset_id, ds in datasets.items():
        candidate_cols = (get_dataset_columns(ds) & condition_columns) - set(
            cube.partition_columns
        )
        if not candidate_cols:
            continue

        filtered = [
            conj for col, conj in conditions_split.items() if col in candidate_cols
        ]
        if not filtered:
            continue

        conditions_post[ktk_cube_dataset_id] = reduce(Conjunction.from_two, filtered)

    return conditions_pre, conditions_post
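
Both loops follow the same pattern: split the user conjunction per column, keep only the sub-conjunctions relevant for the dataset, and fold them back together with Conjunction.from_two. A toy version with hypothetical columns (the classification of "i0"/"i1" as indexed and "p" as a partition column is made up):

from functools import reduce

from kartothek.core.cube.conditions import C, Conjunction

conditions = Conjunction([C("i0") == 1, C("i1") > 0, C("p") == 2])
conditions_split = conditions.split_by_column()

# hypothetical classification: "i0" and "i1" are indexed, "p" is a partition column
indexed = {"i0", "i1"}
pre = reduce(
    Conjunction.from_two,
    [conj for col, conj in conditions_split.items() if col in indexed],
)
assert pre.columns == {"i0", "i1"}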
Example No. 15
    def test_json_serialization_fail_no_cond_dict(self):
        with pytest.raises(TypeError,
                           match="Condition in jsonarray must be a dict"):
            Conjunction.from_jsonarray([1])
Example No. 16
    def test_json_serialization_fail_type_missing(self):
        with pytest.raises(ValueError,
                           match="Missing type value for condition"):
            Conjunction.from_jsonarray([{}])
Example No. 17
    def test_json_serialization_fail_no_list(self):
        with pytest.raises(TypeError, match="jsonarray must be a list"):
            Conjunction.from_jsonarray({})
Example No. 18
    def test_json_serialization_fail_type(self, array):
        with pytest.raises(TypeError, match="Unknown condition class"):
            Conjunction.from_jsonarray(array)
Example No. 19
        C("v1") >= 7,
        C("v1") >= 10000,
        C("v2") >= 7,
        C("v3") >= 3,
        C("i1") >= 7,
        C("i1") >= 10000,
        C("i2") >= 7,
        C("i2") != 0,
        C("i3") >= 3,
        C("p") >= 1,
        C("q") >= 1,
        C("x") >= 1,
        C("y") >= 1,
        (C("x") == 3) & (C("y") == 3),
        (C("i1") > 0) & (C("i2") > 0),
        Conjunction([]),
    ],
)
def test_condition(driver, module_store, test_cube, test_df, cond):
    result = driver(cube=test_cube, store=module_store, conditions=cond)

    df_expected = apply_condition_unsafe(test_df, cond)

    if df_expected.empty:
        assert len(result) == 0
    else:
        assert len(result) == 1
        df_actual = result[0]
        pdt.assert_frame_equal(df_actual, df_expected)

Example No. 20
def _create_aligned_partition_df(datasets, cube, intention, indexed_columns,
                                 restrictive_dataset_ids):
    """
    Create DataFrame w/ aligned partitions.

    The output will have a single row per partition that shares the same physical partition and the same partition-by
    attributes. For this, the following columns are present:

    - ``'__ktk_cube_labels_<ktk_cube dataset ID>'``: a column per dataset w/ either NULL or a list of labels that belong to the
      partition entry
    - physical partition columns
    - additional partition-by columns

    Parameters
    ----------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that are processed by the regrouper.
    cube: Cube
        Cube specification.
    intention: kartothek.io_components.cube.query._intention.QueryIntention
        Query intention.
    indexed_columns: Dict[str, Set[str]]
        Indexed columns per ktk_cube dataset ID.
    restrictive_dataset_ids: Set[str]
        Datasets (by Ktk_cube dataset ID) that are restrictive during the join process.

    Returns
    -------
    df_aligned: pandas.DataFrame
        Aligned partitions-DF.
    """
    # Stage 1: Partition DataFrames per Dataset.
    #
    # These DataFrames are classified in 3 categories:
    # - seed: seed dataset
    # - restrict: conditions are applied (therefore data must be present)
    # - other: not a seed and w/o any condition
    df_seed = None
    dfs_restrict = []
    dfs_other = []

    for ktk_cube_dataset_id, ds in datasets.items():
        preconditions = intention.conditions_pre.get(ktk_cube_dataset_id,
                                                     Conjunction([]))
        local_partition_by = sorted(indexed_columns[ktk_cube_dataset_id]
                                    & set(intention.partition_by))
        df = _create_dataset_df(
            preconditions=preconditions,
            ktk_cube_dataset_id=ktk_cube_dataset_id,
            ds=ds,
            cube=cube,
            local_partition_by=local_partition_by,
        )

        # categorize
        if ktk_cube_dataset_id == cube.seed_dataset:
            assert df_seed is None
            df_seed = df
        elif ktk_cube_dataset_id in restrictive_dataset_ids:
            dfs_restrict.append(df)
        else:
            dfs_other.append(df)

    # Stage 2: Alignment
    #
    # Partition DataFrames are aligned based on Cube.partition_columns and their category.
    assert df_seed is not None
    df_aligned = df_seed
    for df_join in dfs_restrict:
        df_aligned = merge_dataframes_robust(df_aligned, df_join, how="inner")
    for df_join in dfs_other:
        df_aligned = merge_dataframes_robust(df_aligned, df_join, how="left")

    return df_aligned
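
merge_dataframes_robust is kartothek-internal, but the alignment order itself is the important part: restrictive datasets are inner-joined onto the seed (they may prune partitions), while all remaining datasets are left-joined (they may be absent for a partition). A plain-pandas stand-in with made-up frames and label columns:

import pandas as pd

# hypothetical per-dataset partition frames, keyed by the physical partition column "p"
df_seed = pd.DataFrame({"p": [0, 1, 2], "labels_seed": ["a", "b", "c"]})
df_restrict = pd.DataFrame({"p": [0, 1], "labels_enrich": ["d", "e"]})
df_other = pd.DataFrame({"p": [0], "labels_extra": ["f"]})

# restrictive datasets prune partitions (inner), the rest are optional (left)
df_aligned = df_seed.merge(df_restrict, on="p", how="inner")
df_aligned = df_aligned.merge(df_other, on="p", how="left")

assert list(df_aligned["p"]) == [0, 1]                         # p == 2 was pruned
assert df_aligned["labels_extra"].isna().tolist() == [False, True]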
Example No. 21
    def test_init_from_obj(self, obj, expected):
        actual = Conjunction(obj)
        assert actual == expected
Example No. 22
def _regroup(df_aligned, intention, indexed_columns, datasets, cube):
    """
    Based on partition_by, form query groups.

    .. important::
        If the intention does not contain a partition-by, the data is partitioned by the cube partition columns to
        speed up the query on parallel backends. In that case, the backend must concatenate and check the resulting
        dataframes before passing them to the user.

    Parameters
    ----------
    df_aligned: pandas.DataFrame
        aligned DataFrame, taken from :meth:`_create_aligned_partition_df`
    intention: kartothek.io_components.cube.query._intention.QueryIntention
        Query intention.
    indexed_columns: Dict[str, Set[str]]
        Indexed columns per ktk_cube dataset ID.
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that are processed by the regrouper.
    cube: Cube
        Cube specification.

    Returns
    -------
    label2gp: Dict[str, Dict[str, Tuple[int, int]]]
        Maps "dataset ID -> (label -> (group ID, partition ID))".
    group2cond: Dict[int, kartothek.core.cube.conditions.Conjunction]
        Condition per group.
    """
    partition_by = intention.partition_by
    if not partition_by:
        # special code to speed up the query
        partition_by = cube.partition_columns

    label2gp = defaultdict(lambda: defaultdict(list))
    group2cond = {}
    # figure out which datasets are affected by which additional condition
    extra_conditions_target = {}
    for ktk_cube_dataset_id, cols in indexed_columns.items():
        if ktk_cube_dataset_id not in datasets:
            # may be irrelevant
            continue
        for col in cols & set(partition_by):
            extra_conditions_target[col] = ktk_cube_dataset_id

    # generate groups
    for g, df_g in df_aligned.groupby(list(partition_by), sort=True):
        gid = g
        if len(partition_by) == 1:
            g = (g, )

        conditions_g = copy(intention.conditions_post)
        for g_part, col in zip(g, partition_by):
            if col in cube.partition_columns:
                # we do not need predicate pushdown for physical partition columns
                continue

            ktk_cube_dataset_id = extra_conditions_target[col]
            conditions_g[ktk_cube_dataset_id] = conditions_g.get(
                ktk_cube_dataset_id, Conjunction([])) & (C(col) == g_part)

        _aligned_df_to_label2gp(df_g, datasets, gid, label2gp)
        group2cond[gid] = conditions_g

    return label2gp, group2cond
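
For every group, the post-load conditions are extended by an equality condition per non-physical partition-by column. A simplified sketch of that augmentation (one conjunction instead of the per-dataset dict used above; the column names and group values are hypothetical):

from kartothek.core.cube.conditions import C, Conjunction

# hypothetical group produced by the groupby over the partition_by columns
partition_by = ("p", "i0")   # "p" is a physical partition column, "i0" is an index column
group = (1, 7)
physical_columns = {"p"}

conditions_g = Conjunction([])   # would start from intention.conditions_post
for g_part, col in zip(group, partition_by):
    if col in physical_columns:
        # no predicate pushdown needed for physical partition columns
        continue
    conditions_g = conditions_g & (C(col) == g_part)

assert conditions_g.columns == {"i0"}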
Example No. 23
    def test_json_serialization_ok(self):
        conj = Conjunction([
            EqualityCondition(column="foö", value=1.2),
            GreaterEqualCondition(column="foö", value=1.2),
            GreaterThanCondition(column="foö", value=1.2),
            InequalityCondition(column="foö", value=1.2),
            LessEqualCondition(column="foö", value=1.2),
            LessThanCondition(column="foö", value=1.2),
            InIntervalCondition(column="foö", start=1.2, stop=2.3),
            IsInCondition(column="foö", value=[1.2, 1.3]),
        ])

        array_actual = conj.to_jsonarray()
        array_expected = [
            {
                "type": "EqualityCondition",
                "column": "foö",
                "value": 1.2
            },
            {
                "type": "GreaterEqualCondition",
                "column": "foö",
                "value": 1.2
            },
            {
                "type": "GreaterThanCondition",
                "column": "foö",
                "value": 1.2
            },
            {
                "type": "InequalityCondition",
                "column": "foö",
                "value": 1.2
            },
            {
                "type": "LessEqualCondition",
                "column": "foö",
                "value": 1.2
            },
            {
                "type": "LessThanCondition",
                "column": "foö",
                "value": 1.2
            },
            {
                "type": "InIntervalCondition",
                "column": "foö",
                "start": 1.2,
                "stop": 2.3
            },
            {
                "type": "IsInCondition",
                "column": "foö",
                "value": [1.2, 1.3]
            },
        ]
        assert array_actual == array_expected

        conj2 = Conjunction.from_jsonarray(array_actual)
        assert conj2 == conj

        # input not altered
        assert array_actual == array_expected