Example #1
def _replace_time_condition(
    query: Union[CompositeQuery[QueryEntity], LogicalQuery]
) -> None:
    condition = query.get_condition()
    top_level = (
        get_first_level_and_conditions(condition) if condition is not None else []
    )
    max_days, date_align = state.get_configs(
        [("max_days", None), ("date_align_seconds", 1)]
    )
    assert isinstance(date_align, int)
    if max_days is not None:
        max_days = int(max_days)

    if isinstance(query, LogicalQuery):
        new_top_level = _align_max_days_date_align(
            query.get_from_clause().key, top_level, max_days, date_align
        )
        query.set_ast_condition(combine_and_conditions(new_top_level))
    else:
        from_clause = query.get_from_clause()
        if not isinstance(from_clause, JoinClause):
            return

        alias_map = from_clause.get_alias_node_map()
        for alias, node in alias_map.items():
            assert isinstance(node.data_source, QueryEntity)  # mypy
            new_top_level = _align_max_days_date_align(
                node.data_source.key, top_level, max_days, date_align, alias
            )
            top_level = new_top_level
            query.set_ast_condition(combine_and_conditions(new_top_level))
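All of the examples in this listing revolve around the same pair of helpers. Based on how they are used here (and on get_first_level_and_conditions acting as the rough inverse), combine_and_conditions presumably folds a sequence of condition expressions into a single nested AND tree built with binary_condition(BooleanFunctions.AND, ...). Below is a minimal, self-contained sketch of that behavior using a toy expression type; it is an illustration only, not the real Snuba implementation.

from dataclasses import dataclass
from functools import reduce
from typing import Sequence, Tuple


@dataclass(frozen=True)
class Expr:
    # Toy stand-in for a Snuba expression node (FunctionCall, Column, ...).
    name: str
    parameters: Tuple["Expr", ...] = ()


def binary_and(lhs: Expr, rhs: Expr) -> Expr:
    # Roughly what binary_condition(BooleanFunctions.AND, lhs, rhs) builds.
    return Expr("and", (lhs, rhs))


def combine_and(conditions: Sequence[Expr]) -> Expr:
    # Sketch of combine_and_conditions: fold N conditions into a nested AND.
    assert conditions, "expected at least one condition"
    return reduce(binary_and, conditions)


def first_level_ands(condition: Expr) -> Sequence[Expr]:
    # Sketch of get_first_level_and_conditions: flatten the top-level AND chain.
    if condition.name != "and":
        return [condition]
    lhs, rhs = condition.parameters
    return [*first_level_ands(lhs), *first_level_ands(rhs)]


a, b, c = Expr("a = 1"), Expr("b = 2"), Expr("c = 3")
combined = combine_and([a, b, c])
assert first_level_ands(combined) == [a, b, c]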
Example #2
    def process_query(self, query: Query,
                      request_settings: RequestSettings) -> None:
        max_prewhere_conditions: int = (self.__max_prewhere_conditions
                                        or settings.MAX_PREWHERE_CONDITIONS)
        prewhere_keys = self.__prewhere_candidates

        # HACK: If query has final, do not move any condition on a column in the
        # omit_if_final list to prewhere.
        # There is a bug in ClickHouse affecting queries with FINAL and PREWHERE
        # with Low Cardinality and Nullable columns.
        # https://github.com/ClickHouse/ClickHouse/issues/16171
        if query.get_from_clause().final and self.__omit_if_final:
            prewhere_keys = [
                key for key in prewhere_keys if key not in self.__omit_if_final
            ]

        if not prewhere_keys:
            return

        ast_condition = query.get_condition_from_ast()
        if ast_condition is None:
            return

        prewhere_candidates = [
            (get_columns_in_expression(cond), cond)
            for cond in get_first_level_and_conditions(ast_condition)
            if isinstance(cond, FunctionCall)
            and cond.function_name in ALLOWED_OPERATORS and any(
                col.column_name in prewhere_keys
                for col in get_columns_in_expression(cond))
        ]
        if not prewhere_candidates:
            return

        # Use the condition that has the highest priority (based on the
        # position of its columns in the prewhere keys list)
        sorted_candidates = sorted(
            [(
                min(
                    prewhere_keys.index(col.column_name)
                    for col in cols if col.column_name in prewhere_keys),
                cond,
            ) for cols, cond in prewhere_candidates],
            key=lambda priority_and_col: priority_and_col[0],
        )
        prewhere_conditions = [cond for _, cond in sorted_candidates
                               ][:max_prewhere_conditions]

        new_conditions = [
            cond for cond in get_first_level_and_conditions(ast_condition)
            if cond not in prewhere_conditions
        ]

        query.set_ast_condition(
            combine_and_conditions(new_conditions) if new_conditions else None)
        query.set_prewhere_ast_condition(
            combine_and_conditions(prewhere_conditions
                                   ) if prewhere_conditions else None)
Example #3
    def process_query(self, query: Query, request_settings: RequestSettings) -> None:
        max_prewhere_conditions: int = (
            self.__max_prewhere_conditions or settings.MAX_PREWHERE_CONDITIONS
        )
        prewhere_keys = query.get_from_clause().prewhere_candidates
        if not prewhere_keys:
            return

        ast_condition = query.get_condition_from_ast()
        if ast_condition is None:
            return

        prewhere_candidates = [
            (get_columns_in_expression(cond), cond)
            for cond in get_first_level_and_conditions(ast_condition)
            if isinstance(cond, FunctionCall)
            and cond.function_name in ALLOWED_OPERATORS
            and any(
                col.column_name in prewhere_keys
                for col in get_columns_in_expression(cond)
            )
        ]
        if not prewhere_candidates:
            return

        # Use the condition that has the highest priority (based on the
        # position of its columns in the prewhere keys list)
        sorted_candidates = sorted(
            [
                (
                    min(
                        prewhere_keys.index(col.column_name)
                        for col in cols
                        if col.column_name in prewhere_keys
                    ),
                    cond,
                )
                for cols, cond in prewhere_candidates
            ],
            key=lambda priority_and_col: priority_and_col[0],
        )
        prewhere_conditions = [cond for _, cond in sorted_candidates][
            :max_prewhere_conditions
        ]

        new_conditions = [
            cond
            for cond in get_first_level_and_conditions(ast_condition)
            if cond not in prewhere_conditions
        ]

        query.set_ast_condition(
            combine_and_conditions(new_conditions) if new_conditions else None
        )
        query.set_prewhere_ast_condition(
            combine_and_conditions(prewhere_conditions) if prewhere_conditions else None
        )
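The sorting step above is the heart of these prewhere processors: each candidate condition is ranked by the best (lowest) position any of its columns occupies in prewhere_keys, and only the first max_prewhere_conditions survive. A small standalone illustration of that ranking, using hypothetical column names:

prewhere_keys = ["project_id", "event_id", "environment"]
candidates = [
    (["environment"], "cond_on_environment"),
    (["event_id", "message"], "cond_on_event_id"),  # "message" is not a key
    (["project_id"], "cond_on_project_id"),
]
ranked = sorted(
    (min(prewhere_keys.index(col) for col in cols if col in prewhere_keys), cond)
    for cols, cond in candidates
)
assert [cond for _, cond in ranked] == [
    "cond_on_project_id",
    "cond_on_event_id",
    "cond_on_environment",
]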
Example #4
def test_mand_conditions(table: str, mand_conditions: List[FunctionCall]) -> None:

    query = Query(
        Table(
            table,
            ColumnSet([]),
            final=False,
            sampling_rate=None,
            mandatory_conditions=mand_conditions,
        ),
        None,
        None,
        binary_condition(
            BooleanFunctions.AND,
            binary_condition(
                OPERATOR_TO_FUNCTION["="], Column("d", None, "d"), Literal(None, "1"),
            ),
            binary_condition(
                OPERATOR_TO_FUNCTION["="], Column("c", None, "c"), Literal(None, "3"),
            ),
        ),
    )

    query_ast_copy = copy.deepcopy(query)

    request_settings = HTTPRequestSettings(consistent=True)
    processor = MandatoryConditionApplier()
    processor.process_query(query, request_settings)

    query_ast_copy.add_condition_to_ast(combine_and_conditions(mand_conditions))

    assert query.get_condition_from_ast() == query_ast_copy.get_condition_from_ast()
Example #5
    def _update_conditions(self, query: Query,
                           prewhere_conditions: Sequence[Expression]) -> None:
        ast_condition = query.get_condition_from_ast()
        # This should never be None at this point, but for mypy this can be None.
        assert ast_condition is not None

        new_conditions = [
            cond for cond in get_first_level_and_conditions(ast_condition)
            if cond not in prewhere_conditions
        ]

        query.set_ast_condition(
            combine_and_conditions(new_conditions) if new_conditions else None)
        query.set_prewhere_ast_condition(
            combine_and_conditions(prewhere_conditions
                                   ) if prewhere_conditions else None)
Example #6
def _replace_ast_condition(
    query: Query, field: str, operator: str, new_operand: Expression
) -> None:
    """
    Replaces a condition in the top level AND boolean condition
    in the query WHERE clause.
    """

    def replace_condition(expression: Expression) -> Expression:
        match = FunctionCall(
            String(OPERATOR_TO_FUNCTION[operator]),
            (Param("column", Column(None, String(field))), AnyExpression()),
        ).match(expression)

        return (
            expression
            if match is None
            else replace(
                expression, parameters=(match.expression("column"), new_operand)
            )
        )

    condition = query.get_condition_from_ast()
    if condition is not None:
        query.set_ast_condition(
            combine_and_conditions(
                [
                    replace_condition(c)
                    for c in get_first_level_and_conditions(condition)
                ]
            )
        )
Example #7
 def visit_and_expression(
     self, node: Node, visited_children: Tuple[Any, Expression, Any,
                                               Expression]) -> Expression:
     _, left_condition, _, and_condition = visited_children
     args = [left_condition]
      # In the case of a single condition,
      # and_condition will be an empty Node.
     if isinstance(and_condition, Node):
         return left_condition
     if isinstance(and_condition, (AndTuple, OrTuple)):
         _, exp = and_condition
         return combine_and_conditions([left_condition, exp])
     elif isinstance(and_condition, list):
         for elem in and_condition:
             _, exp = elem
             args.append(exp)
     return combine_and_conditions(args)
Example #8
    def process_query(self, query: Query,
                      query_settings: QuerySettings) -> None:

        mandatory_conditions = query.get_from_clause().mandatory_conditions

        if len(mandatory_conditions) > 0:
            query.add_condition_to_ast(
                combine_and_conditions(mandatory_conditions))
Example #9
 def build_query(self) -> ProcessableQuery[Entity]:
     return LogicalQuery(
         from_clause=self.__data_source,
         selected_columns=list(
             sorted(self.__selected_expressions,
                    key=lambda selected: selected.name)),
         condition=combine_and_conditions(self.__conditions)
         if self.__conditions else None,
     )
Example #10
    def process_query(self, query: Query,
                      request_settings: RequestSettings) -> None:

        mandatory_conditions = query.get_data_source(
        ).get_mandatory_conditions()
        query.add_conditions([c.legacy for c in mandatory_conditions])

        if len(mandatory_conditions) > 0:
            query.add_condition_to_ast(
                combine_and_conditions([c.ast for c in mandatory_conditions]))
Example #11
def test_failure_rate_format_expressions() -> None:
    unprocessed = Query(
        {},
        TableSource("events", ColumnSet([])),
        selected_columns=[
            SelectedExpression(name=None, expression=Column(None, None, "column2")),
            SelectedExpression("perf", FunctionCall("perf", "failure_rate", ())),
        ],
    )
    expected = Query(
        {},
        TableSource("events", ColumnSet([])),
        selected_columns=[
            SelectedExpression(name=None, expression=Column(None, None, "column2")),
            SelectedExpression(
                "perf",
                divide(
                    FunctionCall(
                        None,
                        "countIf",
                        (
                            combine_and_conditions(
                                [
                                    binary_condition(
                                        None,
                                        ConditionFunctions.NEQ,
                                        Column(None, None, "transaction_status"),
                                        Literal(None, code),
                                    )
                                    for code in [0, 1, 2]
                                ]
                            ),
                        ),
                    ),
                    count(),
                    "perf",
                ),
            ),
        ],
    )

    failure_rate_processor(ColumnSet([])).process_query(
        unprocessed, HTTPRequestSettings()
    )
    assert (
        expected.get_selected_columns_from_ast()
        == unprocessed.get_selected_columns_from_ast()
    )

    ret = unprocessed.get_selected_columns_from_ast()[1].expression.accept(
        ClickhouseExpressionFormatter()
    )
    assert ret == (
        "(divide(countIf(notEquals(transaction_status, 0) AND notEquals(transaction_status, 1) AND notEquals(transaction_status, 2)), count()) AS perf)"
    )
Example #12
def filter_expression(
    columns: Expression,
    single_filtered: Dict[LiteralExpr, Sequence[str]],
    multiple_filtered: Dict[Tuple[LiteralExpr, ...], Sequence[Tuple[str,
                                                                    ...]]],
) -> Expression:
    argument_name = "arg"
    argument = Argument(None, argument_name)

    conditions: List[Expression] = []

    for index in single_filtered:
        conditions.append(
            binary_condition(
                ConditionFunctions.IN,
                tupleElement(None, argument, index),
                FunctionCallExpr(
                    None,
                    "tuple",
                    tuple(
                        LiteralExpr(None, f) for f in single_filtered[index]),
                ),
            ))

    for indices in multiple_filtered:
        conditions.append(
            binary_condition(
                ConditionFunctions.IN,
                FunctionCallExpr(
                    None,
                    "tuple",
                    tuple(
                        tupleElement(None, argument, index)
                        for index in indices),
                ),
                FunctionCallExpr(
                    None,
                    "tuple",
                    tuple(
                        FunctionCallExpr(
                            None,
                            "tuple",
                            tuple(LiteralExpr(None, t) for t in tuples),
                        ) for tuples in multiple_filtered[indices]),
                ),
            ))

    return FunctionCallExpr(
        None,
        "arrayFilter",
        (Lambda(None, (argument_name, ),
                combine_and_conditions(conditions)), columns),
    )
Example #13
    def add_conditions(
        self,
        timestamp: datetime,
        offset: Optional[int],
        query: Union[CompositeQuery[Entity], Query],
    ) -> None:
        # TODO: Support composite queries with multiple entities.
        from_clause = query.get_from_clause()
        if not isinstance(from_clause, Entity):
            raise InvalidSubscriptionError("Only simple queries are supported")
        entity = get_entity(from_clause.key)
        required_timestamp_column = entity.required_time_column
        if required_timestamp_column is None:
            raise InvalidSubscriptionError(
                "Entity must have a timestamp column for subscriptions")

        conditions_to_add: List[Expression] = [
            binary_condition(
                ConditionFunctions.EQ,
                Column(None, None, "project_id"),
                Literal(None, self.project_id),
            ),
            binary_condition(
                ConditionFunctions.GTE,
                Column(None, None, required_timestamp_column),
                Literal(None, (timestamp - self.time_window)),
            ),
            binary_condition(
                ConditionFunctions.LT,
                Column(None, None, required_timestamp_column),
                Literal(None, timestamp),
            ),
        ]

        if offset is not None:
            conditions_to_add.append(
                binary_condition(
                    ConditionFunctions.LTE,
                    FunctionCall(
                        None,
                        "ifnull",
                        (Column(None, None, "offset"), Literal(None, 0)),
                    ),
                    Literal(None, offset),
                ))

        new_condition = combine_and_conditions(conditions_to_add)
        condition = query.get_condition()
        if condition:
            new_condition = binary_condition(BooleanFunctions.AND, condition,
                                             new_condition)

        query.set_ast_condition(new_condition)
Example #14
def test_mand_conditions(table: str,
                         mand_conditions: List[MandatoryCondition]) -> None:

    body = {"conditions": [["d", "=", "1"], ["c", "=", "3"]]}

    query = Query(
        copy.deepcopy(body),
        TableSource(table, None, mand_conditions, ["c1"]),
        None,
        None,
        binary_condition(
            None,
            BooleanFunctions.AND,
            binary_condition(
                None,
                OPERATOR_TO_FUNCTION["="],
                Column("d", None, "d"),
                Literal(None, "1"),
            ),
            binary_condition(
                None,
                OPERATOR_TO_FUNCTION["="],
                Column("c", None, "c"),
                Literal(None, "3"),
            ),
        ),
    )

    query_ast_copy = copy.deepcopy(query)

    request_settings = HTTPRequestSettings(consistent=True)
    processor = MandatoryConditionApplier()
    processor.process_query(query, request_settings)

    body["conditions"].extend([c.legacy for c in mand_conditions])
    assert query.get_conditions() == body["conditions"]

    query_ast_copy.add_condition_to_ast(
        combine_and_conditions([c.ast for c in mand_conditions]))

    assert query.get_condition_from_ast(
    ) == query_ast_copy.get_condition_from_ast()
Example #15
    def _replace_time_condition(
        self,
        query: Query,
        from_date: datetime,
        from_exp: FunctionCall,
        to_date: datetime,
        to_exp: FunctionCall,
    ) -> None:
        max_days, date_align = state.get_configs(
            [("max_days", None), ("date_align_seconds", 1)]
        )

        def align_fn(dt: datetime) -> datetime:
            assert isinstance(date_align, int)
            return dt - timedelta(seconds=(dt - dt.min).seconds % date_align)

        from_date, to_date = align_fn(from_date), align_fn(to_date)
        assert from_date <= to_date

        if max_days is not None and (to_date - from_date).days > max_days:
            from_date = to_date - timedelta(days=max_days)

        def replace_cond(exp: Expression) -> Expression:
            if not isinstance(exp, FunctionCall):
                return exp
            elif exp == from_exp:
                return replace(
                    exp, parameters=(from_exp.parameters[0], Literal(None, from_date)),
                )
            elif exp == to_exp:
                return replace(
                    exp, parameters=(to_exp.parameters[0], Literal(None, to_date))
                )

            return exp

        condition = query.get_condition_from_ast()
        top_level = get_first_level_and_conditions(condition) if condition else []
        new_top_level = list(map(replace_cond, top_level))
        query.set_ast_condition(combine_and_conditions(new_top_level))
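The align_fn above rounds a timestamp down to a multiple of date_align_seconds within its day: (dt - dt.min).seconds is the number of seconds elapsed since midnight, and the remainder modulo the alignment window is subtracted. A small standalone check of that arithmetic, assuming an hourly alignment of 3600 seconds:

from datetime import datetime, timedelta


def align(dt: datetime, date_align_seconds: int) -> datetime:
    # Same arithmetic as align_fn: drop the remainder of the seconds elapsed
    # since midnight modulo the alignment window.
    return dt - timedelta(seconds=(dt - dt.min).seconds % date_align_seconds)


assert align(datetime(2021, 1, 1, 12, 34, 56), 3600) == datetime(2021, 1, 1, 12)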
Example #16
def generate_bloom_filter_condition(
    column_name: str,
    single_filtered: Dict[str, Sequence[str]],
    multiple_filtered: Dict[Tuple[str, ...], Sequence[Tuple[str, ...]]],
) -> Optional[Expression]:
    """
    Generate the filters on the array columns to use the bloom filter index on
    the spans.op and spans.group columns in order to filter the transactions
    prior to the array join.

    The bloom filter index requires the use of the has function, therefore
    the final condition is built up from a series of has conditions.
    """

    per_key_vals: Dict[str, Set[str]] = defaultdict(set)

    for key, single_filter in single_filtered.items():
        for val in single_filter:
            per_key_vals[key].add(val)

    for keys, multiple_filter in multiple_filtered.items():
        for val_tuple in multiple_filter:
            for key, val in zip(keys, val_tuple):
                per_key_vals[key].add(val)

    conditions = [
        combine_or_conditions([
            FunctionCallExpr(
                None,
                "has",
                (ColumnExpr(None, None, key), LiteralExpr(None, val)),
            ) for val in sorted(vals)
        ]) for key, vals in per_key_vals.items()
    ]

    return combine_and_conditions(conditions) if conditions else None
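A hedged illustration of what this builds: the values seen for each key (whether from the single-column or the multi-column filters) are merged, each key contributes an OR of has(...) checks, and the per-key ORs are ANDed together. The snippet below reproduces the accumulation and prints the SQL-like shape of the result; the concrete values are made up for the example.

from collections import defaultdict
from typing import Dict, Sequence, Set, Tuple

single_filtered: Dict[str, Sequence[str]] = {"spans.op": ["db"]}
multiple_filtered: Dict[Tuple[str, ...], Sequence[Tuple[str, ...]]] = {
    ("spans.op", "spans.group"): [("http", "ab1"), ("db", "cd2")]
}

# Same accumulation as above: collect every value ever requested per key.
per_key_vals: Dict[str, Set[str]] = defaultdict(set)
for key, vals in single_filtered.items():
    per_key_vals[key].update(vals)
for keys, val_tuples in multiple_filtered.items():
    for val_tuple in val_tuples:
        for key, val in zip(keys, val_tuple):
            per_key_vals[key].add(val)

condition = " AND ".join(
    "(" + " OR ".join(f"has({key}, '{val}')" for val in sorted(vals)) + ")"
    for key, vals in sorted(per_key_vals.items())
)
print(condition)
# -> (has(spans.group, 'ab1') OR has(spans.group, 'cd2')) AND (has(spans.op, 'db') OR has(spans.op, 'http'))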
Example #17
def generate_subqueries(query: CompositeQuery[Entity]) -> None:
    """
    Generates correct subqueries for each of the entities referenced in
    a join query, and pushes down all expressions that can be executed
    in the subquery.

    Columns in the select clause of the subqueries are referenced
    by providing them a mangled alias that is referenced in the external
    query.

    ```
    SELECT e.a, f(g.b) FROM Events e INNER JOIN Groups g ON ...
    ```

    becomes

    ```
    SELECT e._snuba_a, g._snuba_b
    FROM (
        SELECT a as _snuba_a
        FROM events
    ) e INNER JOIN (
        SELECT f(b) as _snuba_b
        FROM groups
    ) g ON ....
    ```

    Conditions are treated differently compared to other expressions. If
    a condition is entirely contained in a single subquery, we push it
    down entirely in the condition clause of the subquery and remove it
    from the main query entirely.
    """

    from_clause = query.get_from_clause()
    if isinstance(from_clause, CompositeQuery):
        generate_subqueries(from_clause)
        return
    elif isinstance(from_clause, ProcessableQuery):
        return

    # Now this has to be a join, so we can work with it.
    subqueries = from_clause.accept(SubqueriesInitializer())

    alias_generator = _alias_generator()
    query.set_ast_selected_columns([
        SelectedExpression(
            name=s.name,
            expression=_process_root(s.expression, subqueries,
                                     alias_generator),
        ) for s in query.get_selected_columns()
    ])

    array_join = query.get_arrayjoin()
    if array_join is not None:
        query.set_arrayjoin([
            _process_root(el, subqueries, alias_generator) for el in array_join
        ])

    ast_condition = query.get_condition()
    if ast_condition is not None:
        main_conditions = []
        for c in get_first_level_and_conditions(ast_condition):
            subexpression = c.accept(BranchCutter(alias_generator))
            if isinstance(subexpression, SubqueryExpression):
                # The expression is entirely contained in a single subquery
                # after we tried to cut subquery branches with the
                # BranchCutter visitor.
                # so push down the entire condition and remove it from
                # the main query.
                subqueries[subexpression.subquery_alias].add_condition(
                    subexpression.main_expression)
            else:
                # This condition has references to multiple subqueries.
                # We cannot push down the condition. We push down the
                # branches into the select clauses and we reference them
                # from the main query condition.
                main_conditions.append(
                    _push_down_branches(subexpression, subqueries,
                                        alias_generator))

        if main_conditions:
            query.set_ast_condition(combine_and_conditions(main_conditions))
        else:
            query.set_ast_condition(None)

    # TODO: push down the group by when it is the same as the join key.
    query.set_ast_groupby([
        _process_root(e, subqueries, alias_generator)
        for e in query.get_groupby()
    ])

    having = query.get_having()
    if having is not None:
        query.set_ast_having(
            combine_and_conditions([
                _process_root(c, subqueries, alias_generator)
                for c in get_first_level_and_conditions(having)
            ]))

    query.set_ast_orderby([
        replace(
            orderby,
            expression=_process_root(orderby.expression, subqueries,
                                     alias_generator),
        ) for orderby in query.get_orderby()
    ])

    limitby = query.get_limitby()
    if limitby is not None:
        query.set_limitby(
            replace(
                limitby,
                columns=[
                    _process_root(
                        column,
                        subqueries,
                        alias_generator,
                    ) for column in limitby.columns
                ],
            ))

    query.set_from_clause(
        SubqueriesReplacer(subqueries).visit_join_clause(from_clause))
Example #18
    def process_query(self, query: Query,
                      request_settings: RequestSettings) -> None:
        max_prewhere_conditions: int = (self.__max_prewhere_conditions
                                        or settings.MAX_PREWHERE_CONDITIONS)
        prewhere_keys = self.__prewhere_candidates

        # In case the query is final we cannot simply add any candidate
        # condition to the prewhere.
        # Final is applied after prewhere, so there are cases where moving
        # conditions to the prewhere could exclude rows from the result set
        # that would otherwise be merged under the `final` condition.
        # Example: rewriting the group_id on an unmerge. If the group_id
        # is in the prewhere, final will fail at merging the rows.
        # HACK: If query has final, do not move any condition on a column in the
        # omit_if_final list to prewhere.
        # There is a bug in ClickHouse affecting queries with FINAL and PREWHERE
        # with Low Cardinality and Nullable columns.
        # https://github.com/ClickHouse/ClickHouse/issues/16171
        if query.get_from_clause().final and self.__omit_if_final:
            prewhere_keys = [
                key for key in prewhere_keys if key not in self.__omit_if_final
            ]

        if not prewhere_keys:
            return

        ast_condition = query.get_condition()
        if ast_condition is None:
            return

        prewhere_candidates = [
            (get_columns_in_expression(cond), cond)
            for cond in get_first_level_and_conditions(ast_condition)
            if isinstance(cond, FunctionCall)
            and cond.function_name in ALLOWED_OPERATORS and any(
                col.column_name in prewhere_keys
                for col in get_columns_in_expression(cond))
        ]
        if not prewhere_candidates:
            return

        # Use the condition that has the highest priority (based on the
        # position of its columns in the prewhere keys list)
        sorted_candidates = sorted(
            [(
                min(
                    prewhere_keys.index(col.column_name)
                    for col in cols if col.column_name in prewhere_keys),
                cond,
            ) for cols, cond in prewhere_candidates],
            key=lambda priority_and_col: priority_and_col[0],
        )
        prewhere_conditions = [cond for _, cond in sorted_candidates
                               ][:max_prewhere_conditions]

        new_conditions = [
            cond for cond in get_first_level_and_conditions(ast_condition)
            if cond not in prewhere_conditions
        ]

        query.set_ast_condition(
            combine_and_conditions(new_conditions) if new_conditions else None)
        query.set_prewhere_ast_condition(
            combine_and_conditions(prewhere_conditions
                                   ) if prewhere_conditions else None)
Example #19
def add_equivalent_conditions(query: CompositeQuery[Entity]) -> None:
    """
    Finds conditions in a join query on columns that have a semantic
    equivalent in another entity in the join and add the same condition
    on the equivalent column.

    Example: In a join between events and groupedmessage, if there is
    a condition on events.project_id, it would replicate the same
    condition on groupedmessage.project_id as this is a semantically
    equivalent column.

    The goal is to reduce the amount of data that is loaded by clickhouse
    for each subquery by adding all the conditions we can to all
    subqueries.

    Cases we skip:
    - top level conditions that include columns in multiple tables.
      These cannot be pushed down to subqueries.
    - top level conditions containing multiple columns, as some may
      not have a semantic equivalent. TODO: this can be extended by
      supporting conditions that contain multiple columns which all
      have an equivalent in the same entity.
    """

    from_clause = query.get_from_clause()
    if isinstance(from_clause, CompositeQuery):
        add_equivalent_conditions(from_clause)
        return
    elif isinstance(from_clause, ProcessableQuery):
        return

    # Now this has to be a join, so we can work with it.

    alias_to_entity = {
        alias: entity_from_node(node)
        for alias, node in from_clause.get_alias_node_map().items()
    }
    entity_to_alias: MutableMapping[EntityKey, Set[str]] = {}
    for alias, entity in alias_to_entity.items():
        entity_to_alias.setdefault(entity, set()).add(alias)

    column_equivalence = get_equivalent_columns(from_clause)
    condition = query.get_condition()
    if condition is None:
        return

    and_components = get_first_level_and_conditions(condition)
    conditions_to_add = []
    for sub_condition in and_components:
        # We duplicate only the top level conditions that reference one
        # and only one column that has a semantic equivalent.
        # This excludes top level conditions that contain columns from
        # multiple entities, which cannot be pushed down to subqueries.
        #
        # TODO: Address top level conditions that contain multiple
        # columns each of which has an equivalent in the same entity.
        sole_column = _classify_single_column_condition(
            sub_condition, alias_to_entity)
        if sole_column is not None:
            column_in_condition, table_alias_in_condition = sole_column

            for equivalent_table_alias in entity_to_alias[
                    column_in_condition.entity]:
                if equivalent_table_alias != table_alias_in_condition:
                    # The entity appears multiple times in the join.
                    # Apply the same condition to every occurrence.
                    replacer = partial(
                        _replace_col,
                        table_alias_in_condition,
                        column_in_condition.column,
                        equivalent_table_alias,
                        column_in_condition.column,
                    )
                    conditions_to_add.append(sub_condition.transform(replacer))

            for equivalent in column_equivalence.get(column_in_condition, []):
                # There are equivalent columns on different entities
                # in the query. Transform the condition and add it
                # to all entities.
                equivalent_aliases = entity_to_alias.get(
                    equivalent.entity, set())
                for table_alias in equivalent_aliases:
                    replacer = partial(
                        _replace_col,
                        table_alias_in_condition,
                        column_in_condition.column,
                        table_alias,
                        equivalent.column,
                    )
                    conditions_to_add.append(sub_condition.transform(replacer))

    query.set_ast_condition(
        combine_and_conditions([*and_components, *conditions_to_add]))
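A toy rendering of the replication described in the docstring, using simplified (alias, column, operator, value) tuples instead of AST nodes; the entity, alias, and column names are only illustrative:

# Aliases per entity in the join and the known column equivalences.
entity_to_alias = {"events": {"e"}, "groupedmessage": {"g"}}
column_equivalence = {("events", "project_id"): [("groupedmessage", "project_id")]}

# A top-level condition that references a single column: e.project_id = 123
alias, column, op, value = ("e", "project_id", "=", 123)

conditions_to_add = []
for eq_entity, eq_column in column_equivalence.get(("events", column), []):
    for eq_alias in entity_to_alias[eq_entity]:
        # Copy the condition onto the alias that owns the equivalent column.
        conditions_to_add.append((eq_alias, eq_column, op, value))

assert conditions_to_add == [("g", "project_id", "=", 123)]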
Example #20
def array_join_col(ops=None, groups=None, op_groups=None):
    conditions: List[Expression] = []

    argument_name = "arg"
    argument = Argument(None, argument_name)

    if ops:
        conditions.append(
            binary_condition(
                ConditionFunctions.IN,
                tupleElement(None, argument, Literal(None, 1)),
                FunctionCall(None, "tuple",
                             tuple(Literal(None, op) for op in ops)),
            ))

    if groups:
        conditions.append(
            binary_condition(
                ConditionFunctions.IN,
                tupleElement(None, argument, Literal(None, 2)),
                FunctionCall(None, "tuple",
                             tuple(Literal(None, group) for group in groups)),
            ))

    if op_groups:
        conditions.append(
            binary_condition(
                ConditionFunctions.IN,
                FunctionCall(
                    None,
                    "tuple",
                    (
                        tupleElement(None, argument, Literal(None, 1)),
                        tupleElement(None, argument, Literal(None, 2)),
                    ),
                ),
                FunctionCall(
                    None,
                    "tuple",
                    tuple(
                        FunctionCall(None, "tuple", (Literal(None, op),
                                                     Literal(None, group)))
                        for op, group in op_groups),
                ),
            ))

    cols = FunctionCall(
        None,
        "arrayMap",
        (
            Lambda(
                None,
                ("x", "y", "z"),
                FunctionCall(
                    None, "tuple",
                    tuple(Argument(None, arg) for arg in ("x", "y", "z"))),
            ),
            Column(None, None, "spans.op"),
            Column(None, None, "spans.group"),
            Column(None, None, "spans.exclusive_time"),
        ),
    )

    if conditions:
        cols = FunctionCall(
            None,
            "arrayFilter",
            (
                Lambda(None,
                       (argument_name, ), combine_and_conditions(conditions)),
                cols,
            ),
        )

    return arrayJoin("snuba_all_spans", cols)
Example #21
    def process_query(self, query: Query,
                      query_settings: QuerySettings) -> None:
        max_prewhere_conditions: int = (self.__max_prewhere_conditions
                                        or settings.MAX_PREWHERE_CONDITIONS)
        prewhere_keys = self.__prewhere_candidates

        # We remove the candidates that appear in uniq or -If aggregations
        # because a query like `countIf(col=x) .. PREWHERE col=x` can make
        # the Clickhouse server crash.
        uniq_cols: Set[str] = set()
        expressions = query.get_all_expressions()
        for exp in expressions:
            if isinstance(exp,
                          FunctionCall) and (exp.function_name == "uniq" or
                                             exp.function_name.endswith("If")):
                columns = get_columns_in_expression(exp)
                for c in columns:
                    uniq_cols.add(c.column_name)

        for col in uniq_cols:
            if col in prewhere_keys:
                metrics.increment(
                    "uniq_col_in_prewhere_candidate",
                    tags={
                        "column": col,
                        "referrer": query_settings.referrer
                    },
                )

        prewhere_keys = [key for key in prewhere_keys if key not in uniq_cols]

        # In case the query is final we cannot simply add any candidate
        # condition to the prewhere.
        # Final is applied after prewhere, so there are cases where moving
        # conditions to the prewhere could exclude rows from the result set
        # that would otherwise be merged under the `final` condition.
        # Example: rewriting the group_id on an unmerge. If the group_id
        # is in the prewhere, final will fail at merging the rows.
        # HACK: If query has final, do not move any condition on a column in the
        # omit_if_final list to prewhere.
        # There is a bug in ClickHouse affecting queries with FINAL and PREWHERE
        # with Low Cardinality and Nullable columns.
        # https://github.com/ClickHouse/ClickHouse/issues/16171
        if query.get_from_clause().final and self.__omit_if_final:
            prewhere_keys = [
                key for key in prewhere_keys if key not in self.__omit_if_final
            ]

        if not prewhere_keys:
            return

        ast_condition = query.get_condition()
        if ast_condition is None:
            return

        prewhere_candidates = [
            (get_columns_in_expression(cond), cond)
            for cond in get_first_level_and_conditions(ast_condition)
            if isinstance(cond, FunctionCall)
            and cond.function_name in ALLOWED_OPERATORS and any(
                col.column_name in prewhere_keys
                for col in get_columns_in_expression(cond))
        ]
        if not prewhere_candidates:
            return

        # Use the condition that has the highest priority (based on the
        # position of its columns in the prewhere keys list)
        sorted_candidates = sorted(
            [(
                min(
                    prewhere_keys.index(col.column_name)
                    for col in cols if col.column_name in prewhere_keys),
                cond,
            ) for cols, cond in prewhere_candidates],
            key=lambda priority_and_col: priority_and_col[0],
        )
        prewhere_conditions = [cond for _, cond in sorted_candidates
                               ][:max_prewhere_conditions]

        new_conditions = [
            cond for cond in get_first_level_and_conditions(ast_condition)
            if cond not in prewhere_conditions
        ]

        query.set_ast_condition(
            combine_and_conditions(new_conditions) if new_conditions else None)
        query.set_prewhere_ast_condition(
            combine_and_conditions(prewhere_conditions
                                   ) if prewhere_conditions else None)
Example #22
                                     None)),
         [
             JoinCondition(
                 JoinConditionExpression("ev", "event_id"),
                 JoinConditionExpression("ev2", "event_id"),
             )
         ],
         JoinType.INNER,
         None,
     ),
     combine_and_conditions([
         binary_condition(
             ConditionFunctions.EQ,
             Column(None, "ev", "event_id"),
             Literal(None, 1),
         ),
         binary_condition(
             ConditionFunctions.EQ,
             Column(None, "ev2", "event_id"),
             Literal(None, 1),
         ),
     ]),
     id="Self join. Duplicate condition",
 ),
 pytest.param(
     binary_condition(ConditionFunctions.EQ, Column(None, "ev",
                                                    "project_id"),
                      Literal(None, 1)),
     ENTITY_GROUP_JOIN,
     combine_and_conditions([
         binary_condition(
             ConditionFunctions.EQ,
Example #23
 def and_builder(expressions: Sequence[Expression]) -> Optional[Expression]:
     if not expressions:
         return None
     return combine_and_conditions(expressions)
Example #24
    def _get_condition_without_redundant_checks(
        self, condition: Expression, query: Query
    ) -> Expression:
        """Optimizes the case where the query condition contains the following:

        valueOf('my_tag') != '' AND valueOf('my_tag') == "something"
                          ^                            ^
                          |                            |
                      existence check               value check

        the existence check in this clause is redundant and prevents the hashmap
        optimization from being applied.

        This function will remove all tag existence checks
        from the condition IFF they are ANDed with a value check for the *same tag name*

        Side effects:
            This function works by flattening first level AND conditions to find clauses where
            existence checks and value checks are ANDed together. When the AND conditions are recombined,
            they are not guaranteed to be in the same structure (but are guaranteed to be functionally equivalent)

            Example:
                ┌───┐         ┌───┐
                │AND│         │AND│
                ├──┬┘         └┬──┤
                │  │           │  │
             ┌──┴┐ c           a ┌┴──┐
             │AND│    becomes    │AND│
             └┬─┬┘               ├──┬┘
              │ │                │  │
              a b                b  c
        """
        if not isinstance(condition, FunctionExpr):
            return condition
        elif condition.function_name == BooleanFunctions.OR:
            sub_conditions = get_first_level_or_conditions(condition)
            pruned_conditions = [
                self._get_condition_without_redundant_checks(c, query)
                for c in sub_conditions
            ]
            return combine_or_conditions(pruned_conditions)
        elif condition.function_name == BooleanFunctions.AND:
            sub_conditions = get_first_level_and_conditions(condition)
            tag_eq_match_strings = set()
            matched_tag_exists_conditions = {}
            for condition_id, cond in enumerate(sub_conditions):
                tag_exist_match = None
                for tag_exists_pattern in self.__tag_exists_patterns:
                    tag_exist_match = tag_exists_pattern.match(cond)
                    if tag_exist_match:
                        matched_tag_exists_conditions[condition_id] = tag_exist_match
                if not tag_exist_match:
                    eq_match = self.__optimizable_pattern.match(cond)
                    if eq_match:
                        tag_eq_match_strings.add(eq_match.string(KEY_MAPPING_PARAM))
            useful_conditions = []
            for condition_id, cond in enumerate(sub_conditions):
                tag_exist_match = matched_tag_exists_conditions.get(condition_id, None)
                if tag_exist_match:
                    requested_tag = tag_exist_match.string("key")
                    if requested_tag in tag_eq_match_strings:
                        # the clause is redundant, thus we continue the loop
                        # and do not add it to useful_conditions
                        continue
                useful_conditions.append(
                    self._get_condition_without_redundant_checks(cond, query)
                )
            return combine_and_conditions(useful_conditions)
        else:
            return condition
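A compact sketch of the pruning rule itself, on a toy representation where each first-level AND condition is tagged as either an existence check or a value check for a tag name. The real processor matches AST patterns via __tag_exists_patterns and __optimizable_pattern; the tag names below are invented for the example.

from typing import List, Tuple

conditions: List[Tuple[str, str]] = [
    ("exists", "my_tag"),     # valueOf('my_tag') != ''
    ("value", "my_tag"),      # valueOf('my_tag') = 'something'
    ("exists", "other_tag"),  # kept: no value check on the same tag
]

value_checked_tags = {tag for kind, tag in conditions if kind == "value"}
useful_conditions = [
    (kind, tag)
    for kind, tag in conditions
    # An existence check is redundant when a value check on the same tag
    # is ANDed with it at the same level.
    if not (kind == "exists" and tag in value_checked_tags)
]
assert useful_conditions == [("value", "my_tag"), ("exists", "other_tag")]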