Example #1
        def process_column(exp: Expression) -> Expression:
            match = matcher.match(exp)

            if match:
                inner = replace(exp, alias=None)
                return FunctionCallExpr(
                    exp.alias,
                    "if",
                    (
                        binary_condition(
                            ConditionFunctions.IN,
                            inner,
                            literals_tuple(
                                None,
                                [
                                    LiteralExpr(None, "1"),
                                    LiteralExpr(None, "True")
                                ],
                            ),
                        ),
                        LiteralExpr(None, "True"),
                        LiteralExpr(None, "False"),
                    ),
                )

            return exp
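Every example in this section builds tuples of literals with a `literals_tuple` helper. A minimal sketch of what these snippets assume it does, wrapping a sequence of `Literal` expressions in a ClickHouse `tuple()` call (a reimplementation for illustration; the real helper ships with Snuba's query DSL, and the import path below is an assumption):

from typing import Optional, Sequence

from snuba.query.expressions import FunctionCall, Literal


def literals_tuple(alias: Optional[str], literals: Sequence[Literal]) -> FunctionCall:
    # Renders as tuple(lit_1, ..., lit_n) in ClickHouse SQL.
    return FunctionCall(alias, "tuple", tuple(literals))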
Example #2
def test_not_in_condition() -> None:
    not_in_condition = binary_condition(
        ConditionFunctions.NOT_IN,
        Column(None, None, "tags_key"),
        literals_tuple(None, [Literal(None, "t1"), Literal(None, "t2")]),
    )
    assert is_not_in_condition(not_in_condition)

    match = is_not_in_condition_pattern(ColumnPattern(None, String("tags_key"))).match(
        not_in_condition
    )
    assert match is not None
    assert match.expression("tuple") == literals_tuple(
        None, [Literal(None, "t1"), Literal(None, "t2")]
    )
    assert match.expression("lhs") == Column(None, None, "tags_key")
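For reference, a sketch of how a matcher like `is_not_in_condition_pattern` could be assembled from snuba-style pattern classes. The `Param` wrappers are what make `match.expression("lhs")` and `match.expression("tuple")` work above. This is an illustrative reconstruction, not the verbatim library code, and the import paths are assumptions:

from snuba.query.conditions import ConditionFunctions
from snuba.query.expressions import Expression
from snuba.query.matchers import FunctionCall as FunctionCallPattern
from snuba.query.matchers import Param, Pattern, String


def not_in_condition_pattern_sketch(lhs: Pattern[Expression]) -> FunctionCallPattern:
    return FunctionCallPattern(
        String(ConditionFunctions.NOT_IN),
        (
            Param("lhs", lhs),  # captured and exposed as match.expression("lhs")
            Param("tuple", FunctionCallPattern(String("tuple"), None)),  # the RHS tuple
        ),
    )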
Example #3
def test_processing_functions() -> None:
    in_condition = binary_condition(
        None,
        ConditionFunctions.IN,
        Column(None, "tag_keys", None),
        literals_tuple(None, [Literal(None, "t1"), Literal(None, "t2")]),
    )
    assert is_in_condition(in_condition)

    eq_condition = binary_condition(
        None, ConditionFunctions.EQ, Column(None, "test", None), Literal(None, "1")
    )
    assert is_binary_condition(eq_condition, ConditionFunctions.EQ)
    assert not is_binary_condition(eq_condition, ConditionFunctions.NEQ)
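A plausible sketch of the `is_binary_condition` check exercised by the last two asserts (an assumption inferred from its usage, not the verbatim implementation): it accepts any two-argument call to the given condition function.

from snuba.query.expressions import Expression, FunctionCall


def is_binary_condition_sketch(expression: Expression, operator: str) -> bool:
    # True for expressions of the form operator(lhs, rhs).
    return (
        isinstance(expression, FunctionCall)
        and expression.function_name == operator
        and len(expression.parameters) == 2
    )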
Example #4
def replace_exp(exp: Expression) -> Expression:
    if matcher.match(exp) is not None:
        inner = replace(exp, alias=None)
        return FunctionCallExpr(
            exp.alias,
            "multiIf",
            (
                binary_condition(ConditionFunctions.EQ, inner, Literal(None, "")),
                Literal(None, ""),
                binary_condition(
                    ConditionFunctions.IN,
                    inner,
                    literals_tuple(None, [Literal(None, "1"), Literal(None, "True")]),
                ),
                Literal(None, "True"),
                Literal(None, "False"),
            ),
        )
    return exp
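Compared with Example #1, this variant uses `multiIf` to short-circuit empty strings (e.g. an unset context value rendered as '') before the boolean mapping, reading roughly as multiIf(equals(col, ''), '', in(col, tuple('1', 'True')), 'True', 'False') in ClickHouse SQL. A processor would typically apply it across the whole query; a one-line sketch, assuming Snuba's standard `transform_expressions` hook:

# Hypothetical wiring: rewrite every matching expression in place.
query.transform_expressions(replace_exp)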
Example #5
def some_tuple(alias: str | None):
    return literals_tuple(alias,
                          [Literal(None, "duration"),
                           Literal(None, 300)])
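For illustration, the resulting tuple('duration', 300) can serve as the right-hand side of an IN condition. A hedged usage sketch, using the alias-free `binary_condition` signature from Example #2 and a made-up column name:

condition = binary_condition(
    ConditionFunctions.IN,
    Column(None, None, "tag_value"),  # hypothetical column
    some_tuple(None),
)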
Example #6
def __set_condition(alias: Optional[str], function: str, lhs: Expression,
                    rhs: Sequence[Literal]) -> Expression:
    return binary_condition(alias, function, lhs, literals_tuple(None, rhs))
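A hypothetical call site for the helper (names and values made up; the Column argument order follows the newer alias/table/column convention used in Example #2):

condition = __set_condition(
    None,
    ConditionFunctions.IN,
    Column(None, None, "project_id"),  # hypothetical column
    [Literal(None, 1), Literal(None, 2)],  # hypothetical values
)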
Example #7
def test_events_boolean_context() -> None:
    columns = ColumnSet(
        [("contexts", Nested([("key", String()), ("value", String())]))]
    )
    query = ClickhouseQuery(
        Table("errors", columns),
        selected_columns=[
            SelectedExpression(
                "contexts[device.charging]",
                FunctionCall(
                    "contexts[device.charging]",
                    "arrayElement",
                    (
                        Column(None, None, "contexts.value"),
                        FunctionCall(
                            None,
                            "indexOf",
                            (
                                Column(None, None, "contexts.key"),
                                Literal(None, "device.charging"),
                            ),
                        ),
                    ),
                ),
            )
        ],
    )

    expected = ClickhouseQuery(
        Table("errors", columns),
        selected_columns=[
            SelectedExpression(
                "contexts[device.charging]",
                FunctionCall(
                    "contexts[device.charging]",
                    "if",
                    (
                        binary_condition(
                            ConditionFunctions.IN,
                            FunctionCall(
                                None,
                                "arrayElement",
                                (
                                    Column(None, None, "contexts.value"),
                                    FunctionCall(
                                        None,
                                        "indexOf",
                                        (
                                            Column(None, None, "contexts.key"),
                                            Literal(None, "device.charging"),
                                        ),
                                    ),
                                ),
                            ),
                            literals_tuple(
                                None, [Literal(None, "1"), Literal(None, "True")]
                            ),
                        ),
                        Literal(None, "True"),
                        Literal(None, "False"),
                    ),
                ),
            )
        ],
    )

    settings = HTTPQuerySettings()
    EventsBooleanContextsProcessor().process_query(query, settings)

    assert query.get_selected_columns() == expected.get_selected_columns()
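A minimal sketch of the shape such a processor can take (hypothetical scaffolding; the real EventsBooleanContextsProcessor ships with Snuba), reusing process_column from Example #1 together with the standard transform_expressions hook:

class BooleanContextsProcessorSketch:
    def process_query(self, query: ClickhouseQuery, settings: HTTPQuerySettings) -> None:
        # Rewrite every matching context access into the if(in(...)) form.
        query.transform_expressions(process_column)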
Example #8
def test_events_promoted_boolean_context() -> None:
    columns = ColumnSet(
        [
            ("device_charging", UInt(8, Modifier(nullable=True))),
            ("contexts", Nested([("key", String()), ("value", String())])),
        ]
    )
    query = ClickhouseQuery(
        Table("events", columns),
        selected_columns=[
            SelectedExpression(
                "contexts[device.charging]",
                FunctionCall(
                    "contexts[device.charging]",
                    "arrayElement",
                    (
                        Column(None, None, "contexts.value"),
                        FunctionCall(
                            None,
                            "indexOf",
                            (
                                Column(None, None, "contexts.key"),
                                Literal(None, "device.charging"),
                            ),
                        ),
                    ),
                ),
            )
        ],
    )

    expected = ClickhouseQuery(
        Table("events", columns),
        selected_columns=[
            SelectedExpression(
                "contexts[device.charging]",
                FunctionCall(
                    "contexts[device.charging]",
                    "if",
                    (
                        binary_condition(
                            ConditionFunctions.IN,
                            FunctionCall(
                                None,
                                "toString",
                                (Column(None, None, "device_charging"),),
                            ),
                            literals_tuple(
                                None, [Literal(None, "1"), Literal(None, "True")]
                            ),
                        ),
                        Literal(None, "True"),
                        Literal(None, "False"),
                    ),
                ),
            )
        ],
    )

    settings = HTTPQuerySettings()
    MappingColumnPromoter(
        {"contexts": {"device.charging": "device_charging"}}, cast_to_string=True
    ).process_query(query, settings)
    EventsPromotedBooleanContextsProcessor().process_query(query, settings)

    assert query.get_selected_columns() == expected.get_selected_columns()
Example #9
    def execute(
        self,
        query: Query,
        request_settings: RequestSettings,
        runner: SplitQueryRunner,
    ) -> Optional[QueryResult]:
        """
        Split the query in two steps if a large number of columns is being selected.
            - First query only selects event_id, project_id and timestamp.
            - Second query selects all fields for only those events.
            - Shrink the date range.
        """
        limit = query.get_limit()
        if (limit is None or limit == 0 or query.get_groupby()
                or query.get_aggregations()
                or not query.get_selected_columns()):
            return None

        if limit > settings.COLUMN_SPLIT_MAX_LIMIT:
            metrics.increment("column_splitter.query_above_limit")
            return None

        # Do not split if there is already a = or IN condition on an ID column
        id_column_matcher = FunctionCall(
            Or([String(ConditionFunctions.EQ),
                String(ConditionFunctions.IN)]),
            (
                Column(None, String(self.__id_column)),
                AnyExpression(),
            ),
        )

        for expr in query.get_condition_from_ast() or []:
            match = id_column_matcher.match(expr)

            if match:
                return None

        # We need to count the number of table/column name pairs,
        # not the number of distinct Column objects in the query,
        # so as to avoid counting aliased columns multiple times.
        total_columns = {(col.table_name, col.column_name)
                         for col in query.get_all_ast_referenced_columns()}

        minimal_query = copy.deepcopy(query)
        minimal_query.set_selected_columns(
            [self.__id_column, self.__project_column, self.__timestamp_column])
        # TODO: provide the table alias name to this splitter if we ever use it
        # in joins.
        minimal_query.set_ast_selected_columns([
            SelectedExpression(self.__id_column,
                               ColumnExpr(None, None, self.__id_column)),
            SelectedExpression(self.__project_column,
                               ColumnExpr(None, None, self.__project_column)),
            SelectedExpression(
                self.__timestamp_column,
                ColumnExpr(None, None, self.__timestamp_column),
            ),
        ])

        for exp in minimal_query.get_all_expressions():
            if exp.alias in (
                    self.__id_column,
                    self.__project_column,
                    self.__timestamp_column,
            ) and not (isinstance(exp, ColumnExpr)
                       and exp.column_name == exp.alias):
                logger.warning(
                    "Potential alias shadowing due to column splitter",
                    extra={"expression": exp},
                    exc_info=True,
                )

        minimal_columns = {
            (col.table_name, col.column_name)
            for col in minimal_query.get_all_ast_referenced_columns()
        }
        if len(total_columns) <= len(minimal_columns):
            return None

        # Ensures the AST minimal query is actually runnable on its own.
        if not minimal_query.validate_aliases():
            return None

        legacy_references = set(minimal_query.get_all_referenced_columns())
        ast_column_names = {
            c.column_name
            for c in minimal_query.get_all_ast_referenced_columns()
        }
        # Ensures the legacy minimal query (which does not expand alias references)
        # does not contain alias references we removed when creating minimal_query.
        if legacy_references - ast_column_names:
            metrics.increment("columns.skip_invalid_legacy_query")
            return None

        result = runner(minimal_query, request_settings)
        del minimal_query

        if not result.result["data"]:
            return None

        # Work on a copy so the original query stays intact in case this splitter
        # returns None (the execution strategy would then ignore this splitter
        # and try the next one).
        query = copy.deepcopy(query)

        event_ids = list(
            set([event[self.__id_column] for event in result.result["data"]]))
        if len(event_ids) > settings.COLUMN_SPLIT_MAX_RESULTS:
            # We may be running a query that is beyond the ClickHouse maximum
            # query size, so we cowardly abandon the split.
            metrics.increment(
                "column_splitter.intermediate_results_beyond_limit")
            return None

        query.add_conditions([(self.__id_column, "IN", event_ids)])
        query.add_condition_to_ast(
            in_condition(
                None,
                ColumnExpr(None, None, self.__id_column),
                [LiteralExpr(None, e_id) for e_id in event_ids],
            ))
        query.set_offset(0)
        # TODO: This is technically wrong. Event ids are unique per project, not globally.
        # So, if the minimal query only returned the same event_id from two projects, we
        # would be underestimating the limit here.
        query.set_limit(len(event_ids))

        project_ids = list(
            set([
                event[self.__project_column] for event in result.result["data"]
            ]))
        _replace_condition(
            query,
            self.__project_column,
            "IN",
            project_ids,
        )
        _replace_ast_condition(
            query,
            self.__project_column,
            "IN",
            literals_tuple(None,
                           [LiteralExpr(None, p_id) for p_id in project_ids]),
        )

        timestamps = [
            event[self.__timestamp_column] for event in result.result["data"]
        ]
        _replace_condition(
            query,
            self.__timestamp_column,
            ">=",
            util.parse_datetime(min(timestamps)).isoformat(),
        )
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            ">=",
            LiteralExpr(None, util.parse_datetime(min(timestamps))),
        )
        # We add 1 second since this gets translated to ('timestamp', '<', to_date)
        # and events are stored with a granularity of 1 second.
        _replace_condition(
            query,
            self.__timestamp_column,
            "<",
            (util.parse_datetime(max(timestamps)) +
             timedelta(seconds=1)).isoformat(),
        )
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            "<",
            LiteralExpr(
                None,
                (util.parse_datetime(max(timestamps)) + timedelta(seconds=1)),
            ),
        )

        return runner(query, request_settings)
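The `_replace_ast_condition` helper is not shown in this snippet. A plausible sketch of what it does, inferred from the call sites above (an assumption, not the verbatim helper): it reuses the matcher classes already imported here and swaps the right-hand operand of every top-level condition on the given column.

from dataclasses import replace as dc_replace

from snuba.query.conditions import (
    combine_and_conditions,
    get_first_level_and_conditions,
)


def _replace_ast_condition_sketch(
    query: Query, field: str, operator: str, new_operand: Expression
) -> None:
    matcher = FunctionCall(
        String(operator),
        (Column(None, String(field)), AnyExpression()),
    )

    def replace_condition(expression: Expression) -> Expression:
        match = matcher.match(expression)
        if match is None:
            return expression
        # Keep the left-hand side, swap in the new right-hand operand.
        return dc_replace(expression, parameters=(expression.parameters[0], new_operand))

    condition = query.get_condition_from_ast()
    if condition is not None:
        query.set_ast_condition(
            combine_and_conditions(
                [replace_condition(c) for c in get_first_level_and_conditions(condition)]
            )
        )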
Example #10
    def execute(
        self,
        query: Query,
        query_settings: QuerySettings,
        runner: SplitQueryRunner,
    ) -> Optional[QueryResult]:
        """
        Split the query in two steps if a large number of columns is being selected.
            - First query only selects event_id, project_id and timestamp.
            - Second query selects all fields for only those events.
            - Shrink the date range.
        """
        limit = query.get_limit()
        if (limit is None or limit == 0 or query.get_groupby()
                or not query.get_selected_columns()):
            return None

        if limit > settings.COLUMN_SPLIT_MAX_LIMIT:
            metrics.increment("column_splitter.query_above_limit")
            return None

        # Do not split if there is already a = or IN condition on an ID column
        id_column_matcher = FunctionCall(
            Or([String(ConditionFunctions.EQ),
                String(ConditionFunctions.IN)]),
            (
                Column(None, String(self.__id_column)),
                AnyExpression(),
            ),
        )

        for expr in query.get_condition() or []:
            match = id_column_matcher.match(expr)

            if match:
                return None

        # We need to count the number of table/column name pairs,
        # not the number of distinct Column objects in the query,
        # so as to avoid counting aliased columns multiple times.
        selected_columns = {
            (col.table_name, col.column_name)
            for col in query.get_columns_referenced_in_select()
        }

        if len(selected_columns) < settings.COLUMN_SPLIT_MIN_COLS:
            metrics.increment("column_splitter.main_query_min_threshold")
            return None

        minimal_query = copy.deepcopy(query)

        # TODO: provide the table alias name to this splitter if we ever use it
        # in joins.
        minimal_query.set_ast_selected_columns([
            SelectedExpression(
                self.__id_column,
                ColumnExpr(self.__id_column, None, self.__id_column),
            ),
            SelectedExpression(
                self.__project_column,
                ColumnExpr(self.__project_column, None, self.__project_column),
            ),
            SelectedExpression(
                self.__timestamp_column,
                ColumnExpr(self.__timestamp_column, None,
                           self.__timestamp_column),
            ),
        ])

        for exp in minimal_query.get_all_expressions():
            if exp.alias in (
                    self.__id_column,
                    self.__project_column,
                    self.__timestamp_column,
            ) and not (isinstance(exp, ColumnExpr)
                       and exp.column_name == exp.alias):
                logger.warning(
                    "Potential alias shadowing due to column splitter",
                    extra={"expression": exp},
                    exc_info=True,
                )

        # Ensures the AST minimal query is actually runnable on its own.
        if not minimal_query.validate_aliases():
            return None

        # There is a Clickhouse bug where if functions in the ORDER BY clause are not in the SELECT,
        # they fail on distributed tables. For that specific case, skip the query splitter.
        for orderby in minimal_query.get_orderby():
            if isinstance(orderby.expression,
                          (FunctionCallExpr, CurriedFunctionCallExpr)):
                metrics.increment("column_splitter.orderby_has_a_function")
                return None

        result = runner(minimal_query, query_settings)
        del minimal_query

        if not result.result["data"]:
            metrics.increment("column_splitter.no_data_from_minimal_query")
            return None

        # Work on a copy so the original query stays intact in case this splitter
        # returns None (the execution strategy would then ignore this splitter
        # and try the next one).
        query = copy.deepcopy(query)

        event_ids = list(
            set([event[self.__id_column] for event in result.result["data"]]))
        if len(event_ids) > settings.COLUMN_SPLIT_MAX_RESULTS:
            # We may be running a query that is beyond the ClickHouse maximum
            # query size, so we cowardly abandon the split.
            metrics.increment(
                "column_splitter.intermediate_results_beyond_limit")
            return None

        query.add_condition_to_ast(
            in_condition(
                ColumnExpr(None, None, self.__id_column),
                [LiteralExpr(None, e_id) for e_id in event_ids],
            ))
        query.set_offset(0)
        query.set_limit(len(result.result["data"]))

        project_ids = list(
            set([
                event[self.__project_column] for event in result.result["data"]
            ]))
        _replace_ast_condition(
            query,
            self.__project_column,
            "IN",
            literals_tuple(None,
                           [LiteralExpr(None, p_id) for p_id in project_ids]),
        )

        timestamps = [
            event[self.__timestamp_column] for event in result.result["data"]
        ]
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            ">=",
            LiteralExpr(None, util.parse_datetime(min(timestamps))),
        )
        # We add 1 second since this gets translated to ('timestamp', '<', to_date)
        # and events are stored with a granularity of 1 second.
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            "<",
            LiteralExpr(
                None,
                (util.parse_datetime(max(timestamps)) + timedelta(seconds=1)),
            ),
        )

        return runner(query, query_settings)
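Unlike Example #9, this snapshot calls `in_condition` without an alias argument. A sketch of the assumed helper, composed from `binary_condition` and `literals_tuple` (import paths are assumptions):

from typing import Sequence

from snuba.query.conditions import ConditionFunctions, binary_condition
from snuba.query.dsl import literals_tuple
from snuba.query.expressions import Expression, FunctionCall, Literal


def in_condition(lhs: Expression, rhs: Sequence[Literal]) -> FunctionCall:
    # Renders as in(lhs, tuple(...)) in ClickHouse SQL.
    return binary_condition(ConditionFunctions.IN, lhs, literals_tuple(None, rhs))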
Example #11
def test_events_boolean_context() -> None:
    columns = ColumnSet([
        ("device_charging", Nullable(UInt(8))),
        ("contexts", Nested([("key", String()), ("value", String())])),
    ])
    query = ClickhouseQuery(
        LogicalQuery(
            {},
            TableSource("events", columns),
            selected_columns=[
                SelectedExpression(
                    "contexts[device.charging]",
                    FunctionCall(
                        "contexts[device.charging]",
                        "arrayElement",
                        (
                            Column(None, None, "contexts.value"),
                            FunctionCall(
                                None,
                                "indexOf",
                                (
                                    Column(None, None, "contexts.key"),
                                    Literal(None, "device.charging"),
                                ),
                            ),
                        ),
                    ),
                )
            ],
        ))

    expected = ClickhouseQuery(
        LogicalQuery(
            {},
            TableSource("events", columns),
            selected_columns=[
                SelectedExpression(
                    "contexts[device.charging]",
                    FunctionCall(
                        "contexts[device.charging]",
                        "multiIf",
                        (
                            binary_condition(
                                None,
                                ConditionFunctions.EQ,
                                FunctionCall(
                                    None,
                                    "toString",
                                    (Column(None, None, "device_charging"),),
                                ),
                                Literal(None, ""),
                            ),
                            Literal(None, ""),
                            binary_condition(
                                None,
                                ConditionFunctions.IN,
                                FunctionCall(
                                    None,
                                    "toString",
                                    (Column(None, None, "device_charging"),),
                                ),
                                literals_tuple(None, [
                                    Literal(None, "1"),
                                    Literal(None, "True")
                                ]),
                            ),
                            Literal(None, "True"),
                            Literal(None, "False"),
                        ),
                    ),
                )
            ],
        ))

    settings = HTTPRequestSettings()
    MappingColumnPromoter({
        "contexts": {
            "device.charging": "device_charging"
        }
    }).process_query(query, settings)
    EventsBooleanContextsProcessor().process_query(query, settings)

    assert (query.get_selected_columns_from_ast() ==
            expected.get_selected_columns_from_ast())
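Finally, note that the older snapshots in this section (Examples #3 and #11) pass the alias as the first argument to `binary_condition`, while the newer ones drop it. A sketch of the earlier signature, for reading the snippets side by side:

from typing import Optional

from snuba.query.expressions import Expression, FunctionCall


def binary_condition(
    alias: Optional[str], function_name: str, lhs: Expression, rhs: Expression
) -> FunctionCall:
    # Earlier signature: the alias is threaded through explicitly; newer
    # snapshots omit the parameter entirely.
    return FunctionCall(alias, function_name, (lhs, rhs))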