Exemplo n.º 1
0
def test_format_expressions(
    name: str, query: ClickhouseQuery, expected_query: ClickhouseQuery
) -> None:
    MappingColumnPromoter({"tags": {"promoted_tag": "promoted"}}).process_query(
        query, HTTPQuerySettings()
    )

    assert query.get_selected_columns() == expected_query.get_selected_columns()
Exemplo n.º 2
0
def test_hexint_column_processor(unprocessed: Expression,
                                 formatted_value: str) -> None:
    unprocessed_query = Query(
        Table("transactions", ColumnSet([])),
        selected_columns=[
            SelectedExpression("column1", Column(None, None, "column1"))
        ],
        condition=unprocessed,
    )

    HexIntColumnProcessor(set(["column1"
                               ])).process_query(unprocessed_query,
                                                 HTTPQuerySettings())
    assert unprocessed_query.get_selected_columns() == [
        SelectedExpression(
            "column1",
            FunctionCall(
                None,
                "lower",
                (FunctionCall(
                    None,
                    "hex",
                    (Column(None, None, "column1"), ),
                ), ),
            ),
        )
    ]

    condition = unprocessed_query.get_condition()
    assert condition is not None
    ret = condition.accept(ClickhouseExpressionFormatter())
    assert ret == formatted_value
def test_uuid_array_column_processor(
    unprocessed: Expression,
    expected: Expression,
    formatted_value: str,
) -> None:
    unprocessed_query = Query(
        Table("transactions", ColumnSet([])),
        selected_columns=[
            SelectedExpression("column2", Column(None, None, "column2"))
        ],
        condition=unprocessed,
    )
    expected_query = Query(
        Table("transactions", ColumnSet([])),
        selected_columns=[
            SelectedExpression("column2", Column(None, None, "column2"))
        ],
        condition=expected,
    )

    FixedStringArrayColumnProcessor(set(["column1", "column2"]),
                                    32).process_query(unprocessed_query,
                                                      HTTPQuerySettings())
    assert unprocessed_query.get_selected_columns() == [
        SelectedExpression(
            "column2",
            Column(None, None, "column2"),
        )
    ]

    assert expected_query.get_condition() == unprocessed_query.get_condition()
    condition = unprocessed_query.get_condition()
    assert condition is not None
    ret = condition.accept(ClickhouseExpressionFormatter())
    assert ret == formatted_value
Exemplo n.º 4
0
 def query_runner(query: Query, settings: RequestSettings,
                  reader: Reader) -> QueryResult:
     assert query.get_selected_columns() == [
         SelectedExpression(
             "org_id",
             Column("_snuba_org_id", None, "org_id"),
         ),
         SelectedExpression(
             "project_id",
             Column("_snuba_project_id", None, "project_id"),
         ),
         SelectedExpression(
             "tags[10]",
             FunctionCall(
                 "_snuba_tags[10]",
                 "arrayElement",
                 (
                     Column(None, None, "tags.value"),
                     FunctionCall(
                         None,
                         "indexOf",
                         (Column(None, None, "tags.key"), Literal(None,
                                                                  10)),
                     ),
                 ),
             ),
         ),
         SelectedExpression(
             column_name,
             translated_value,
         ),
     ]
     return QueryResult({}, {})
Exemplo n.º 5
0
 def query_runner(query: Query, settings: QuerySettings,
                  reader: Reader) -> QueryResult:
     assert query.get_selected_columns() == [
         SelectedExpression(
             "tags[transaction]",
             Column("_snuba_tags[transaction]", None, "transaction_name"),
         ),
         SelectedExpression(
             "contexts[browser.name]",
             FunctionCall(
                 "_snuba_contexts[browser.name]",
                 "arrayElement",
                 (
                     Column(None, None, "contexts.value"),
                     FunctionCall(
                         None,
                         "indexOf",
                         (
                             Column(None, None, "contexts.key"),
                             Literal(None, "browser.name"),
                         ),
                     ),
                 ),
             ),
         ),
     ]
     return QueryResult({}, {})
def get_filtered_mapping_keys(query: Query, column_name: str) -> Sequence[str]:
    """
    Identifies the conditions we can apply the arrayFilter optimization
    on.
    Which means: if the arrayJoin is in the select clause, there
    are one or more top level AND condition on the arrayJoin and
    there is no OR condition in the query.
    """
    array_join_found = any(
        array_join_pattern(column_name).match(f) is not None
        for selected in query.get_selected_columns() or []
        for f in selected.expression)

    if not array_join_found:
        return list()

    ast_condition = query.get_condition()
    cond_keys = (_get_mapping_keys_in_condition(ast_condition, column_name)
                 if ast_condition is not None else set())
    if cond_keys is None:
        # This means we found an OR. Cowardly we give up even though there could
        # be cases where this condition is still optimizable.
        return []

    ast_having = query.get_having()
    having_keys = (_get_mapping_keys_in_condition(ast_having, column_name)
                   if ast_having is not None else set())
    if having_keys is None:
        # Same as above
        return []

    keys = cond_keys | having_keys
    return sorted(list(keys))
Exemplo n.º 7
0
 def process_query(self, query: Query,
                   query_settings: QuerySettings) -> None:
     having_clause = query.get_having()
     if not having_clause:
         return None
     selected_columns = query.get_selected_columns()
     uniq_matcher = Param("function", FunctionCallMatch(String("uniq")))
     found_functions = []
     for exp in having_clause:
         match = uniq_matcher.match(exp)
         if match is not None:
             found_functions.append(match.expression("function"))
     if found_functions is not None:
         matcher = _ExpressionOrAliasMatcher(found_functions)
         for col in selected_columns:
             col.expression.accept(matcher)
         if not all(matcher.found_expressions):
             should_throw = get_config("throw_on_uniq_select_and_having",
                                       False)
             error = MismatchedAggregationException(
                 "Aggregation is in HAVING clause but not SELECT",
                 query=str(query))
             if should_throw:
                 raise error
             else:
                 logging.warning(
                     "Aggregation is in HAVING clause but not SELECT",
                     exc_info=True,
                     extra=cast(Dict[str, Any], error.to_dict()),
                 )
Exemplo n.º 8
0
def test_events_column_format_expressions() -> None:
    unprocessed = Query(
        Table("events", ColumnSet([])),
        selected_columns=[
            SelectedExpression("dr_claw", Column("dr_claw", None, "culprit")),
            SelectedExpression("the_group_id",
                               Column("the_group_id", None, "group_id")),
            SelectedExpression("the_message",
                               Column("the_message", None, "message")),
        ],
    )
    expected_query = Query(
        Table("events", ColumnSet([])),
        selected_columns=[
            SelectedExpression("dr_claw", Column("dr_claw", None, "culprit")),
            SelectedExpression(
                "the_group_id",
                FunctionCall(
                    "the_group_id",
                    "nullIf",
                    (
                        Column(None, None, "group_id"),
                        Literal(None, 0),
                    ),
                ),
            ),
            SelectedExpression(
                "the_message",
                Column("the_message", None, "message"),
            ),
        ],
    )

    GroupIdColumnProcessor().process_query(unprocessed, HTTPRequestSettings())
    assert expected_query.get_selected_columns(
    ) == unprocessed.get_selected_columns()

    expected = (
        "(nullIf(group_id, 0) AS the_group_id)",
        "(message AS the_message)",
    )

    for idx, column in enumerate(unprocessed.get_selected_columns()[1:]):
        formatted = column.expression.accept(ClickhouseExpressionFormatter())
        assert expected[idx] == formatted
def test_event_id_column_format_expressions() -> None:
    unprocessed = Query(
        Table("events", ColumnSet([])),
        selected_columns=[
            SelectedExpression(
                "transaction.duration",
                Column("transaction.duration", None, "duration")),
            SelectedExpression("the_event_id",
                               Column("the_event_id", None, "event_id")),
        ],
    )
    expected = Query(
        Table("events", ColumnSet([])),
        selected_columns=[
            SelectedExpression(
                "transaction.duration",
                Column("transaction.duration", None, "duration")),
            SelectedExpression(
                "the_event_id",
                FunctionCall(
                    "the_event_id",
                    "replaceAll",
                    (
                        FunctionCall(
                            None,
                            "toString",
                            (Column(None, None, "event_id"), ),
                        ),
                        Literal(None, "-"),
                        Literal(None, ""),
                    ),
                ),
            ),
        ],
    )

    UUIDColumnProcessor({"event_id"}).process_query(unprocessed,
                                                    HTTPQuerySettings())
    assert expected.get_selected_columns() == unprocessed.get_selected_columns(
    )

    formatted = unprocessed.get_selected_columns()[1].expression.accept(
        ClickhouseExpressionFormatter())
    assert formatted == "(replaceAll(toString(event_id), '-', '') AS the_event_id)"
Exemplo n.º 10
0
def test_tags_processor(query_body: MutableMapping[str, Any],
                        expected_query: ClickhouseQuery) -> None:
    """
    Tests the whole processing in some notable cases.
    """
    processed = parse_and_process(query_body)
    assert processed.get_selected_columns(
    ) == expected_query.get_selected_columns()
    assert processed.get_condition() == expected_query.get_condition()
    assert processed.get_having() == expected_query.get_having()
Exemplo n.º 11
0
def test_uuid_array_column_processor(
    unprocessed: Expression,
    expected: Expression,
    formatted_value: str,
) -> None:
    unprocessed_query = Query(
        Table("transactions", ColumnSet([])),
        selected_columns=[
            SelectedExpression("column2", Column(None, None, "column2"))
        ],
        condition=unprocessed,
    )
    expected_query = Query(
        Table("transactions", ColumnSet([])),
        selected_columns=[
            SelectedExpression("column2", Column(None, None, "column2"))
        ],
        condition=expected,
    )

    UUIDArrayColumnProcessor(set(["column1", "column2"
                                  ])).process_query(unprocessed_query,
                                                    HTTPRequestSettings())
    assert unprocessed_query.get_selected_columns() == [
        SelectedExpression(
            "column2",
            FunctionCall(
                None,
                "arrayMap",
                (
                    Lambda(
                        None,
                        ("x", ),
                        FunctionCall(
                            None,
                            "replaceAll",
                            (
                                FunctionCall(None, "toString",
                                             (Argument(None, "x"), )),
                                Literal(None, "-"),
                                Literal(None, ""),
                            ),
                        ),
                    ),
                    Column(None, None, "column2"),
                ),
            ),
        )
    ]

    assert expected_query.get_condition() == unprocessed_query.get_condition()
    condition = unprocessed_query.get_condition()
    assert condition is not None
    ret = condition.accept(ClickhouseExpressionFormatter())
    assert ret == formatted_value
Exemplo n.º 12
0
def test_translation(mappers: TranslationMappers, query: SnubaQuery,
                     expected: ClickhouseQuery) -> None:
    translated = QueryTranslator(mappers).translate(query)

    # TODO: consider providing an __eq__ method to the Query class. Or turn it into
    # a dataclass.
    assert expected.get_selected_columns() == translated.get_selected_columns()
    assert expected.get_groupby() == translated.get_groupby()
    assert expected.get_condition() == translated.get_condition()
    assert expected.get_arrayjoin() == translated.get_arrayjoin()
    assert expected.get_having() == translated.get_having()
    assert expected.get_orderby() == translated.get_orderby()
Exemplo n.º 13
0
def test_spans_processor(
    query: ClickhouseQuery,
    expected_selected_columns: List[SelectedExpression],
    expected_conditions: Optional[Expression],
) -> None:
    query_settings = HTTPQuerySettings()
    bloom_filter_processor = BloomFilterOptimizer("spans", ["op", "group"],
                                                  ["exclusive_time"])
    bloom_filter_processor.process_query(query, query_settings)
    array_join_processor = ArrayJoinOptimizer("spans", ["op", "group"],
                                              ["exclusive_time"])
    array_join_processor.process_query(query, query_settings)
    assert query.get_selected_columns() == expected_selected_columns
    assert query.get_condition() == expected_conditions
Exemplo n.º 14
0
 def do_query(
     query: ClickhouseQuery,
     request_settings: RequestSettings,
     reader: Reader,
 ) -> QueryResult:
     selected_col_names = [
         c.expression.column_name
         for c in query.get_selected_columns() or []
         if isinstance(c.expression, Column)
     ]
     if selected_col_names == list(first_query_data[0].keys()):
         return QueryResult({"data": first_query_data}, {})
     elif selected_col_names == list(second_query_data[0].keys()):
         return QueryResult({"data": second_query_data}, {})
     else:
         raise ValueError(
             f"Unexpected selected columns: {selected_col_names}")
Exemplo n.º 15
0
 def query_runner(query: Query, settings: RequestSettings,
                  reader: Reader) -> QueryResult:
     quantiles = tuple(
         Literal(None, quant) for quant in [0.5, 0.75, 0.9, 0.95, 0.99, 1])
     assert query.get_selected_columns() == [
         SelectedExpression(
             "duration_quantiles",
             CurriedFunctionCall(
                 "_snuba_duration_quantiles",
                 FunctionCall(
                     None,
                     "quantilesIfMerge",
                     quantiles,
                 ),
                 (Column(None, None, "duration_quantiles"), ),
             ),
         ),
         SelectedExpression(
             "sessions",
             FunctionCall(
                 "_snuba_sessions",
                 "plus",
                 (
                     FunctionCall(None, "countIfMerge",
                                  (Column(None, None, "sessions"), )),
                     FunctionCall(
                         None,
                         "sumIfMerge",
                         (Column(None, None, "sessions_preaggr"), ),
                     ),
                 ),
             ),
         ),
         SelectedExpression(
             "users",
             FunctionCall("_snuba_users", "uniqIfMerge",
                          (Column(None, None, "users"), )),
         ),
     ]
     return QueryResult({}, {})
Exemplo n.º 16
0
def test_events_boolean_context() -> None:
    columns = ColumnSet(
        [("contexts", Nested([("key", String()), ("value", String())]))]
    )
    query = ClickhouseQuery(
        Table("errors", columns),
        selected_columns=[
            SelectedExpression(
                "contexts[device.charging]",
                FunctionCall(
                    "contexts[device.charging]",
                    "arrayElement",
                    (
                        Column(None, None, "contexts.value"),
                        FunctionCall(
                            None,
                            "indexOf",
                            (
                                Column(None, None, "contexts.key"),
                                Literal(None, "device.charging"),
                            ),
                        ),
                    ),
                ),
            )
        ],
    )

    expected = ClickhouseQuery(
        Table("errors", columns),
        selected_columns=[
            SelectedExpression(
                "contexts[device.charging]",
                FunctionCall(
                    "contexts[device.charging]",
                    "if",
                    (
                        binary_condition(
                            ConditionFunctions.IN,
                            FunctionCall(
                                None,
                                "arrayElement",
                                (
                                    Column(None, None, "contexts.value"),
                                    FunctionCall(
                                        None,
                                        "indexOf",
                                        (
                                            Column(None, None, "contexts.key"),
                                            Literal(None, "device.charging"),
                                        ),
                                    ),
                                ),
                            ),
                            literals_tuple(
                                None, [Literal(None, "1"), Literal(None, "True")]
                            ),
                        ),
                        Literal(None, "True"),
                        Literal(None, "False"),
                    ),
                ),
            )
        ],
    )

    settings = HTTPQuerySettings()
    EventsBooleanContextsProcessor().process_query(query, settings)

    assert query.get_selected_columns() == expected.get_selected_columns()
def find_pattern(query: Query, pattern: FunctionCall) -> bool:
    return any(
        pattern.match(f) is not None
        for selected in query.get_selected_columns() or []
        for f in selected.expression
    )
Exemplo n.º 18
0
    def execute(
        self,
        query: Query,
        request_settings: RequestSettings,
        runner: SplitQueryRunner,
    ) -> Optional[QueryResult]:
        """
        Split query in 2 steps if a large number of columns is being selected.
            - First query only selects event_id, project_id and timestamp.
            - Second query selects all fields for only those events.
            - Shrink the date range.
        """
        limit = query.get_limit()
        if (limit is None or limit == 0 or query.get_groupby()
                or query.get_aggregations()
                or not query.get_selected_columns()):
            return None

        if limit > settings.COLUMN_SPLIT_MAX_LIMIT:
            metrics.increment("column_splitter.query_above_limit")
            return None

        # Do not split if there is already a = or IN condition on an ID column
        id_column_matcher = FunctionCall(
            Or([String(ConditionFunctions.EQ),
                String(ConditionFunctions.IN)]),
            (
                Column(None, String(self.__id_column)),
                AnyExpression(),
            ),
        )

        for expr in query.get_condition_from_ast() or []:
            match = id_column_matcher.match(expr)

            if match:
                return None

        # We need to count the number of table/column name pairs
        # not the number of distinct Column objects in the query
        # so to avoid counting aliased columns multiple times.
        total_columns = {(col.table_name, col.column_name)
                         for col in query.get_all_ast_referenced_columns()}

        minimal_query = copy.deepcopy(query)
        minimal_query.set_selected_columns(
            [self.__id_column, self.__project_column, self.__timestamp_column])
        # TODO: provide the table alias name to this splitter if we ever use it
        # in joins.
        minimal_query.set_ast_selected_columns([
            SelectedExpression(self.__id_column,
                               ColumnExpr(None, None, self.__id_column)),
            SelectedExpression(self.__project_column,
                               ColumnExpr(None, None, self.__project_column)),
            SelectedExpression(
                self.__timestamp_column,
                ColumnExpr(None, None, self.__timestamp_column),
            ),
        ])

        for exp in minimal_query.get_all_expressions():
            if exp.alias in (
                    self.__id_column,
                    self.__project_column,
                    self.__timestamp_column,
            ) and not (isinstance(exp, ColumnExpr)
                       and exp.column_name == exp.alias):
                logger.warning(
                    "Potential alias shadowing due to column splitter",
                    extra={"expression": exp},
                    exc_info=True,
                )

        minimal_columns = {
            (col.table_name, col.column_name)
            for col in minimal_query.get_all_ast_referenced_columns()
        }
        if len(total_columns) <= len(minimal_columns):
            return None

        # Ensures the AST minimal query is actually runnable on its own.
        if not minimal_query.validate_aliases():
            return None

        legacy_references = set(minimal_query.get_all_referenced_columns())
        ast_column_names = {
            c.column_name
            for c in minimal_query.get_all_ast_referenced_columns()
        }
        # Ensures the legacy minimal query (which does not expand alias references)
        # does not contain alias references we removed when creating minimal_query.
        if legacy_references - ast_column_names:
            metrics.increment("columns.skip_invalid_legacy_query")
            return None

        result = runner(minimal_query, request_settings)
        del minimal_query

        if not result.result["data"]:
            return None

        # Making a copy just in case runner returned None (which would drive the execution
        # strategy to ignore the result of this splitter and try the next one).
        query = copy.deepcopy(query)

        event_ids = list(
            set([event[self.__id_column] for event in result.result["data"]]))
        if len(event_ids) > settings.COLUMN_SPLIT_MAX_RESULTS:
            # We may be runing a query that is beyond clickhouse maximum query size,
            # so we cowardly abandon.
            metrics.increment(
                "column_splitter.intermediate_results_beyond_limit")
            return None

        query.add_conditions([(self.__id_column, "IN", event_ids)])
        query.add_condition_to_ast(
            in_condition(
                None,
                ColumnExpr(None, None, self.__id_column),
                [LiteralExpr(None, e_id) for e_id in event_ids],
            ))
        query.set_offset(0)
        # TODO: This is technically wrong. Event ids are unique per project, not globally.
        # So, if the minimal query only returned the same event_id from two projects, we
        # would be underestimating the limit here.
        query.set_limit(len(event_ids))

        project_ids = list(
            set([
                event[self.__project_column] for event in result.result["data"]
            ]))
        _replace_condition(
            query,
            self.__project_column,
            "IN",
            project_ids,
        )
        _replace_ast_condition(
            query,
            self.__project_column,
            "IN",
            literals_tuple(None,
                           [LiteralExpr(None, p_id) for p_id in project_ids]),
        )

        timestamps = [
            event[self.__timestamp_column] for event in result.result["data"]
        ]
        _replace_condition(
            query,
            self.__timestamp_column,
            ">=",
            util.parse_datetime(min(timestamps)).isoformat(),
        )
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            ">=",
            LiteralExpr(None, util.parse_datetime(min(timestamps))),
        )
        # We add 1 second since this gets translated to ('timestamp', '<', to_date)
        # and events are stored with a granularity of 1 second.
        _replace_condition(
            query,
            self.__timestamp_column,
            "<",
            (util.parse_datetime(max(timestamps)) +
             timedelta(seconds=1)).isoformat(),
        )
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            "<",
            LiteralExpr(
                None,
                (util.parse_datetime(max(timestamps)) + timedelta(seconds=1)),
            ),
        )

        return runner(query, request_settings)
Exemplo n.º 19
0
def test_events_promoted_boolean_context() -> None:
    columns = ColumnSet(
        [
            ("device_charging", UInt(8, Modifier(nullable=True))),
            ("contexts", Nested([("key", String()), ("value", String())])),
        ]
    )
    query = ClickhouseQuery(
        Table("events", columns),
        selected_columns=[
            SelectedExpression(
                "contexts[device.charging]",
                FunctionCall(
                    "contexts[device.charging]",
                    "arrayElement",
                    (
                        Column(None, None, "contexts.value"),
                        FunctionCall(
                            None,
                            "indexOf",
                            (
                                Column(None, None, "contexts.key"),
                                Literal(None, "device.charging"),
                            ),
                        ),
                    ),
                ),
            )
        ],
    )

    expected = ClickhouseQuery(
        Table("events", columns),
        selected_columns=[
            SelectedExpression(
                "contexts[device.charging]",
                FunctionCall(
                    "contexts[device.charging]",
                    "if",
                    (
                        binary_condition(
                            ConditionFunctions.IN,
                            FunctionCall(
                                None,
                                "toString",
                                (Column(None, None, "device_charging"),),
                            ),
                            literals_tuple(
                                None, [Literal(None, "1"), Literal(None, "True")]
                            ),
                        ),
                        Literal(None, "True"),
                        Literal(None, "False"),
                    ),
                ),
            )
        ],
    )

    settings = HTTPQuerySettings()
    MappingColumnPromoter(
        {"contexts": {"device.charging": "device_charging"}}, cast_to_string=True
    ).process_query(query, settings)
    EventsPromotedBooleanContextsProcessor().process_query(query, settings)

    assert query.get_selected_columns() == expected.get_selected_columns()
Exemplo n.º 20
0
    def execute(
        self,
        query: Query,
        query_settings: QuerySettings,
        runner: SplitQueryRunner,
    ) -> Optional[QueryResult]:
        """
        Split query in 2 steps if a large number of columns is being selected.
            - First query only selects event_id, project_id and timestamp.
            - Second query selects all fields for only those events.
            - Shrink the date range.
        """
        limit = query.get_limit()
        if (limit is None or limit == 0 or query.get_groupby()
                or not query.get_selected_columns()):
            return None

        if limit > settings.COLUMN_SPLIT_MAX_LIMIT:
            metrics.increment("column_splitter.query_above_limit")
            return None

        # Do not split if there is already a = or IN condition on an ID column
        id_column_matcher = FunctionCall(
            Or([String(ConditionFunctions.EQ),
                String(ConditionFunctions.IN)]),
            (
                Column(None, String(self.__id_column)),
                AnyExpression(),
            ),
        )

        for expr in query.get_condition() or []:
            match = id_column_matcher.match(expr)

            if match:
                return None

        # We need to count the number of table/column name pairs
        # not the number of distinct Column objects in the query
        # so to avoid counting aliased columns multiple times.
        selected_columns = {
            (col.table_name, col.column_name)
            for col in query.get_columns_referenced_in_select()
        }

        if len(selected_columns) < settings.COLUMN_SPLIT_MIN_COLS:
            metrics.increment("column_splitter.main_query_min_threshold")
            return None

        minimal_query = copy.deepcopy(query)

        # TODO: provide the table alias name to this splitter if we ever use it
        # in joins.
        minimal_query.set_ast_selected_columns([
            SelectedExpression(
                self.__id_column,
                ColumnExpr(self.__id_column, None, self.__id_column),
            ),
            SelectedExpression(
                self.__project_column,
                ColumnExpr(self.__project_column, None, self.__project_column),
            ),
            SelectedExpression(
                self.__timestamp_column,
                ColumnExpr(self.__timestamp_column, None,
                           self.__timestamp_column),
            ),
        ])

        for exp in minimal_query.get_all_expressions():
            if exp.alias in (
                    self.__id_column,
                    self.__project_column,
                    self.__timestamp_column,
            ) and not (isinstance(exp, ColumnExpr)
                       and exp.column_name == exp.alias):
                logger.warning(
                    "Potential alias shadowing due to column splitter",
                    extra={"expression": exp},
                    exc_info=True,
                )

        # Ensures the AST minimal query is actually runnable on its own.
        if not minimal_query.validate_aliases():
            return None

        # There is a Clickhouse bug where if functions in the ORDER BY clause are not in the SELECT,
        # they fail on distributed tables. For that specific case, skip the query splitter.
        for orderby in minimal_query.get_orderby():
            if isinstance(orderby.expression,
                          (FunctionCallExpr, CurriedFunctionCallExpr)):
                metrics.increment("column_splitter.orderby_has_a_function")
                return None

        result = runner(minimal_query, query_settings)
        del minimal_query

        if not result.result["data"]:
            metrics.increment("column_splitter.no_data_from_minimal_query")
            return None

        # Making a copy just in case runner returned None (which would drive the execution
        # strategy to ignore the result of this splitter and try the next one).
        query = copy.deepcopy(query)

        event_ids = list(
            set([event[self.__id_column] for event in result.result["data"]]))
        if len(event_ids) > settings.COLUMN_SPLIT_MAX_RESULTS:
            # We may be runing a query that is beyond clickhouse maximum query size,
            # so we cowardly abandon.
            metrics.increment(
                "column_splitter.intermediate_results_beyond_limit")
            return None

        query.add_condition_to_ast(
            in_condition(
                ColumnExpr(None, None, self.__id_column),
                [LiteralExpr(None, e_id) for e_id in event_ids],
            ))
        query.set_offset(0)
        query.set_limit(len(result.result["data"]))

        project_ids = list(
            set([
                event[self.__project_column] for event in result.result["data"]
            ]))
        _replace_ast_condition(
            query,
            self.__project_column,
            "IN",
            literals_tuple(None,
                           [LiteralExpr(None, p_id) for p_id in project_ids]),
        )

        timestamps = [
            event[self.__timestamp_column] for event in result.result["data"]
        ]
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            ">=",
            LiteralExpr(None, util.parse_datetime(min(timestamps))),
        )
        # We add 1 second since this gets translated to ('timestamp', '<', to_date)
        # and events are stored with a granularity of 1 second.
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            "<",
            LiteralExpr(
                None,
                (util.parse_datetime(max(timestamps)) + timedelta(seconds=1)),
            ),
        )

        return runner(query, query_settings)
Exemplo n.º 21
0
def test_replace_expression() -> None:
    """
    Create a query with the new AST and replaces a function with a different function
    replaces f1(...) with tag(f1)
    """
    column1 = Column(None, "t1", "c1")
    column2 = Column(None, "t1", "c2")
    function_1 = FunctionCall("alias", "f1", (column1, column2))
    function_2 = FunctionCall("alias", "f2", (column2,))

    condition = binary_condition(ConditionFunctions.EQ, function_1, Literal(None, "1"))

    prewhere = binary_condition(ConditionFunctions.EQ, function_1, Literal(None, "2"))

    orderby = OrderBy(OrderByDirection.ASC, function_2)

    query = Query(
        Table("my_table", ColumnSet([])),
        selected_columns=[SelectedExpression("alias", function_1)],
        array_join=None,
        condition=condition,
        groupby=[function_1],
        having=None,
        prewhere=prewhere,
        order_by=[orderby],
    )

    def replace(exp: Expression) -> Expression:
        if isinstance(exp, FunctionCall) and exp.function_name == "f1":
            return FunctionCall(exp.alias, "tag", (Literal(None, "f1"),))
        return exp

    query.transform_expressions(replace)

    expected_query = Query(
        Table("my_table", ColumnSet([])),
        selected_columns=[
            SelectedExpression(
                "alias", FunctionCall("alias", "tag", (Literal(None, "f1"),))
            )
        ],
        array_join=None,
        condition=binary_condition(
            ConditionFunctions.EQ,
            FunctionCall("alias", "tag", (Literal(None, "f1"),)),
            Literal(None, "1"),
        ),
        groupby=[FunctionCall("alias", "tag", (Literal(None, "f1"),))],
        prewhere=binary_condition(
            ConditionFunctions.EQ,
            FunctionCall("alias", "tag", (Literal(None, "f1"),)),
            Literal(None, "2"),
        ),
        having=None,
        order_by=[orderby],
    )

    assert query.get_selected_columns() == expected_query.get_selected_columns()
    assert query.get_condition() == expected_query.get_condition()
    assert query.get_groupby() == expected_query.get_groupby()
    assert query.get_having() == expected_query.get_having()
    assert query.get_orderby() == expected_query.get_orderby()

    assert list(query.get_all_expressions()) == list(
        expected_query.get_all_expressions()
    )