Exemplo n.º 1
0
def test_query_overlaps_replacements_processor(
    query: ClickhouseQuery,
    query_with_timestamp: ClickhouseQuery,
    query_with_future_timestamp: ClickhouseQuery,
) -> None:
    """
    Run the enforcer over the three query fixtures and check the ``final``
    flag left on each query's FROM clause.
    """
    processor = PostReplacementConsistencyEnforcer(
        "project_id", ReplacerState.ERRORS
    )

    def final_after_processing(target: ClickhouseQuery, starts_final: bool) -> bool:
        # Preset the flag, run the processor with fresh settings, and report
        # the flag the processor left behind.
        processor._set_query_final(target, starts_final)
        processor.process_query(target, HTTPQuerySettings())
        return target.get_from_clause().final

    # Replacement time unknown: defaults to "overlaps", but there are no
    # groups to exclude, so the query shouldn't be final.
    assert not final_after_processing(query_with_timestamp, True)

    # Overlaps the replacement and should be final because there are too many
    # groups to exclude (three groups against a limit of two).
    state.set_config("max_group_ids_exclude", 2)
    set_project_exclude_groups(
        2,
        [100, 101, 102],
        ReplacerState.ERRORS,
        # Arbitrary replacement type, no impact on tests
        ReplacementType.EXCLUDE_GROUPS,
    )
    assert final_after_processing(query_with_timestamp, False)

    # Query time range unknown: should be final because there are too many
    # groups to exclude.
    assert final_after_processing(query, False)

    # Doesn't overlap the replacements.
    assert not final_after_processing(query_with_future_timestamp, True)
Exemplo n.º 2
0
def test_not_many_groups_to_exclude(query: ClickhouseQuery) -> None:
    """
    With three excluded groups and a limit of five, the processor is expected
    to add a ``notIn`` condition on ``group_id`` and leave the query non-final.
    """
    state.set_config("max_group_ids_exclude", 5)
    set_project_exclude_groups(2, [100, 101, 102], ReplacerState.EVENTS)

    enforcer = PostReplacementConsistencyEnforcer(
        "project_id", ReplacerState.EVENTS
    )
    enforcer.process_query(query, HTTPRequestSettings())

    # Expected condition: notIn(assumeNotNull(group_id), tuple(100, 101, 102))
    # AND-ed with the project_id condition.
    group_id_operand = FunctionCall(
        None, "assumeNotNull", (Column(None, None, "group_id"),)
    )
    excluded_groups = FunctionCall(
        None,
        "tuple",
        tuple(Literal(None, group_id) for group_id in (100, 101, 102)),
    )
    exclusion = FunctionCall(None, "notIn", (group_id_operand, excluded_groups))
    expected_condition = FunctionCall(
        None,
        BooleanFunctions.AND,
        (exclusion, build_in("project_id", [2])),
    )

    assert query.get_condition_from_ast() == expected_condition
    assert not query.get_from_clause().final
Exemplo n.º 3
0
    def __init__(self, query: Query, settings: RequestSettings) -> None:
        """
        Capture every clause of ``query`` plus the request ``settings`` and
        reset the lazily filled formatting caches.
        """
        # Clickhouse query structure.
        # The clauses are referenced directly: that makes it easier to
        # process this query independently from the Clickhouse Query, and
        # it is safe because they are immutable.
        self.__selected_columns = query.get_selected_columns_from_ast()
        self.__condition = query.get_condition_from_ast()
        self.__groupby = query.get_groupby_from_ast()
        self.__having = query.get_having_from_ast()
        self.__orderby = query.get_orderby_from_ast()
        self.__data_source = query.get_from_clause()
        self.__arrayjoin = query.get_arrayjoin_from_ast()
        self.__granularity = query.get_granularity()
        self.__limit = query.get_limit()
        self.__limitby = query.get_limitby()
        self.__offset = query.get_offset()

        # A HAVING clause is only accepted together with a GROUP BY.
        assert (
            not self.__having or self.__groupby
        ), "found HAVING clause with no GROUP BY"

        self.__turbo = settings.get_turbo()
        self.__final = query.get_final()
        self.__sample = query.get_sample()
        self.__hastotals = query.has_totals()
        self.__prewhere = query.get_prewhere_ast()

        self.__settings = settings

        # Formatting results; all start out unset.
        self.__sql_data_list: Optional[Sequence[Tuple[str, str]]] = None
        self.__formatted_query: Optional[str] = None
        self.__sql_data: Optional[Mapping[str, str]] = None
Exemplo n.º 4
0
def test_multiple_not_too_many_excludes(
    query_with_multiple_group_ids: ClickhouseQuery,
) -> None:
    """
    The query asks for several groups and the number of groups to exclude is
    within the limit, but fewer groups are queried for than were replaced.
    """
    target = query_with_multiple_group_ids
    processor = PostReplacementConsistencyEnforcer(
        "project_id", ReplacerState.ERRORS
    )

    set_project_exclude_groups(
        2,
        [100, 101, 102],
        ReplacerState.ERRORS,
        # Arbitrary replacement type, no impact on tests
        ReplacementType.EXCLUDE_GROUPS,
    )

    processor._set_query_final(target, True)
    state.set_config("max_group_ids_exclude", 5)

    processor.process_query(target, HTTPQuerySettings())

    # A NOT IN on groups 101 and 102 is expected in front of the original
    # project / group conditions.
    assert target.get_condition() == build_and(
        build_not_in("group_id", [101, 102]),
        build_and(
            build_in("project_id", [2]),
            build_in("group_id", [101, 102]),
        ),
    )
    assert not target.get_from_clause().final
Exemplo n.º 5
0
def test_not_many_groups_to_exclude(query: ClickhouseQuery) -> None:
    """
    With three excluded groups and a limit of five, the processor is expected
    to add a ``notIn`` condition on ``group_id`` and leave the query non-final.
    """
    state.set_config("max_group_ids_exclude", 5)
    set_project_exclude_groups(
        2,
        [100, 101, 102],
        ReplacerState.ERRORS,
        # Arbitrary replacement type, no impact on tests
        ReplacementType.EXCLUDE_GROUPS,
    )

    enforcer = PostReplacementConsistencyEnforcer(
        "project_id", ReplacerState.ERRORS
    )
    enforcer.process_query(query, HTTPQuerySettings())

    # Expected exclusion: notIn(assumeNotNull(group_id), tuple(100, 101, 102))
    group_id_operand = FunctionCall(
        None, "assumeNotNull", (Column(None, None, "group_id"),)
    )
    excluded_groups = FunctionCall(
        None,
        "tuple",
        tuple(Literal(None, group_id) for group_id in (100, 101, 102)),
    )
    exclusion = FunctionCall(None, "notIn", (group_id_operand, excluded_groups))

    assert query.get_condition() == build_and(
        exclusion, build_in("project_id", [2])
    )
    assert not query.get_from_clause().final
Exemplo n.º 6
0
def test_without_turbo_without_projects_needing_final(query: ClickhouseQuery) -> None:
    """
    With no replacement flags set up by this test, processing must leave the
    project condition unchanged and the query non-final.
    """
    enforcer = PostReplacementConsistencyEnforcer("project_id", None)
    enforcer.process_query(query, HTTPRequestSettings())

    expected_condition = build_in("project_id", [2])
    assert query.get_condition_from_ast() == expected_condition
    assert not query.get_from_clause().final
Exemplo n.º 7
0
def test_single_too_many_exclude(
    query_with_single_group_id: ClickhouseQuery,
) -> None:
    """
    The query looks for a single group that has been replaced while the full
    list of groups to exclude is above the configured limit.
    """
    target = query_with_single_group_id
    processor = PostReplacementConsistencyEnforcer(
        "project_id", ReplacerState.ERRORS
    )

    set_project_exclude_groups(
        2,
        [100, 101, 102],
        ReplacerState.ERRORS,
        # Arbitrary replacement type, no impact on tests
        ReplacementType.EXCLUDE_GROUPS,
    )

    processor._set_query_final(target, True)
    state.set_config("max_group_ids_exclude", 2)

    processor.process_query(target, HTTPQuerySettings())

    # Only group 101 is expected in the NOT IN, and the query is expected to
    # end up non-final.
    original_filters = build_and(
        build_in("project_id", [2]), build_in("group_id", [101])
    )
    assert target.get_condition() == build_and(
        build_not_in("group_id", [101]),
        original_filters,
    )
    assert not target.get_from_clause().final
Exemplo n.º 8
0
 def _set_query_final(self, query: Query, final: bool) -> None:
     """
     Set the 'final' flag on the FROM clause of ``query``.

     A final query forces ClickHouse to merge the results of the query,
     which is very performance heavy, so it should be avoided whenever
     possible.
     """
     # Build a copy of the current FROM clause carrying the requested flag
     # and install it on the query.
     updated_source = replace(query.get_from_clause(), final=final)
     query.set_from_clause(updated_source)
Exemplo n.º 9
0
    def process_query(self, query: Query,
                      query_settings: QuerySettings) -> None:
        """
        AND the mandatory conditions declared on the query's FROM clause into
        the query condition. Does nothing when there are none.
        """
        required_conditions = query.get_from_clause().mandatory_conditions
        if len(required_conditions) == 0:
            return

        combined = combine_and_conditions(required_conditions)
        query.add_condition_to_ast(combined)
Exemplo n.º 10
0
    def process_query(self, query: Query,
                      request_settings: RequestSettings) -> None:
        """
        Move top-level AND conditions of the query into the PREWHERE clause.

        A condition qualifies when it is a FunctionCall whose function name
        is in ALLOWED_OPERATORS and that references at least one column from
        the prewhere candidate keys. Qualifying conditions are ranked by the
        lowest position any of their columns has in the key list, and at most
        the configured maximum number of them is moved. The query is left
        untouched when there are no keys, no condition, or no qualifying
        condition.
        """
        limit: int = (self.__max_prewhere_conditions
                      or settings.MAX_PREWHERE_CONDITIONS)
        prewhere_keys = self.__prewhere_candidates

        # HACK: on a final query, conditions on columns in the omit_if_final
        # list are never moved to prewhere.
        # There is a bug in ClickHouse affecting queries with FINAL and
        # PREWHERE with Low Cardinality and Nullable columns.
        # https://github.com/ClickHouse/ClickHouse/issues/16171
        if query.get_from_clause().final and self.__omit_if_final:
            prewhere_keys = [
                key for key in prewhere_keys if key not in self.__omit_if_final
            ]

        if not prewhere_keys:
            return

        ast_condition = query.get_condition_from_ast()
        if ast_condition is None:
            return

        # Collect (columns, condition) pairs for every qualifying condition.
        eligible = []
        for cond in get_first_level_and_conditions(ast_condition):
            if not isinstance(cond, FunctionCall):
                continue
            if cond.function_name not in ALLOWED_OPERATORS:
                continue
            if not any(col.column_name in prewhere_keys
                       for col in get_columns_in_expression(cond)):
                continue
            eligible.append((get_columns_in_expression(cond), cond))
        if not eligible:
            return

        def key_position(candidate):
            # Highest priority = lowest index among the candidate's columns
            # that appear in the prewhere keys list.
            columns, _ = candidate
            return min(
                prewhere_keys.index(col.column_name)
                for col in columns
                if col.column_name in prewhere_keys)

        ranked = sorted(eligible, key=key_position)
        prewhere_conditions = [cond for _, cond in ranked[:limit]]

        # Everything that was not promoted stays in the regular condition.
        remaining = []
        for cond in get_first_level_and_conditions(ast_condition):
            if cond not in prewhere_conditions:
                remaining.append(cond)

        if remaining:
            query.set_ast_condition(combine_and_conditions(remaining))
        else:
            query.set_ast_condition(None)

        if prewhere_conditions:
            query.set_prewhere_ast_condition(
                combine_and_conditions(prewhere_conditions))
        else:
            query.set_prewhere_ast_condition(None)
Exemplo n.º 11
0
    def process_query(self, query: Query, request_settings: RequestSettings) -> None:
        """
        Move top-level AND conditions of the query into the PREWHERE clause.

        The candidate keys come from the query's FROM clause. A condition
        qualifies when it is a FunctionCall whose function name is in
        ALLOWED_OPERATORS and that references at least one candidate key.
        Qualifying conditions are ranked by the lowest position any of their
        columns has in the key list, and at most the configured maximum
        number of them is moved.
        """
        limit: int = (
            self.__max_prewhere_conditions or settings.MAX_PREWHERE_CONDITIONS
        )
        prewhere_keys = query.get_from_clause().prewhere_candidates
        if not prewhere_keys:
            return

        ast_condition = query.get_condition_from_ast()
        if ast_condition is None:
            return

        # Collect (columns, condition) pairs for every qualifying condition.
        eligible = []
        for cond in get_first_level_and_conditions(ast_condition):
            if not isinstance(cond, FunctionCall):
                continue
            if cond.function_name not in ALLOWED_OPERATORS:
                continue
            if not any(
                col.column_name in prewhere_keys
                for col in get_columns_in_expression(cond)
            ):
                continue
            eligible.append((get_columns_in_expression(cond), cond))
        if not eligible:
            return

        def key_position(candidate):
            # Highest priority = lowest index among the candidate's columns
            # that appear in the prewhere keys list.
            columns, _ = candidate
            return min(
                prewhere_keys.index(col.column_name)
                for col in columns
                if col.column_name in prewhere_keys
            )

        ranked = sorted(eligible, key=key_position)
        prewhere_conditions = [cond for _, cond in ranked[:limit]]

        # Everything that was not promoted stays in the regular condition.
        remaining = []
        for cond in get_first_level_and_conditions(ast_condition):
            if cond not in prewhere_conditions:
                remaining.append(cond)

        if remaining:
            query.set_ast_condition(combine_and_conditions(remaining))
        else:
            query.set_ast_condition(None)

        if prewhere_conditions:
            query.set_prewhere_ast_condition(
                combine_and_conditions(prewhere_conditions)
            )
        else:
            query.set_prewhere_ast_condition(None)
Exemplo n.º 12
0
def test_too_many_groups_to_exclude(query: ClickhouseQuery) -> None:
    """
    With three excluded groups and a limit of two, the processor is expected
    to leave the condition untouched and mark the query as final.
    """
    state.set_config("max_group_ids_exclude", 2)
    set_project_exclude_groups(2, [100, 101, 102], ReplacerState.EVENTS)

    enforcer = PostReplacementConsistencyEnforcer(
        "project_id", ReplacerState.EVENTS
    )
    enforcer.process_query(query, HTTPRequestSettings())

    expected_condition = build_in("project_id", [2])
    assert query.get_condition_from_ast() == expected_condition
    assert query.get_from_clause().final
Exemplo n.º 13
0
def test_without_turbo_with_projects_needing_final(
        query: ClickhouseQuery) -> None:
    """
    After ``set_project_needs_final`` is called for 2, processing is expected
    to keep the project condition and mark the query as final.
    """
    set_project_needs_final(
        2,
        ReplacerState.ERRORS,
        # Arbitrary replacement type, no impact on tests
        ReplacementType.EXCLUDE_GROUPS,
    )

    enforcer = PostReplacementConsistencyEnforcer(
        "project_id", ReplacerState.ERRORS
    )
    enforcer.process_query(query, HTTPQuerySettings())

    expected_condition = build_in("project_id", [2])
    assert query.get_condition() == expected_condition
    assert query.get_from_clause().final
Exemplo n.º 14
0
    def process_query(self, query: Query,
                      query_settings: QuerySettings) -> None:
        """
        Attach a per-table rate limit to ``query_settings``.

        The per-second and concurrent limits are read from configuration keys
        built from the table name and this processor's suffix, with defaults
        of 5000 and 1000 respectively.
        """
        table_name = query.get_from_clause().table_name

        per_second_key = f"table_per_second_limit_{table_name}{self.__suffix}"
        concurrent_key = f"table_concurrent_limit_{table_name}{self.__suffix}"
        per_second, concurrent = get_configs([
            (per_second_key, 5000),
            (concurrent_key, 1000),
        ])

        query_settings.add_rate_limit(
            RateLimitParameters(
                rate_limit_name=TABLE_RATE_LIMIT_NAME,
                bucket=table_name,
                per_second_limit=per_second,
                concurrent_limit=concurrent,
            )
        )
Exemplo n.º 15
0
def test_too_many_groups_to_exclude(query: ClickhouseQuery) -> None:
    """
    With three excluded groups and a limit of two, the processor is expected
    to leave the condition untouched and mark the query as final.
    """
    state.set_config("max_group_ids_exclude", 2)
    set_project_exclude_groups(
        2,
        [100, 101, 102],
        ReplacerState.ERRORS,
        # Arbitrary replacement type, no impact on tests
        ReplacementType.EXCLUDE_GROUPS,
    )

    enforcer = PostReplacementConsistencyEnforcer(
        "project_id", ReplacerState.ERRORS
    )
    enforcer.process_query(query, HTTPQuerySettings())

    expected_condition = build_in("project_id", [2])
    assert query.get_condition() == expected_condition
    assert query.get_from_clause().final
Exemplo n.º 16
0
def test_query_parameters() -> None:
    """
    Build a Query with explicit paging/aggregation parameters and check that
    each getter returns the value passed to the constructor.
    """
    source_table = Table("my_table", ColumnSet([]))
    limit_by = (100, "environment")

    query = Query(
        source_table,
        limitby=limit_by,
        limit=100,
        offset=50,
        totals=True,
        granularity=60,
    )

    assert query.get_limitby() == (100, "environment")
    assert query.get_limit() == 100
    assert query.get_offset() == 50
    assert query.has_totals() is True
    assert query.get_granularity() == 60

    # The FROM clause must expose the table name given above.
    assert query.get_from_clause().table_name == "my_table"
Exemplo n.º 17
0
def _format_storage_query_and_run(
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
    from_date: datetime,
    to_date: datetime,
    referrer: str,
    clickhouse_query: Query,
    request_settings: RequestSettings,
    reader: Reader[SqlQuery],
) -> QueryResult:
    """
    Format the storage query and hand it to the DB-specific code for
    execution.

    Formatting happens inside a "create_query" span; the execution happens
    inside a second span described by the formatted SQL and tagged with the
    table. The result of ``raw_query`` is returned as is.
    """
    table_expression = clickhouse_query.get_from_clause().format_from()

    with sentry_sdk.start_span(description="create_query", op="db") as create_span:
        formatted_query = AstSqlQuery(clickhouse_query, request_settings)
        create_span.set_data("query", formatted_query.sql_data())
        metrics.increment("execute")

    timer.mark("prepare_query")

    # Query statistics handed to raw_query together with the query itself.
    stats = {
        "clickhouse_table": table_expression,
        "final": clickhouse_query.get_final(),
        "referrer": referrer,
        "num_days": (to_date - from_date).days,
        "sample": clickhouse_query.get_sample(),
    }

    sql_text = formatted_query.format_sql()
    with sentry_sdk.start_span(description=sql_text, op="db") as run_span:
        run_span.set_tag("table", table_expression)

        return raw_query(
            clickhouse_query,
            request_settings,
            formatted_query,
            reader,
            timer,
            query_metadata,
            stats,
            run_span.trace_id,
        )
Exemplo n.º 18
0
def test_no_groups_too_many_excludes(query: ClickhouseQuery) -> None:
    """
    The query filters on no groups and there are too many groups to exclude,
    so the query is expected to stay final with its condition unchanged.
    """
    processor = PostReplacementConsistencyEnforcer(
        "project_id", ReplacerState.ERRORS
    )

    set_project_exclude_groups(
        2,
        [100, 101, 102],
        ReplacerState.ERRORS,
        # Arbitrary replacement type, no impact on tests
        ReplacementType.EXCLUDE_GROUPS,
    )

    processor._set_query_final(query, True)
    state.set_config("max_group_ids_exclude", 1)

    processor.process_query(query, HTTPQuerySettings())

    expected_condition = build_in("project_id", [2])
    assert query.get_condition() == expected_condition
    assert query.get_from_clause().final
Exemplo n.º 19
0
    def process_query(self, query: Query,
                      request_settings: RequestSettings) -> None:
        """
        Decide whether the query must run as final and, when it does not,
        optionally exclude replaced groups through a NOT IN condition.

        Turbo requests are left untouched. Otherwise the flags for the
        projects found in the query are looked up: a set final flag makes the
        query final; a non-empty exclusion list is either added as a
        ``group_id`` NOT IN condition or, when it is longer than the
        ``max_group_ids_exclude`` setting, replaced by making the query final.
        The FROM clause is always rewritten with the resulting flag.
        """
        if request_settings.get_turbo():
            return

        project_ids = get_project_ids_in_query_ast(query,
                                                   self.__project_column)

        use_final = False
        if project_ids:
            final, exclude_group_ids = get_projects_query_flags(
                list(project_ids),
                self.__replacer_state_name,
            )
            if final:
                metrics.increment("final", tags={"cause": "final_flag"})

            if final or not exclude_group_ids:
                # Nothing to exclude (or final already requested): just
                # follow the flag.
                use_final = final
            else:
                # If the number of groups to exclude exceeds our limit, the
                # query should just use final instead of the exclusion set.
                exclusion_limit = get_config(
                    "max_group_ids_exclude",
                    settings.REPLACER_MAX_GROUP_IDS_TO_EXCLUDE)
                assert isinstance(exclusion_limit, int)
                if len(exclude_group_ids) > exclusion_limit:
                    metrics.increment("final", tags={"cause": "max_groups"})
                    use_final = True
                else:
                    group_id_operand = FunctionCall(
                        None, "assumeNotNull",
                        (Column(None, None, "group_id"), ))
                    excluded = [Literal(None, p) for p in exclude_group_ids]
                    query.add_condition_to_ast(
                        not_in_condition(group_id_operand, excluded))

        updated_source = replace(query.get_from_clause(), final=use_final)
        query.set_from_clause(updated_source)
Exemplo n.º 20
0
def test_multiple_disjoint_replaced(
    query_with_multiple_group_ids: ClickhouseQuery,
) -> None:
    """
    The query asks for several groups and some groups have been replaced, but
    the two sets of group ids are disjoint: no queried group was replaced.
    """
    target = query_with_multiple_group_ids
    processor = PostReplacementConsistencyEnforcer(
        "project_id", ReplacerState.ERRORS
    )

    set_project_exclude_groups(
        2,
        [110, 120, 130],
        ReplacerState.ERRORS,
        # Arbitrary replacement type, no impact on tests
        ReplacementType.EXCLUDE_GROUPS,
    )

    processor._set_query_final(target, True)
    state.set_config("max_group_ids_exclude", 5)

    processor.process_query(target, HTTPQuerySettings())

    # The condition is expected to be unchanged and the query non-final.
    assert target.get_condition() == build_and(
        build_in("project_id", [2]),
        build_in("group_id", [101, 102]),
    )
    assert not target.get_from_clause().final
Exemplo n.º 21
0
    def process_query(self, query: Query,
                      request_settings: RequestSettings) -> None:
        """
        Move top-level AND conditions of the query into the PREWHERE clause.

        A condition qualifies when it is a FunctionCall whose function name
        is in ALLOWED_OPERATORS and that references at least one column from
        the prewhere candidate keys. Qualifying conditions are ranked by the
        lowest position any of their columns has in the key list, and at most
        the configured maximum number of them is moved. The query is left
        untouched when there are no keys, no condition, or no qualifying
        condition.
        """
        limit: int = (self.__max_prewhere_conditions
                      or settings.MAX_PREWHERE_CONDITIONS)
        prewhere_keys = self.__prewhere_candidates

        # When the query is final we cannot simply add any candidate
        # condition to the prewhere.
        # Final is applied after prewhere, so moving a condition to the
        # prewhere could exclude from the result set rows that would be
        # merged under the `final` condition.
        # Example: rewriting the group_id on an unmerge. If the group_id
        # is in the prewhere, final will fail at merging the rows.
        # HACK: If the query has final, do not move any condition on a column
        # in the omit_if_final list to prewhere.
        # There is a bug in ClickHouse affecting queries with FINAL and
        # PREWHERE with Low Cardinality and Nullable columns.
        # https://github.com/ClickHouse/ClickHouse/issues/16171
        if query.get_from_clause().final and self.__omit_if_final:
            prewhere_keys = [
                key for key in prewhere_keys if key not in self.__omit_if_final
            ]

        if not prewhere_keys:
            return

        ast_condition = query.get_condition()
        if ast_condition is None:
            return

        # Collect (columns, condition) pairs for every qualifying condition.
        eligible = []
        for cond in get_first_level_and_conditions(ast_condition):
            if not isinstance(cond, FunctionCall):
                continue
            if cond.function_name not in ALLOWED_OPERATORS:
                continue
            if not any(col.column_name in prewhere_keys
                       for col in get_columns_in_expression(cond)):
                continue
            eligible.append((get_columns_in_expression(cond), cond))
        if not eligible:
            return

        def key_position(candidate):
            # Highest priority = lowest index among the candidate's columns
            # that appear in the prewhere keys list.
            columns, _ = candidate
            return min(
                prewhere_keys.index(col.column_name)
                for col in columns
                if col.column_name in prewhere_keys)

        ranked = sorted(eligible, key=key_position)
        prewhere_conditions = [cond for _, cond in ranked[:limit]]

        # Everything that was not promoted stays in the regular condition.
        remaining = []
        for cond in get_first_level_and_conditions(ast_condition):
            if cond not in prewhere_conditions:
                remaining.append(cond)

        if remaining:
            query.set_ast_condition(combine_and_conditions(remaining))
        else:
            query.set_ast_condition(None)

        if prewhere_conditions:
            query.set_prewhere_ast_condition(
                combine_and_conditions(prewhere_conditions))
        else:
            query.set_prewhere_ast_condition(None)
Exemplo n.º 22
0
def _get_table(query: Query) -> str:
    """
    Return the formatted FROM clause of ``query``, or an empty string when
    the query has no FROM clause.
    """
    from_clause = query.get_from_clause()
    # A missing FROM clause should never happen at this point.
    return "" if from_clause is None else from_clause.format_from()
Exemplo n.º 23
0
 def query_runner(query: Query, settings: RequestSettings,
                  reader: Reader) -> QueryResult:
     """
     Stub runner: check the table the query reads from against
     ``expected_table`` and return an empty result.
     """
     assert query.get_from_clause().table_name == expected_table
     empty_result = QueryResult({}, {})
     return empty_result
Exemplo n.º 24
0
 def process_query(self, query: Query, query_settings: QuerySettings) -> None:
     """Unconditionally mark the query's FROM clause as final."""
     final_source = replace(query.get_from_clause(), final=True)
     query.set_from_clause(final_source)
Exemplo n.º 25
0
    def process_query(self, query: Query,
                      query_settings: QuerySettings) -> None:
        """
        Move top-level AND conditions of the query into the PREWHERE clause.

        Candidate keys that appear inside a ``uniq`` call or a function whose
        name ends in ``If`` are dropped first (and counted in a metric), then
        keys in the omit_if_final list are dropped for final queries. A
        condition qualifies when it is a FunctionCall whose function name is
        in ALLOWED_OPERATORS and that references at least one remaining key.
        Qualifying conditions are ranked by the lowest position any of their
        columns has in the key list, and at most the configured maximum
        number of them is moved.
        """
        limit: int = (self.__max_prewhere_conditions
                      or settings.MAX_PREWHERE_CONDITIONS)
        prewhere_keys = self.__prewhere_candidates

        # Candidates that appear in a uniq or -If aggregation are removed
        # because a query like `countIf(col=x) .. PREWHERE col=x` can make
        # the Clickhouse server crash.
        uniq_cols: Set[str] = set()
        for exp in query.get_all_expressions():
            if not isinstance(exp, FunctionCall):
                continue
            if exp.function_name == "uniq" or exp.function_name.endswith("If"):
                uniq_cols.update(
                    c.column_name for c in get_columns_in_expression(exp))

        # Record every candidate key that is about to be dropped.
        for col in uniq_cols:
            if col not in prewhere_keys:
                continue
            metrics.increment(
                "uniq_col_in_prewhere_candidate",
                tags={
                    "column": col,
                    "referrer": query_settings.referrer,
                },
            )

        prewhere_keys = [key for key in prewhere_keys if key not in uniq_cols]

        # When the query is final we cannot simply add any candidate
        # condition to the prewhere.
        # Final is applied after prewhere, so moving a condition to the
        # prewhere could exclude from the result set rows that would be
        # merged under the `final` condition.
        # Example: rewriting the group_id on an unmerge. If the group_id
        # is in the prewhere, final will fail at merging the rows.
        # HACK: If the query has final, do not move any condition on a column
        # in the omit_if_final list to prewhere.
        # There is a bug in ClickHouse affecting queries with FINAL and
        # PREWHERE with Low Cardinality and Nullable columns.
        # https://github.com/ClickHouse/ClickHouse/issues/16171
        if query.get_from_clause().final and self.__omit_if_final:
            prewhere_keys = [
                key for key in prewhere_keys if key not in self.__omit_if_final
            ]

        if not prewhere_keys:
            return

        ast_condition = query.get_condition()
        if ast_condition is None:
            return

        # Collect (columns, condition) pairs for every qualifying condition.
        eligible = []
        for cond in get_first_level_and_conditions(ast_condition):
            if not isinstance(cond, FunctionCall):
                continue
            if cond.function_name not in ALLOWED_OPERATORS:
                continue
            if not any(col.column_name in prewhere_keys
                       for col in get_columns_in_expression(cond)):
                continue
            eligible.append((get_columns_in_expression(cond), cond))
        if not eligible:
            return

        def key_position(candidate):
            # Highest priority = lowest index among the candidate's columns
            # that appear in the prewhere keys list.
            columns, _ = candidate
            return min(
                prewhere_keys.index(col.column_name)
                for col in columns
                if col.column_name in prewhere_keys)

        ranked = sorted(eligible, key=key_position)
        prewhere_conditions = [cond for _, cond in ranked[:limit]]

        # Everything that was not promoted stays in the regular condition.
        remaining = []
        for cond in get_first_level_and_conditions(ast_condition):
            if cond not in prewhere_conditions:
                remaining.append(cond)

        if remaining:
            query.set_ast_condition(combine_and_conditions(remaining))
        else:
            query.set_ast_condition(None)

        if prewhere_conditions:
            query.set_prewhere_ast_condition(
                combine_and_conditions(prewhere_conditions))
        else:
            query.set_prewhere_ast_condition(None)