def test_query_overlaps_replacements_processor(
    query: ClickhouseQuery,
    query_with_timestamp: ClickhouseQuery,
    query_with_future_timestamp: ClickhouseQuery,
) -> None:
    """Check the FINAL decision for queries that do / do not overlap the
    replacement window, with and without groups to exclude."""
    processor = PostReplacementConsistencyEnforcer("project_id", ReplacerState.ERRORS)

    # Replacement time unknown: treated as overlapping, but with no groups
    # to exclude the query must not end up FINAL.
    processor._set_query_final(query_with_timestamp, True)
    processor.process_query(query_with_timestamp, HTTPQuerySettings())
    assert not query_with_timestamp.get_from_clause().final

    # Overlaps a replacement and the exclusion set exceeds the cap -> FINAL.
    state.set_config("max_group_ids_exclude", 2)
    set_project_exclude_groups(
        2,
        [100, 101, 102],
        ReplacerState.ERRORS,
        ReplacementType.EXCLUDE_GROUPS,  # Arbitrary replacement type, no impact on tests
    )
    processor._set_query_final(query_with_timestamp, False)
    processor.process_query(query_with_timestamp, HTTPQuerySettings())
    assert query_with_timestamp.get_from_clause().final

    # Query time range unknown with too many groups to exclude -> FINAL.
    processor._set_query_final(query, False)
    processor.process_query(query, HTTPQuerySettings())
    assert query.get_from_clause().final

    # Query entirely after the replacements: no overlap, never FINAL.
    processor._set_query_final(query_with_future_timestamp, True)
    processor.process_query(query_with_future_timestamp, HTTPQuerySettings())
    assert not query_with_future_timestamp.get_from_clause().final
def test_not_many_groups_to_exclude(query: ClickhouseQuery) -> None:
    """With the exclusion set under the cap, the processor adds a
    notIn(assumeNotNull(group_id), ...) condition instead of forcing FINAL."""
    state.set_config("max_group_ids_exclude", 5)
    set_project_exclude_groups(2, [100, 101, 102], ReplacerState.EVENTS)

    PostReplacementConsistencyEnforcer(
        "project_id", ReplacerState.EVENTS
    ).process_query(query, HTTPRequestSettings())

    group_id_filter = FunctionCall(
        None,
        "notIn",
        (
            FunctionCall(None, "assumeNotNull", (Column(None, None, "group_id"),)),
            FunctionCall(
                None,
                "tuple",
                (Literal(None, 100), Literal(None, 101), Literal(None, 102)),
            ),
        ),
    )
    expected = FunctionCall(
        None,
        BooleanFunctions.AND,
        (group_id_filter, build_in("project_id", [2])),
    )
    assert query.get_condition_from_ast() == expected
    assert not query.get_from_clause().final
def __init__(self, query: Query, settings: RequestSettings,) -> None:
    """
    Take a snapshot of all the clauses of *query* plus the request settings,
    so this object can later format the query independently of the original
    Query instance. SQL representations are built lazily (see the fields
    initialized to None at the bottom).
    """
    # Clickhouse query structure
    # Referencing them here directly since it makes it easier
    # to process this query independently from the Clickhouse Query
    # and there is no risk in doing so since they are immutable.
    self.__selected_columns = query.get_selected_columns_from_ast()
    self.__condition = query.get_condition_from_ast()
    self.__groupby = query.get_groupby_from_ast()
    self.__having = query.get_having_from_ast()
    self.__orderby = query.get_orderby_from_ast()
    self.__data_source = query.get_from_clause()
    self.__arrayjoin = query.get_arrayjoin_from_ast()
    self.__granularity = query.get_granularity()
    self.__limit = query.get_limit()
    self.__limitby = query.get_limitby()
    self.__offset = query.get_offset()

    if self.__having:
        # HAVING filters aggregated rows; it is meaningless without GROUP BY.
        assert self.__groupby, "found HAVING clause with no GROUP BY"

    self.__turbo = settings.get_turbo()
    self.__final = query.get_final()
    self.__sample = query.get_sample()
    self.__hastotals = query.has_totals()
    self.__prewhere = query.get_prewhere_ast()

    self.__settings = settings
    # Lazily-populated caches of the formatted SQL; filled in on first use.
    self.__sql_data_list: Optional[Sequence[Tuple[str, str]]] = None
    self.__formatted_query: Optional[str] = None
    self.__sql_data: Optional[Mapping[str, str]] = None
def test_multiple_not_too_many_excludes(
    query_with_multiple_group_ids: ClickhouseQuery,
) -> None:
    """
    Query is looking for multiple groups and there are not too many groups to
    exclude, but there are fewer groups queried for than replaced.
    """
    processor = PostReplacementConsistencyEnforcer("project_id", ReplacerState.ERRORS)
    set_project_exclude_groups(
        2,
        [100, 101, 102],
        ReplacerState.ERRORS,
        ReplacementType.EXCLUDE_GROUPS,  # Arbitrary replacement type, no impact on tests
    )
    processor._set_query_final(query_with_multiple_group_ids, True)
    state.set_config("max_group_ids_exclude", 5)

    processor.process_query(query_with_multiple_group_ids, HTTPQuerySettings())

    # Only the queried groups that were actually replaced get excluded.
    queried_groups = [101, 102]
    assert query_with_multiple_group_ids.get_condition() == build_and(
        build_not_in("group_id", queried_groups),
        build_and(build_in("project_id", [2]), build_in("group_id", queried_groups)),
    )
    assert not query_with_multiple_group_ids.get_from_clause().final
def test_not_many_groups_to_exclude(query: ClickhouseQuery) -> None:
    """Exclusion set below the cap: the query gets a notIn filter on
    assumeNotNull(group_id) and is not forced to FINAL."""
    state.set_config("max_group_ids_exclude", 5)
    set_project_exclude_groups(
        2,
        [100, 101, 102],
        ReplacerState.ERRORS,
        ReplacementType.EXCLUDE_GROUPS,  # Arbitrary replacement type, no impact on tests
    )

    PostReplacementConsistencyEnforcer(
        "project_id", ReplacerState.ERRORS
    ).process_query(query, HTTPQuerySettings())

    excluded_tuple = FunctionCall(
        None,
        "tuple",
        tuple(Literal(None, group) for group in (100, 101, 102)),
    )
    not_in_groups = FunctionCall(
        None,
        "notIn",
        (
            FunctionCall(None, "assumeNotNull", (Column(None, None, "group_id"),)),
            excluded_tuple,
        ),
    )
    assert query.get_condition() == build_and(
        not_in_groups, build_in("project_id", [2])
    )
    assert not query.get_from_clause().final
def test_without_turbo_without_projects_needing_final(query: ClickhouseQuery) -> None:
    """No replacer state and no flagged projects: only the mandatory project
    condition remains and the query is not FINAL."""
    processor = PostReplacementConsistencyEnforcer("project_id", None)
    processor.process_query(query, HTTPRequestSettings())

    assert query.get_condition_from_ast() == build_in("project_id", [2])
    assert not query.get_from_clause().final
def test_single_too_many_exclude(query_with_single_group_id: ClickhouseQuery) -> None:
    """
    Query is looking for a group that has been replaced, and there are too many
    groups to exclude.
    """
    processor = PostReplacementConsistencyEnforcer("project_id", ReplacerState.ERRORS)
    set_project_exclude_groups(
        2,
        [100, 101, 102],
        ReplacerState.ERRORS,
        ReplacementType.EXCLUDE_GROUPS,  # Arbitrary replacement type, no impact on tests
    )
    processor._set_query_final(query_with_single_group_id, True)
    state.set_config("max_group_ids_exclude", 2)

    processor.process_query(query_with_single_group_id, HTTPQuerySettings())

    # The intersection of queried and replaced groups is just [101], which
    # fits under the cap, so an exclusion filter is used instead of FINAL.
    assert query_with_single_group_id.get_condition() == build_and(
        build_not_in("group_id", [101]),
        build_and(build_in("project_id", [2]), build_in("group_id", [101])),
    )
    assert not query_with_single_group_id.get_from_clause().final
def _set_query_final(self, query: Query, final: bool) -> None:
    """
    Set the 'final' clause of a Query.
    A query set as final will force ClickHouse to perform a merge
    on the results of the query. This is very performance heavy and
    should be avoided whenever possible.
    """
    updated_source = replace(query.get_from_clause(), final=final)
    query.set_from_clause(updated_source)
def process_query(self, query: Query, query_settings: QuerySettings) -> None:
    """AND the storage's mandatory conditions (if any) into the query AST."""
    mandatory_conditions = query.get_from_clause().mandatory_conditions
    if not mandatory_conditions:
        return
    query.add_condition_to_ast(combine_and_conditions(mandatory_conditions))
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    """
    Move the highest-priority eligible top-level conditions from the WHERE
    clause into PREWHERE, up to a configured maximum number of conditions.
    """
    max_prewhere_conditions: int = (self.__max_prewhere_conditions
                                    or settings.MAX_PREWHERE_CONDITIONS)
    prewhere_keys = self.__prewhere_candidates
    # HACK: If query has final, do not move any condition on a column in the
    # omit_if_final list to prewhere.
    # There is a bug in ClickHouse affecting queries with FINAL and PREWHERE
    # with Low Cardinality and Nullable columns.
    # https://github.com/ClickHouse/ClickHouse/issues/16171
    if query.get_from_clause().final and self.__omit_if_final:
        prewhere_keys = [
            key for key in prewhere_keys if key not in self.__omit_if_final
        ]
    if not prewhere_keys:
        return
    ast_condition = query.get_condition_from_ast()
    if ast_condition is None:
        return
    # A candidate is a top-level AND member that is a call to an allowed
    # operator and references at least one prewhere-eligible column.
    prewhere_candidates = [
        (get_columns_in_expression(cond), cond)
        for cond in get_first_level_and_conditions(ast_condition)
        if isinstance(cond, FunctionCall)
        and cond.function_name in ALLOWED_OPERATORS
        and any(
            col.column_name in prewhere_keys
            for col in get_columns_in_expression(cond))
    ]
    if not prewhere_candidates:
        return
    # Use the condition that has the highest priority (based on the
    # position of its columns in the prewhere keys list)
    sorted_candidates = sorted(
        [(
            min(
                prewhere_keys.index(col.column_name)
                for col in cols
                if col.column_name in prewhere_keys),
            cond,
        ) for cols, cond in prewhere_candidates],
        key=lambda priority_and_col: priority_and_col[0],
    )
    prewhere_conditions = [cond for _, cond in sorted_candidates
                           ][:max_prewhere_conditions]
    # Whatever was not promoted stays in the WHERE clause.
    new_conditions = [
        cond for cond in get_first_level_and_conditions(ast_condition)
        if cond not in prewhere_conditions
    ]
    query.set_ast_condition(
        combine_and_conditions(new_conditions) if new_conditions else None)
    query.set_prewhere_ast_condition(
        combine_and_conditions(prewhere_conditions
                               ) if prewhere_conditions else None)
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    """
    Move the highest-priority eligible top-level conditions from the WHERE
    clause into PREWHERE. Candidate columns come from the from-clause's
    prewhere_candidates; at most max_prewhere_conditions are promoted.
    """
    max_prewhere_conditions: int = (
        self.__max_prewhere_conditions or settings.MAX_PREWHERE_CONDITIONS
    )
    prewhere_keys = query.get_from_clause().prewhere_candidates
    if not prewhere_keys:
        return
    ast_condition = query.get_condition_from_ast()
    if ast_condition is None:
        return
    # A candidate is a top-level AND member that is a call to an allowed
    # operator and references at least one prewhere-eligible column.
    prewhere_candidates = [
        (get_columns_in_expression(cond), cond)
        for cond in get_first_level_and_conditions(ast_condition)
        if isinstance(cond, FunctionCall)
        and cond.function_name in ALLOWED_OPERATORS
        and any(
            col.column_name in prewhere_keys
            for col in get_columns_in_expression(cond)
        )
    ]
    if not prewhere_candidates:
        return
    # Use the condition that has the highest priority (based on the
    # position of its columns in the prewhere keys list)
    sorted_candidates = sorted(
        [
            (
                min(
                    prewhere_keys.index(col.column_name)
                    for col in cols
                    if col.column_name in prewhere_keys
                ),
                cond,
            )
            for cols, cond in prewhere_candidates
        ],
        key=lambda priority_and_col: priority_and_col[0],
    )
    prewhere_conditions = [cond for _, cond in sorted_candidates][
        :max_prewhere_conditions
    ]
    # Whatever was not promoted stays in the WHERE clause.
    new_conditions = [
        cond
        for cond in get_first_level_and_conditions(ast_condition)
        if cond not in prewhere_conditions
    ]
    query.set_ast_condition(
        combine_and_conditions(new_conditions) if new_conditions else None
    )
    query.set_prewhere_ast_condition(
        combine_and_conditions(prewhere_conditions) if prewhere_conditions else None
    )
def test_too_many_groups_to_exclude(query: ClickhouseQuery) -> None:
    """More replaced groups than the configured cap: fall back to FINAL
    instead of building an exclusion list."""
    state.set_config("max_group_ids_exclude", 2)
    set_project_exclude_groups(2, [100, 101, 102], ReplacerState.EVENTS)

    processor = PostReplacementConsistencyEnforcer("project_id", ReplacerState.EVENTS)
    processor.process_query(query, HTTPRequestSettings())

    assert query.get_condition_from_ast() == build_in("project_id", [2])
    assert query.get_from_clause().final
def test_without_turbo_with_projects_needing_final(query: ClickhouseQuery) -> None:
    """A project flagged as needing FINAL forces the query to run FINAL."""
    set_project_needs_final(
        2,
        ReplacerState.ERRORS,
        ReplacementType.EXCLUDE_GROUPS,  # Arbitrary replacement type, no impact on tests
    )

    processor = PostReplacementConsistencyEnforcer("project_id", ReplacerState.ERRORS)
    processor.process_query(query, HTTPQuerySettings())

    assert query.get_condition() == build_in("project_id", [2])
    assert query.get_from_clause().final
def process_query(self, query: Query, query_settings: QuerySettings) -> None:
    """Attach a per-table rate limit to the query settings, reading the
    per-second and concurrency thresholds from runtime config."""
    table_name = query.get_from_clause().table_name
    per_second_key = f"table_per_second_limit_{table_name}{self.__suffix}"
    concurrent_key = f"table_concurrent_limit_{table_name}{self.__suffix}"
    per_second, concurr = get_configs(
        [(per_second_key, 5000), (concurrent_key, 1000)]
    )
    query_settings.add_rate_limit(
        RateLimitParameters(
            rate_limit_name=TABLE_RATE_LIMIT_NAME,
            bucket=table_name,
            per_second_limit=per_second,
            concurrent_limit=concurr,
        )
    )
def test_too_many_groups_to_exclude(query: ClickhouseQuery) -> None:
    """Exclusion set above the cap: the processor must mark the query FINAL
    rather than appending a notIn(group_id) filter."""
    state.set_config("max_group_ids_exclude", 2)
    set_project_exclude_groups(
        2,
        [100, 101, 102],
        ReplacerState.ERRORS,
        ReplacementType.EXCLUDE_GROUPS,  # Arbitrary replacement type, no impact on tests
    )

    processor = PostReplacementConsistencyEnforcer("project_id", ReplacerState.ERRORS)
    processor.process_query(query, HTTPQuerySettings())

    assert query.get_condition() == build_in("project_id", [2])
    assert query.get_from_clause().final
def test_query_parameters() -> None:
    """Round-trip the scalar query parameters through their getters."""
    query = Query(
        Table("my_table", ColumnSet([])),
        limitby=(100, "environment"),
        limit=100,
        offset=50,
        totals=True,
        granularity=60,
    )

    assert query.get_from_clause().table_name == "my_table"
    assert query.get_granularity() == 60
    assert query.has_totals() is True
    assert query.get_offset() == 50
    assert query.get_limit() == 100
    assert query.get_limitby() == (100, "environment")
def _format_storage_query_and_run(
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
    from_date: datetime,
    to_date: datetime,
    referrer: str,
    clickhouse_query: Query,
    request_settings: RequestSettings,
    reader: Reader[SqlQuery],
) -> QueryResult:
    """
    Formats the Storage Query and pass it to the DB specific code for execution.
    """
    # Formatted table expression, used both for stats and for tracing tags.
    source = clickhouse_query.get_from_clause().format_from()
    with sentry_sdk.start_span(description="create_query", op="db") as span:
        formatted_query = AstSqlQuery(clickhouse_query, request_settings)
        span.set_data("query", formatted_query.sql_data())
        metrics.increment("execute")
    timer.mark("prepare_query")

    # Execution metadata passed down to raw_query for reporting.
    stats = {
        "clickhouse_table": source,
        "final": clickhouse_query.get_final(),
        "referrer": referrer,
        "num_days": (to_date - from_date).days,
        "sample": clickhouse_query.get_sample(),
    }

    # The span description is the SQL itself so traces show the statement.
    with sentry_sdk.start_span(description=formatted_query.format_sql(), op="db") as span:
        span.set_tag("table", source)
        return raw_query(
            clickhouse_query,
            request_settings,
            formatted_query,
            reader,
            timer,
            query_metadata,
            stats,
            span.trace_id,
        )
def test_no_groups_too_many_excludes(query: ClickhouseQuery) -> None:
    """
    Query has no groups, and too many to exclude.
    """
    processor = PostReplacementConsistencyEnforcer("project_id", ReplacerState.ERRORS)
    set_project_exclude_groups(
        2,
        [100, 101, 102],
        ReplacerState.ERRORS,
        ReplacementType.EXCLUDE_GROUPS,  # Arbitrary replacement type, no impact on tests
    )
    processor._set_query_final(query, True)
    state.set_config("max_group_ids_exclude", 1)

    processor.process_query(query, HTTPQuerySettings())

    assert query.get_condition() == build_in("project_id", [2])
    assert query.get_from_clause().final
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    """
    Decide, per project, whether post-replacement consistency requires the
    query to run FINAL or whether excluding the replaced group ids is enough.
    Turbo queries skip consistency enforcement entirely.
    """
    if request_settings.get_turbo():
        return

    project_ids = get_project_ids_in_query_ast(query, self.__project_column)

    set_final = False
    if project_ids:
        final, exclude_group_ids = get_projects_query_flags(
            list(project_ids),
            self.__replacer_state_name,
        )
        if final:
            metrics.increment("final", tags={"cause": "final_flag"})
        if not final and exclude_group_ids:
            # If the number of groups to exclude exceeds our limit, the query
            # should just use final instead of the exclusion set.
            max_group_ids_exclude = get_config(
                "max_group_ids_exclude",
                settings.REPLACER_MAX_GROUP_IDS_TO_EXCLUDE)
            assert isinstance(max_group_ids_exclude, int)
            if len(exclude_group_ids) > max_group_ids_exclude:
                metrics.increment("final", tags={"cause": "max_groups"})
                set_final = True
            else:
                # Small enough exclusion set: filter out replaced groups.
                # assumeNotNull guards against NULL group_id values.
                query.add_condition_to_ast(
                    not_in_condition(
                        FunctionCall(None, "assumeNotNull",
                                     (Column(None, None, "group_id"),)),
                        [Literal(None, p) for p in exclude_group_ids],
                    ))
        else:
            set_final = final

    query.set_from_clause(replace(query.get_from_clause(), final=set_final))
def test_multiple_disjoint_replaced(
    query_with_multiple_group_ids: ClickhouseQuery,
) -> None:
    """
    Query is looking for multiple groups and there are replaced groups, but
    these sets of group ids are disjoint. (No queried groups have been replaced)
    """
    processor = PostReplacementConsistencyEnforcer("project_id", ReplacerState.ERRORS)
    set_project_exclude_groups(
        2,
        [110, 120, 130],
        ReplacerState.ERRORS,
        ReplacementType.EXCLUDE_GROUPS,  # Arbitrary replacement type, no impact on tests
    )
    processor._set_query_final(query_with_multiple_group_ids, True)
    state.set_config("max_group_ids_exclude", 5)

    processor.process_query(query_with_multiple_group_ids, HTTPQuerySettings())

    # No queried group was replaced: no exclusion filter and no FINAL.
    assert query_with_multiple_group_ids.get_condition() == build_and(
        build_in("project_id", [2]), build_in("group_id", [101, 102])
    )
    assert not query_with_multiple_group_ids.get_from_clause().final
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    """
    Move the highest-priority eligible top-level conditions from the WHERE
    clause into PREWHERE, up to a configured maximum number of conditions.
    """
    max_prewhere_conditions: int = (self.__max_prewhere_conditions
                                    or settings.MAX_PREWHERE_CONDITIONS)
    prewhere_keys = self.__prewhere_candidates
    # In case the query is final we cannot simply add any candidate
    # condition to the prewhere.
    # Final is applied after prewhere, so there are cases where moving
    # conditions to the prewhere could exclude from the result sets
    # rows that would be merged under the `final` condition.
    # Example, rewriting the group_id on an unmerge. If the group_id
    # is in the prewhere, final wil fail at merging the rows.
    # HACK: If query has final, do not move any condition on a column in the
    # omit_if_final list to prewhere.
    # There is a bug in ClickHouse affecting queries with FINAL and PREWHERE
    # with Low Cardinality and Nullable columns.
    # https://github.com/ClickHouse/ClickHouse/issues/16171
    if query.get_from_clause().final and self.__omit_if_final:
        prewhere_keys = [
            key for key in prewhere_keys if key not in self.__omit_if_final
        ]
    if not prewhere_keys:
        return
    ast_condition = query.get_condition()
    if ast_condition is None:
        return
    # A candidate is a top-level AND member that is a call to an allowed
    # operator and references at least one prewhere-eligible column.
    prewhere_candidates = [
        (get_columns_in_expression(cond), cond)
        for cond in get_first_level_and_conditions(ast_condition)
        if isinstance(cond, FunctionCall)
        and cond.function_name in ALLOWED_OPERATORS
        and any(
            col.column_name in prewhere_keys
            for col in get_columns_in_expression(cond))
    ]
    if not prewhere_candidates:
        return
    # Use the condition that has the highest priority (based on the
    # position of its columns in the prewhere keys list)
    sorted_candidates = sorted(
        [(
            min(
                prewhere_keys.index(col.column_name)
                for col in cols
                if col.column_name in prewhere_keys),
            cond,
        ) for cols, cond in prewhere_candidates],
        key=lambda priority_and_col: priority_and_col[0],
    )
    prewhere_conditions = [cond for _, cond in sorted_candidates
                           ][:max_prewhere_conditions]
    # Whatever was not promoted stays in the WHERE clause.
    new_conditions = [
        cond for cond in get_first_level_and_conditions(ast_condition)
        if cond not in prewhere_conditions
    ]
    query.set_ast_condition(
        combine_and_conditions(new_conditions) if new_conditions else None)
    query.set_prewhere_ast_condition(
        combine_and_conditions(prewhere_conditions
                               ) if prewhere_conditions else None)
def _get_table(query: Query) -> str:
    """Return the formatted table expression of *query*, or '' if there is
    no from clause (which should never happen at this point)."""
    source = query.get_from_clause()
    return "" if source is None else source.format_from()
def query_runner(query: Query, settings: RequestSettings, reader: Reader) -> QueryResult:
    # Stub runner: assert routing selected the expected table (note:
    # `expected_table` is captured from the enclosing test scope) and
    # return an empty result.
    assert query.get_from_clause().table_name == expected_table
    return QueryResult({}, {})
def process_query(self, query: Query, query_settings: QuerySettings) -> None:
    """Force the query to run with the FINAL clause set on its from clause."""
    current_source = query.get_from_clause()
    query.set_from_clause(replace(current_source, final=True))
def process_query(self, query: Query, query_settings: QuerySettings) -> None:
    """
    Move the highest-priority eligible top-level conditions from the WHERE
    clause into PREWHERE, skipping columns used by uniq/-If aggregations
    and (for FINAL queries) columns listed in omit_if_final.
    """
    max_prewhere_conditions: int = (self.__max_prewhere_conditions
                                    or settings.MAX_PREWHERE_CONDITIONS)
    prewhere_keys = self.__prewhere_candidates

    # We remove the candidates that appear in a uniq or -If aggregations
    # because a query like `countIf(col=x) .. PREWHERE col=x` can make
    # the Clickhouse server crash.
    uniq_cols: Set[str] = set()
    expressions = query.get_all_expressions()
    for exp in expressions:
        if isinstance(exp, FunctionCall) and (exp.function_name == "uniq"
                                              or exp.function_name.endswith("If")):
            columns = get_columns_in_expression(exp)
            for c in columns:
                uniq_cols.add(c.column_name)

    # Emit a metric for every candidate we had to drop for this reason.
    for col in uniq_cols:
        if col in prewhere_keys:
            metrics.increment(
                "uniq_col_in_prewhere_candidate",
                tags={
                    "column": col,
                    "referrer": query_settings.referrer
                },
            )

    prewhere_keys = [key for key in prewhere_keys if key not in uniq_cols]
    # In case the query is final we cannot simply add any candidate
    # condition to the prewhere.
    # Final is applied after prewhere, so there are cases where moving
    # conditions to the prewhere could exclude from the result sets
    # rows that would be merged under the `final` condition.
    # Example, rewriting the group_id on an unmerge. If the group_id
    # is in the prewhere, final wil fail at merging the rows.
    # HACK: If query has final, do not move any condition on a column in the
    # omit_if_final list to prewhere.
    # There is a bug in ClickHouse affecting queries with FINAL and PREWHERE
    # with Low Cardinality and Nullable columns.
    # https://github.com/ClickHouse/ClickHouse/issues/16171
    if query.get_from_clause().final and self.__omit_if_final:
        prewhere_keys = [
            key for key in prewhere_keys if key not in self.__omit_if_final
        ]
    if not prewhere_keys:
        return
    ast_condition = query.get_condition()
    if ast_condition is None:
        return
    # A candidate is a top-level AND member that is a call to an allowed
    # operator and references at least one prewhere-eligible column.
    prewhere_candidates = [
        (get_columns_in_expression(cond), cond)
        for cond in get_first_level_and_conditions(ast_condition)
        if isinstance(cond, FunctionCall)
        and cond.function_name in ALLOWED_OPERATORS
        and any(
            col.column_name in prewhere_keys
            for col in get_columns_in_expression(cond))
    ]
    if not prewhere_candidates:
        return
    # Use the condition that has the highest priority (based on the
    # position of its columns in the prewhere keys list)
    sorted_candidates = sorted(
        [(
            min(
                prewhere_keys.index(col.column_name)
                for col in cols
                if col.column_name in prewhere_keys),
            cond,
        ) for cols, cond in prewhere_candidates],
        key=lambda priority_and_col: priority_and_col[0],
    )
    prewhere_conditions = [cond for _, cond in sorted_candidates
                           ][:max_prewhere_conditions]
    # Whatever was not promoted stays in the WHERE clause.
    new_conditions = [
        cond for cond in get_first_level_and_conditions(ast_condition)
        if cond not in prewhere_conditions
    ]
    query.set_ast_condition(
        combine_and_conditions(new_conditions) if new_conditions else None)
    query.set_prewhere_ast_condition(
        combine_and_conditions(prewhere_conditions
                               ) if prewhere_conditions else None)