def filter_key_values(key_values: Expression, keys: Sequence[LiteralExpr]) -> Expression:
    """
    Filter an array of key value pairs based on a sequence of keys
    (tag keys in this case).
    """
    return FunctionCallExpr(
        None,
        "arrayFilter",
        (
            Lambda(
                None,
                ("pair",),
                in_condition(
                    # A pair here is a tuple with two elements (key
                    # and value) and the index of the first element in
                    # Clickhouse is 1 instead of 0.
                    tupleElement(
                        None,
                        Argument(None, "pair"),
                        LiteralExpr(None, 1),
                    ),
                    keys,
                ),
            ),
            key_values,
        ),
    )
def filter_column(column: Expression, keys: Sequence[LiteralExpr]) -> Expression:
    return FunctionCallExpr(
        None,
        "arrayFilter",
        (Lambda(None, ("x",), in_condition(Argument(None, "x"), keys)), column),
    )
def filter_keys(column: Expression, keys: Sequence[LiteralExpr]) -> Expression:
    """
    Filter a Column array based on a sequence of keys.
    """
    return FunctionCallExpr(
        None,
        "arrayFilter",
        (
            Lambda(None, ("tag",), in_condition(None, Argument(None, "tag"), keys)),
            column,
        ),
    )
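
# A hedged, plain-Python sketch of the ClickHouse semantics the helpers above
# target (the data below is invented for the demo). arrayFilter(f, arr) keeps
# the elements of arr for which f returns true; filter_key_values additionally
# inspects the first element of each (key, value) pair, and ClickHouse tuple
# indices start at 1, so tupleElement(pair, 1) corresponds to pair[0] here.

def _array_filter_demo(column: list, keys: set) -> list:
    # Analogue of filter_keys / filter_column: arrayFilter(x -> x IN keys, column)
    return [x for x in column if x in keys]


def _array_filter_pairs_demo(key_values: list, keys: set) -> list:
    # Analogue of filter_key_values:
    # arrayFilter(pair -> tupleElement(pair, 1) IN keys, key_values)
    return [pair for pair in key_values if pair[0] in keys]


assert _array_filter_demo(["db", "http", "cache"], {"db", "http"}) == ["db", "http"]
assert _array_filter_pairs_demo(
    [("release", "1.0"), ("os", "linux")], {"release"}
) == [("release", "1.0")]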
def process_query(
    self,
    query: Query,
    extension_data: ExtensionData,
    request_settings: RequestSettings,
) -> None:
    project_ids = util.to_list(extension_data["project"])

    if project_ids:
        query.add_condition_to_ast(
            in_condition(
                Column(None, None, self.__project_column),
                [Literal(None, p) for p in project_ids],
            )
        )

    request_settings.add_rate_limit(self._get_rate_limit_params(project_ids))
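
# Hedged sketch of the normalization assumed above: util.to_list accepts either
# a single project id or a list of ids from the extension payload. The mimic
# below is illustrative only; the real helper lives in snuba.util. Either way,
# the processor then adds `project_id IN (...)` to the AST and registers a rate
# limit keyed on those project ids.

def _to_list_demo(value):
    return value if isinstance(value, list) else [value]


assert _to_list_demo(1) == [1]
assert _to_list_demo([1, 2]) == [1, 2]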
def _and(ex1: Expression, ex2: Expression) -> FunctionCall:
    return binary_condition(BooleanFunctions.AND, ex1, ex2)


# `errors > 0`
has_errors = binary_condition(
    ConditionFunctions.GT, Column(None, None, "errors"), Literal(None, 0)
)
# `distinct_id != NIL`
did_not_nil = neq(distinct_id, lit_nil)
# `duration != MAX AND status == 1`
duration_condition = _and(
    neq(duration, Literal(None, MAX_UINT32)), eq(status, Literal(None, 1))
)
# `status IN (2, 3, 4)`
terminal_status = in_condition(status, [Literal(None, s) for s in [2, 3, 4]])

# These are essentially the same statements as in the materialized view query.
sessions_raw_translators = TranslationMappers(
    columns=[
        ColumnToCurriedFunction(
            None,
            "duration_quantiles",
            FunctionCall(None, "quantilesIf", quantiles),
            (duration, duration_condition),
        ),
        ColumnToFunction(
            None, "duration_avg", "avgIf", (duration, duration_condition)
        ),
        ColumnToFunction(
            None,
            "sessions",
            "sumIf",
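
# Hedged, plain-Python analogue of the ClickHouse *If aggregate combinators used
# above: sumIf(value, cond) sums only the rows where cond holds, and avgIf /
# quantilesIf restrict their inputs the same way. The example data is invented.

def _sum_if_demo(values, conds):
    return sum(v for v, c in zip(values, conds) if c)


durations = [10, 20, 30]
completed = [True, False, True]  # duration != MAX AND status == 1, per row
assert _sum_if_demo(durations, completed) == 40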
from snuba.query.processors.conditions_enforcer import (
    MandatoryConditionEnforcer,
    OrgIdEnforcer,
    ProjectIdEnforcer,
)
from snuba.request.request_settings import HTTPRequestSettings
from snuba.state import set_config

test_data = [
    pytest.param(
        Query(
            Table("errors", ColumnSet([])),
            selected_columns=[],
            condition=binary_condition(
                BooleanFunctions.AND,
                in_condition(
                    Column(None, None, "project_id"), [Literal(None, 123)]
                ),
                binary_condition(
                    "equals", Column(None, None, "org_id"), Literal(None, 1)
                ),
            ),
        ),
        True,
        id="Valid query. Both mandatory columns are there",
    ),
    pytest.param(
        Query(
            Table("errors", ColumnSet([])),
            selected_columns=[],
            condition=binary_condition(
                BooleanFunctions.AND,
                binary_condition(
        build_query(),
        set(),
        id="no op filter",
    ),
    pytest.param(
        build_query(
            condition=binary_condition(
                ConditionFunctions.EQ,
                spans_op_col,
                Literal(None, "db"),
            ),
        ),
        {"db"},
        id="simple equality",
    ),
    pytest.param(
        build_query(
            condition=in_condition(
                spans_op_col,
                [Literal(None, "db"), Literal(None, "http")],
            ),
        ),
        {"db", "http"},
        id="op IN condition",
    ),
    pytest.param(
        build_query(
            condition=in_condition(
                spans_op_col,
                [Literal(None, "db")],
            ),
            having=in_condition(
                spans_op_col,
                [Literal(None, "http")],
            ),
        ),
        binary_condition(
            BooleanFunctions.AND,
            binary_condition(
                BooleanFunctions.AND,
                binary_condition(
                    ConditionFunctions.GTE,
                    Column("_snuba_timestamp", None, "timestamp"),
                    Literal(None, datetime(2011, 7, 1, 19, 54, 15)),
                ),
                binary_condition(
                    ConditionFunctions.LT,
                    Column("_snuba_timestamp", None, "timestamp"),
                    Literal(None, datetime(2018, 7, 6, 19, 54, 15)),
                ),
            ),
            in_condition(
                Column("_snuba_project_id", None, "project_id"),
                [Literal(None, 1)],
            ),
        ),
        id="Legacy query",
    ),
    pytest.param(
        {
            "query": (
                "MATCH (events) "
                "SELECT count() AS count BY time "
                "WHERE "
                "project_id IN tuple(1) AND "
                "timestamp >= toDateTime('2011-07-01T19:54:15') AND "
                "timestamp < toDateTime('2018-07-06T19:54:15') "
                "LIMIT 1000 "
                "GRANULARITY 60"
            ),
        },
        Language.SNQL,
id="simple equality", ), pytest.param( build_query( selected_columns=[ FunctionCall( "tags_key", "arrayJoin", (Column(None, None, "tags.key"), ), ), ], condition=in_condition( FunctionCall( "tags_key", "arrayJoin", (Column(None, None, "tags.key"), ), ), [Literal(None, "tag1"), Literal(None, "tag2")], ), ), {"tag1", "tag2"}, id="tag IN condition", ), pytest.param( build_query( selected_columns=[ FunctionCall( "tags_key", "arrayJoin", (Column(None, None, "tags.key"), ),
def test_tags_expander() -> None:
    query_body = {
        "selected_columns": [
            ["f1", ["tags_key", "column2"], "f1_alias"],
            ["f2", [], "f2_alias"],
        ],
        "aggregations": [
            ["count", "platform", "platforms"],
            ["testF", ["platform", "tags_value"], "top_platforms"],
        ],
        "conditions": [["tags_key", "=", "tags_key"]],
        "having": [["tags_value", "IN", ["tag"]]],
    }

    events = get_dataset("events")
    query = parse_query(query_body, events)

    processor = TagsExpanderProcessor()
    request_settings = HTTPRequestSettings()
    processor.process_query(query, request_settings)

    assert query.get_selected_columns_from_ast() == [
        SelectedExpression(
            "platforms",
            FunctionCall(
                "platforms", "count", (Column("platform", None, "platform"),)
            ),
        ),
        SelectedExpression(
            "top_platforms",
            FunctionCall(
                "top_platforms",
                "testF",
                (
                    Column("platform", None, "platform"),
                    FunctionCall(
                        "tags_value", "arrayJoin", (Column(None, None, "tags.value"),)
                    ),
                ),
            ),
        ),
        SelectedExpression(
            "f1_alias",
            FunctionCall(
                "f1_alias",
                "f1",
                (
                    FunctionCall(
                        "tags_key", "arrayJoin", (Column(None, None, "tags.key"),)
                    ),
                    Column("column2", None, "column2"),
                ),
            ),
        ),
        SelectedExpression("f2_alias", FunctionCall("f2_alias", "f2", tuple())),
    ]

    assert query.get_condition_from_ast() == binary_condition(
        None,
        OPERATOR_TO_FUNCTION["="],
        FunctionCall("tags_key", "arrayJoin", (Column(None, None, "tags.key"),)),
        Literal(None, "tags_key"),
    )

    assert query.get_having_from_ast() == in_condition(
        None,
        FunctionCall("tags_value", "arrayJoin", (Column(None, None, "tags.value"),)),
        [Literal(None, "tag")],
    )
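
# Hedged summary of the rewrite asserted above: TagsExpanderProcessor replaces
# the virtual columns `tags_key` / `tags_value` with arrayJoin calls over the
# nested columns tags.key / tags.value, keeping the original name as the alias.
# arrayJoin unnests an array into one row per element; a plain-Python analogue
# (with invented rows) looks like this:

def _array_join_demo(rows):
    return [(row["event"], key) for row in rows for key in row["tags.key"]]


assert _array_join_demo([{"event": "e1", "tags.key": ["os", "release"]}]) == [
    ("e1", "os"),
    ("e1", "release"),
]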
def execute(
    self,
    query: Query,
    request_settings: RequestSettings,
    runner: SplitQueryRunner,
) -> Optional[QueryResult]:
    """
    Split the query in two steps if a large number of columns is being selected.
    - The first query selects only event_id, project_id and timestamp.
    - The second query selects all fields for only those events.
    - Shrink the date range.
    """
    limit = query.get_limit()
    if (
        limit is None
        or limit == 0
        or query.get_groupby()
        or query.get_aggregations()
        or not query.get_selected_columns()
    ):
        return None

    if limit > settings.COLUMN_SPLIT_MAX_LIMIT:
        metrics.increment("column_splitter.query_above_limit")
        return None

    # Do not split if there is already a = or IN condition on an ID column
    id_column_matcher = FunctionCall(
        Or([String(ConditionFunctions.EQ), String(ConditionFunctions.IN)]),
        (
            Column(None, String(self.__id_column)),
            AnyExpression(),
        ),
    )

    for expr in query.get_condition_from_ast() or []:
        match = id_column_matcher.match(expr)
        if match:
            return None

    # We need to count the number of table/column name pairs,
    # not the number of distinct Column objects in the query,
    # so as to avoid counting aliased columns multiple times.
    total_columns = {
        (col.table_name, col.column_name)
        for col in query.get_all_ast_referenced_columns()
    }

    minimal_query = copy.deepcopy(query)
    minimal_query.set_selected_columns(
        [self.__id_column, self.__project_column, self.__timestamp_column]
    )
    # TODO: provide the table alias name to this splitter if we ever use it
    # in joins.
    minimal_query.set_ast_selected_columns(
        [
            SelectedExpression(
                self.__id_column, ColumnExpr(None, None, self.__id_column)
            ),
            SelectedExpression(
                self.__project_column, ColumnExpr(None, None, self.__project_column)
            ),
            SelectedExpression(
                self.__timestamp_column,
                ColumnExpr(None, None, self.__timestamp_column),
            ),
        ]
    )

    for exp in minimal_query.get_all_expressions():
        if exp.alias in (
            self.__id_column,
            self.__project_column,
            self.__timestamp_column,
        ) and not (isinstance(exp, ColumnExpr) and exp.column_name == exp.alias):
            logger.warning(
                "Potential alias shadowing due to column splitter",
                extra={"expression": exp},
                exc_info=True,
            )

    minimal_columns = {
        (col.table_name, col.column_name)
        for col in minimal_query.get_all_ast_referenced_columns()
    }
    if len(total_columns) <= len(minimal_columns):
        return None

    # Ensures the AST minimal query is actually runnable on its own.
    if not minimal_query.validate_aliases():
        return None

    legacy_references = set(minimal_query.get_all_referenced_columns())
    ast_column_names = {
        c.column_name for c in minimal_query.get_all_ast_referenced_columns()
    }
    # Ensures the legacy minimal query (which does not expand alias references)
    # does not contain alias references we removed when creating minimal_query.
    if legacy_references - ast_column_names:
        metrics.increment("columns.skip_invalid_legacy_query")
        return None

    result = runner(minimal_query, request_settings)
    del minimal_query

    if not result.result["data"]:
        return None

    # Making a copy just in case runner returned None (which would drive the execution
    # strategy to ignore the result of this splitter and try the next one).
    query = copy.deepcopy(query)

    event_ids = list(
        set([event[self.__id_column] for event in result.result["data"]])
    )
    if len(event_ids) > settings.COLUMN_SPLIT_MAX_RESULTS:
        # We may be running a query that is beyond the clickhouse maximum query
        # size, so we cowardly abandon.
        metrics.increment("column_splitter.intermediate_results_beyond_limit")
        return None

    query.add_conditions([(self.__id_column, "IN", event_ids)])
    query.add_condition_to_ast(
        in_condition(
            None,
            ColumnExpr(None, None, self.__id_column),
            [LiteralExpr(None, e_id) for e_id in event_ids],
        )
    )
    query.set_offset(0)
    # TODO: This is technically wrong. Event ids are unique per project, not globally.
    # So, if the minimal query only returned the same event_id from two projects, we
    # would be underestimating the limit here.
    query.set_limit(len(event_ids))

    project_ids = list(
        set([event[self.__project_column] for event in result.result["data"]])
    )
    _replace_condition(
        query,
        self.__project_column,
        "IN",
        project_ids,
    )
    _replace_ast_condition(
        query,
        self.__project_column,
        "IN",
        literals_tuple(None, [LiteralExpr(None, p_id) for p_id in project_ids]),
    )

    timestamps = [event[self.__timestamp_column] for event in result.result["data"]]
    _replace_condition(
        query,
        self.__timestamp_column,
        ">=",
        util.parse_datetime(min(timestamps)).isoformat(),
    )
    _replace_ast_condition(
        query,
        self.__timestamp_column,
        ">=",
        LiteralExpr(None, util.parse_datetime(min(timestamps))),
    )
    # We add 1 second since this gets translated to ('timestamp', '<', to_date)
    # and events are stored with a granularity of 1 second.
    _replace_condition(
        query,
        self.__timestamp_column,
        "<",
        (util.parse_datetime(max(timestamps)) + timedelta(seconds=1)).isoformat(),
    )
    _replace_ast_condition(
        query,
        self.__timestamp_column,
        "<",
        LiteralExpr(
            None,
            util.parse_datetime(max(timestamps)) + timedelta(seconds=1),
        ),
    )

    return runner(query, request_settings)
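
# Hedged walk-through of the split above, with invented minimal-query results:
# step one returns only (event_id, project_id, timestamp) rows; step two then
# re-runs the full query constrained to those ids, projects, and the shrunk
# time range, exactly as the _replace_*condition calls encode.

_rows = [
    {"event_id": "a", "project_id": 1, "timestamp": "2020-01-01T00:00:00"},
    {"event_id": "b", "project_id": 1, "timestamp": "2020-01-01T00:00:05"},
    {"event_id": "a", "project_id": 1, "timestamp": "2020-01-01T00:00:00"},
]
event_ids = list({r["event_id"] for r in _rows})  # deduplicated, as above
timestamps = [r["timestamp"] for r in _rows]
assert sorted(event_ids) == ["a", "b"]
assert (min(timestamps), max(timestamps)) == (
    "2020-01-01T00:00:00",
    "2020-01-01T00:00:05",
)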
def execute(
    self,
    query: Query,
    query_settings: QuerySettings,
    runner: SplitQueryRunner,
) -> Optional[QueryResult]:
    """
    Split the query in two steps if a large number of columns is being selected.
    - The first query selects only event_id, project_id and timestamp.
    - The second query selects all fields for only those events.
    - Shrink the date range.
    """
    limit = query.get_limit()
    if (
        limit is None
        or limit == 0
        or query.get_groupby()
        or not query.get_selected_columns()
    ):
        return None

    if limit > settings.COLUMN_SPLIT_MAX_LIMIT:
        metrics.increment("column_splitter.query_above_limit")
        return None

    # Do not split if there is already a = or IN condition on an ID column
    id_column_matcher = FunctionCall(
        Or([String(ConditionFunctions.EQ), String(ConditionFunctions.IN)]),
        (
            Column(None, String(self.__id_column)),
            AnyExpression(),
        ),
    )

    for expr in query.get_condition() or []:
        match = id_column_matcher.match(expr)
        if match:
            return None

    # We need to count the number of table/column name pairs,
    # not the number of distinct Column objects in the query,
    # so as to avoid counting aliased columns multiple times.
    selected_columns = {
        (col.table_name, col.column_name)
        for col in query.get_columns_referenced_in_select()
    }
    if len(selected_columns) < settings.COLUMN_SPLIT_MIN_COLS:
        metrics.increment("column_splitter.main_query_min_threshold")
        return None

    minimal_query = copy.deepcopy(query)
    # TODO: provide the table alias name to this splitter if we ever use it
    # in joins.
    minimal_query.set_ast_selected_columns(
        [
            SelectedExpression(
                self.__id_column,
                ColumnExpr(self.__id_column, None, self.__id_column),
            ),
            SelectedExpression(
                self.__project_column,
                ColumnExpr(self.__project_column, None, self.__project_column),
            ),
            SelectedExpression(
                self.__timestamp_column,
                ColumnExpr(self.__timestamp_column, None, self.__timestamp_column),
            ),
        ]
    )

    for exp in minimal_query.get_all_expressions():
        if exp.alias in (
            self.__id_column,
            self.__project_column,
            self.__timestamp_column,
        ) and not (isinstance(exp, ColumnExpr) and exp.column_name == exp.alias):
            logger.warning(
                "Potential alias shadowing due to column splitter",
                extra={"expression": exp},
                exc_info=True,
            )

    # Ensures the AST minimal query is actually runnable on its own.
    if not minimal_query.validate_aliases():
        return None

    # There is a Clickhouse bug where functions in the ORDER BY clause that are
    # not in the SELECT fail on distributed tables. For that specific case, skip
    # the query splitter.
    for orderby in minimal_query.get_orderby():
        if isinstance(
            orderby.expression, (FunctionCallExpr, CurriedFunctionCallExpr)
        ):
            metrics.increment("column_splitter.orderby_has_a_function")
            return None

    result = runner(minimal_query, query_settings)
    del minimal_query

    if not result.result["data"]:
        metrics.increment("column_splitter.no_data_from_minimal_query")
        return None

    # Making a copy just in case runner returned None (which would drive the execution
    # strategy to ignore the result of this splitter and try the next one).
    query = copy.deepcopy(query)

    event_ids = list(
        set([event[self.__id_column] for event in result.result["data"]])
    )
    if len(event_ids) > settings.COLUMN_SPLIT_MAX_RESULTS:
        # We may be running a query that is beyond the clickhouse maximum query
        # size, so we cowardly abandon.
        metrics.increment("column_splitter.intermediate_results_beyond_limit")
        return None

    query.add_condition_to_ast(
        in_condition(
            ColumnExpr(None, None, self.__id_column),
            [LiteralExpr(None, e_id) for e_id in event_ids],
        )
    )
    query.set_offset(0)
    query.set_limit(len(result.result["data"]))

    project_ids = list(
        set([event[self.__project_column] for event in result.result["data"]])
    )
    _replace_ast_condition(
        query,
        self.__project_column,
        "IN",
        literals_tuple(None, [LiteralExpr(None, p_id) for p_id in project_ids]),
    )

    timestamps = [event[self.__timestamp_column] for event in result.result["data"]]
    _replace_ast_condition(
        query,
        self.__timestamp_column,
        ">=",
        LiteralExpr(None, util.parse_datetime(min(timestamps))),
    )
    # We add 1 second since this gets translated to ('timestamp', '<', to_date)
    # and events are stored with a granularity of 1 second.
    _replace_ast_condition(
        query,
        self.__timestamp_column,
        "<",
        LiteralExpr(
            None,
            util.parse_datetime(max(timestamps)) + timedelta(seconds=1),
        ),
    )

    return runner(query, query_settings)
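
# Hedged illustration of the one-second padding above: events are stored at a
# granularity of one second and the upper bound is exclusive (`timestamp <`),
# so the bound must sit one second past the max timestamp to keep the newest
# matching event in range. The dates are invented for the demo.

from datetime import datetime, timedelta

_max_ts = datetime(2020, 1, 1, 0, 0, 5)
_upper_bound = _max_ts + timedelta(seconds=1)
assert _max_ts < _upper_bound  # `timestamp < _upper_bound` still matches _max_ts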
def test_tags_expander() -> None:
    query_body = """
    MATCH (events)
    SELECT count(platform) AS platforms,
           testF(platform, tags_value) AS top_platforms,
           f1(tags_key, column2) AS f1_alias,
           f2() AS f2_alias
    WHERE tags_key = 'tags_key'
        AND project_id = 1
        AND timestamp >= toDateTime('2020-01-01 12:00:00')
        AND timestamp < toDateTime('2020-01-02 12:00:00')
    HAVING tags_value IN tuple('tag')
    """
    events = get_dataset("events")
    query, _ = parse_snql_query(query_body, events)

    processor = TagsExpanderProcessor()
    query_settings = HTTPQuerySettings()
    processor.process_query(query, query_settings)

    assert query.get_selected_columns() == [
        SelectedExpression(
            "platforms",
            FunctionCall(
                "_snuba_platforms",
                "count",
                (Column("_snuba_platform", None, "platform"),),
            ),
        ),
        SelectedExpression(
            "top_platforms",
            FunctionCall(
                "_snuba_top_platforms",
                "testF",
                (
                    Column("_snuba_platform", None, "platform"),
                    FunctionCall(
                        "_snuba_tags_value",
                        "arrayJoin",
                        (Column(None, None, "tags.value"),),
                    ),
                ),
            ),
        ),
        SelectedExpression(
            "f1_alias",
            FunctionCall(
                "_snuba_f1_alias",
                "f1",
                (
                    FunctionCall(
                        "_snuba_tags_key",
                        "arrayJoin",
                        (Column(None, None, "tags.key"),),
                    ),
                    Column("_snuba_column2", None, "column2"),
                ),
            ),
        ),
        SelectedExpression(
            "f2_alias", FunctionCall("_snuba_f2_alias", "f2", tuple())
        ),
    ]

    condition = query.get_condition()
    assert condition is not None
    conds = get_first_level_and_conditions(condition)
    assert conds[0] == binary_condition(
        OPERATOR_TO_FUNCTION["="],
        FunctionCall(
            "_snuba_tags_key", "arrayJoin", (Column(None, None, "tags.key"),)
        ),
        Literal(None, "tags_key"),
    )

    assert query.get_having() == in_condition(
        FunctionCall(
            "_snuba_tags_value", "arrayJoin", (Column(None, None, "tags.value"),)
        ),
        [Literal(None, "tag")],
    )
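
# Hedged before/after of the rewrite asserted above (the SQL rendering is
# illustrative, not produced by this test):
#
#   before: tags_key = 'tags_key'
#   after:  equals((arrayJoin(tags.key) AS _snuba_tags_key), 'tags_key')
#
# The _snuba_ prefix appears to come from the SnQL parser's alias mangling; the
# expander swaps the virtual column for the arrayJoin call while preserving the
# alias. A minimal mimic of the prefixing relationship checked above:

def _mangle_alias_demo(alias: str) -> str:
    return f"_snuba_{alias}"


assert _mangle_alias_demo("tags_value") == "_snuba_tags_value"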