def test_format_expressions( name: str, query: ClickhouseQuery, expected_query: ClickhouseQuery ) -> None: MappingColumnPromoter({"tags": {"promoted_tag": "promoted"}}).process_query( query, HTTPQuerySettings() ) assert query.get_selected_columns() == expected_query.get_selected_columns()
def test_hexint_column_processor(unprocessed: Expression, formatted_value: str) -> None: unprocessed_query = Query( Table("transactions", ColumnSet([])), selected_columns=[ SelectedExpression("column1", Column(None, None, "column1")) ], condition=unprocessed, ) HexIntColumnProcessor(set(["column1" ])).process_query(unprocessed_query, HTTPQuerySettings()) assert unprocessed_query.get_selected_columns() == [ SelectedExpression( "column1", FunctionCall( None, "lower", (FunctionCall( None, "hex", (Column(None, None, "column1"), ), ), ), ), ) ] condition = unprocessed_query.get_condition() assert condition is not None ret = condition.accept(ClickhouseExpressionFormatter()) assert ret == formatted_value
def test_uuid_array_column_processor( unprocessed: Expression, expected: Expression, formatted_value: str, ) -> None: unprocessed_query = Query( Table("transactions", ColumnSet([])), selected_columns=[ SelectedExpression("column2", Column(None, None, "column2")) ], condition=unprocessed, ) expected_query = Query( Table("transactions", ColumnSet([])), selected_columns=[ SelectedExpression("column2", Column(None, None, "column2")) ], condition=expected, ) FixedStringArrayColumnProcessor(set(["column1", "column2"]), 32).process_query(unprocessed_query, HTTPQuerySettings()) assert unprocessed_query.get_selected_columns() == [ SelectedExpression( "column2", Column(None, None, "column2"), ) ] assert expected_query.get_condition() == unprocessed_query.get_condition() condition = unprocessed_query.get_condition() assert condition is not None ret = condition.accept(ClickhouseExpressionFormatter()) assert ret == formatted_value
def query_runner(query: Query, settings: RequestSettings, reader: Reader) -> QueryResult: assert query.get_selected_columns() == [ SelectedExpression( "org_id", Column("_snuba_org_id", None, "org_id"), ), SelectedExpression( "project_id", Column("_snuba_project_id", None, "project_id"), ), SelectedExpression( "tags[10]", FunctionCall( "_snuba_tags[10]", "arrayElement", ( Column(None, None, "tags.value"), FunctionCall( None, "indexOf", (Column(None, None, "tags.key"), Literal(None, 10)), ), ), ), ), SelectedExpression( column_name, translated_value, ), ] return QueryResult({}, {})
def query_runner(query: Query, settings: QuerySettings, reader: Reader) -> QueryResult: assert query.get_selected_columns() == [ SelectedExpression( "tags[transaction]", Column("_snuba_tags[transaction]", None, "transaction_name"), ), SelectedExpression( "contexts[browser.name]", FunctionCall( "_snuba_contexts[browser.name]", "arrayElement", ( Column(None, None, "contexts.value"), FunctionCall( None, "indexOf", ( Column(None, None, "contexts.key"), Literal(None, "browser.name"), ), ), ), ), ), ] return QueryResult({}, {})
def get_filtered_mapping_keys(query: Query, column_name: str) -> Sequence[str]: """ Identifies the conditions we can apply the arrayFilter optimization on. Which means: if the arrayJoin is in the select clause, there are one or more top level AND condition on the arrayJoin and there is no OR condition in the query. """ array_join_found = any( array_join_pattern(column_name).match(f) is not None for selected in query.get_selected_columns() or [] for f in selected.expression) if not array_join_found: return list() ast_condition = query.get_condition() cond_keys = (_get_mapping_keys_in_condition(ast_condition, column_name) if ast_condition is not None else set()) if cond_keys is None: # This means we found an OR. Cowardly we give up even though there could # be cases where this condition is still optimizable. return [] ast_having = query.get_having() having_keys = (_get_mapping_keys_in_condition(ast_having, column_name) if ast_having is not None else set()) if having_keys is None: # Same as above return [] keys = cond_keys | having_keys return sorted(list(keys))
def process_query(self, query: Query, query_settings: QuerySettings) -> None: having_clause = query.get_having() if not having_clause: return None selected_columns = query.get_selected_columns() uniq_matcher = Param("function", FunctionCallMatch(String("uniq"))) found_functions = [] for exp in having_clause: match = uniq_matcher.match(exp) if match is not None: found_functions.append(match.expression("function")) if found_functions is not None: matcher = _ExpressionOrAliasMatcher(found_functions) for col in selected_columns: col.expression.accept(matcher) if not all(matcher.found_expressions): should_throw = get_config("throw_on_uniq_select_and_having", False) error = MismatchedAggregationException( "Aggregation is in HAVING clause but not SELECT", query=str(query)) if should_throw: raise error else: logging.warning( "Aggregation is in HAVING clause but not SELECT", exc_info=True, extra=cast(Dict[str, Any], error.to_dict()), )
def test_events_column_format_expressions() -> None: unprocessed = Query( Table("events", ColumnSet([])), selected_columns=[ SelectedExpression("dr_claw", Column("dr_claw", None, "culprit")), SelectedExpression("the_group_id", Column("the_group_id", None, "group_id")), SelectedExpression("the_message", Column("the_message", None, "message")), ], ) expected_query = Query( Table("events", ColumnSet([])), selected_columns=[ SelectedExpression("dr_claw", Column("dr_claw", None, "culprit")), SelectedExpression( "the_group_id", FunctionCall( "the_group_id", "nullIf", ( Column(None, None, "group_id"), Literal(None, 0), ), ), ), SelectedExpression( "the_message", Column("the_message", None, "message"), ), ], ) GroupIdColumnProcessor().process_query(unprocessed, HTTPRequestSettings()) assert expected_query.get_selected_columns( ) == unprocessed.get_selected_columns() expected = ( "(nullIf(group_id, 0) AS the_group_id)", "(message AS the_message)", ) for idx, column in enumerate(unprocessed.get_selected_columns()[1:]): formatted = column.expression.accept(ClickhouseExpressionFormatter()) assert expected[idx] == formatted
def test_event_id_column_format_expressions() -> None: unprocessed = Query( Table("events", ColumnSet([])), selected_columns=[ SelectedExpression( "transaction.duration", Column("transaction.duration", None, "duration")), SelectedExpression("the_event_id", Column("the_event_id", None, "event_id")), ], ) expected = Query( Table("events", ColumnSet([])), selected_columns=[ SelectedExpression( "transaction.duration", Column("transaction.duration", None, "duration")), SelectedExpression( "the_event_id", FunctionCall( "the_event_id", "replaceAll", ( FunctionCall( None, "toString", (Column(None, None, "event_id"), ), ), Literal(None, "-"), Literal(None, ""), ), ), ), ], ) UUIDColumnProcessor({"event_id"}).process_query(unprocessed, HTTPQuerySettings()) assert expected.get_selected_columns() == unprocessed.get_selected_columns( ) formatted = unprocessed.get_selected_columns()[1].expression.accept( ClickhouseExpressionFormatter()) assert formatted == "(replaceAll(toString(event_id), '-', '') AS the_event_id)"
def test_tags_processor(query_body: MutableMapping[str, Any], expected_query: ClickhouseQuery) -> None: """ Tests the whole processing in some notable cases. """ processed = parse_and_process(query_body) assert processed.get_selected_columns( ) == expected_query.get_selected_columns() assert processed.get_condition() == expected_query.get_condition() assert processed.get_having() == expected_query.get_having()
def test_uuid_array_column_processor( unprocessed: Expression, expected: Expression, formatted_value: str, ) -> None: unprocessed_query = Query( Table("transactions", ColumnSet([])), selected_columns=[ SelectedExpression("column2", Column(None, None, "column2")) ], condition=unprocessed, ) expected_query = Query( Table("transactions", ColumnSet([])), selected_columns=[ SelectedExpression("column2", Column(None, None, "column2")) ], condition=expected, ) UUIDArrayColumnProcessor(set(["column1", "column2" ])).process_query(unprocessed_query, HTTPRequestSettings()) assert unprocessed_query.get_selected_columns() == [ SelectedExpression( "column2", FunctionCall( None, "arrayMap", ( Lambda( None, ("x", ), FunctionCall( None, "replaceAll", ( FunctionCall(None, "toString", (Argument(None, "x"), )), Literal(None, "-"), Literal(None, ""), ), ), ), Column(None, None, "column2"), ), ), ) ] assert expected_query.get_condition() == unprocessed_query.get_condition() condition = unprocessed_query.get_condition() assert condition is not None ret = condition.accept(ClickhouseExpressionFormatter()) assert ret == formatted_value
def test_translation(mappers: TranslationMappers, query: SnubaQuery, expected: ClickhouseQuery) -> None: translated = QueryTranslator(mappers).translate(query) # TODO: consider providing an __eq__ method to the Query class. Or turn it into # a dataclass. assert expected.get_selected_columns() == translated.get_selected_columns() assert expected.get_groupby() == translated.get_groupby() assert expected.get_condition() == translated.get_condition() assert expected.get_arrayjoin() == translated.get_arrayjoin() assert expected.get_having() == translated.get_having() assert expected.get_orderby() == translated.get_orderby()
def test_spans_processor( query: ClickhouseQuery, expected_selected_columns: List[SelectedExpression], expected_conditions: Optional[Expression], ) -> None: query_settings = HTTPQuerySettings() bloom_filter_processor = BloomFilterOptimizer("spans", ["op", "group"], ["exclusive_time"]) bloom_filter_processor.process_query(query, query_settings) array_join_processor = ArrayJoinOptimizer("spans", ["op", "group"], ["exclusive_time"]) array_join_processor.process_query(query, query_settings) assert query.get_selected_columns() == expected_selected_columns assert query.get_condition() == expected_conditions
def do_query( query: ClickhouseQuery, request_settings: RequestSettings, reader: Reader, ) -> QueryResult: selected_col_names = [ c.expression.column_name for c in query.get_selected_columns() or [] if isinstance(c.expression, Column) ] if selected_col_names == list(first_query_data[0].keys()): return QueryResult({"data": first_query_data}, {}) elif selected_col_names == list(second_query_data[0].keys()): return QueryResult({"data": second_query_data}, {}) else: raise ValueError( f"Unexpected selected columns: {selected_col_names}")
def query_runner(query: Query, settings: RequestSettings, reader: Reader) -> QueryResult: quantiles = tuple( Literal(None, quant) for quant in [0.5, 0.75, 0.9, 0.95, 0.99, 1]) assert query.get_selected_columns() == [ SelectedExpression( "duration_quantiles", CurriedFunctionCall( "_snuba_duration_quantiles", FunctionCall( None, "quantilesIfMerge", quantiles, ), (Column(None, None, "duration_quantiles"), ), ), ), SelectedExpression( "sessions", FunctionCall( "_snuba_sessions", "plus", ( FunctionCall(None, "countIfMerge", (Column(None, None, "sessions"), )), FunctionCall( None, "sumIfMerge", (Column(None, None, "sessions_preaggr"), ), ), ), ), ), SelectedExpression( "users", FunctionCall("_snuba_users", "uniqIfMerge", (Column(None, None, "users"), )), ), ] return QueryResult({}, {})
def test_events_boolean_context() -> None: columns = ColumnSet( [("contexts", Nested([("key", String()), ("value", String())]))] ) query = ClickhouseQuery( Table("errors", columns), selected_columns=[ SelectedExpression( "contexts[device.charging]", FunctionCall( "contexts[device.charging]", "arrayElement", ( Column(None, None, "contexts.value"), FunctionCall( None, "indexOf", ( Column(None, None, "contexts.key"), Literal(None, "device.charging"), ), ), ), ), ) ], ) expected = ClickhouseQuery( Table("errors", columns), selected_columns=[ SelectedExpression( "contexts[device.charging]", FunctionCall( "contexts[device.charging]", "if", ( binary_condition( ConditionFunctions.IN, FunctionCall( None, "arrayElement", ( Column(None, None, "contexts.value"), FunctionCall( None, "indexOf", ( Column(None, None, "contexts.key"), Literal(None, "device.charging"), ), ), ), ), literals_tuple( None, [Literal(None, "1"), Literal(None, "True")] ), ), Literal(None, "True"), Literal(None, "False"), ), ), ) ], ) settings = HTTPQuerySettings() EventsBooleanContextsProcessor().process_query(query, settings) assert query.get_selected_columns() == expected.get_selected_columns()
def find_pattern(query: Query, pattern: FunctionCall) -> bool: return any( pattern.match(f) is not None for selected in query.get_selected_columns() or [] for f in selected.expression )
def execute( self, query: Query, request_settings: RequestSettings, runner: SplitQueryRunner, ) -> Optional[QueryResult]: """ Split query in 2 steps if a large number of columns is being selected. - First query only selects event_id, project_id and timestamp. - Second query selects all fields for only those events. - Shrink the date range. """ limit = query.get_limit() if (limit is None or limit == 0 or query.get_groupby() or query.get_aggregations() or not query.get_selected_columns()): return None if limit > settings.COLUMN_SPLIT_MAX_LIMIT: metrics.increment("column_splitter.query_above_limit") return None # Do not split if there is already a = or IN condition on an ID column id_column_matcher = FunctionCall( Or([String(ConditionFunctions.EQ), String(ConditionFunctions.IN)]), ( Column(None, String(self.__id_column)), AnyExpression(), ), ) for expr in query.get_condition_from_ast() or []: match = id_column_matcher.match(expr) if match: return None # We need to count the number of table/column name pairs # not the number of distinct Column objects in the query # so to avoid counting aliased columns multiple times. total_columns = {(col.table_name, col.column_name) for col in query.get_all_ast_referenced_columns()} minimal_query = copy.deepcopy(query) minimal_query.set_selected_columns( [self.__id_column, self.__project_column, self.__timestamp_column]) # TODO: provide the table alias name to this splitter if we ever use it # in joins. minimal_query.set_ast_selected_columns([ SelectedExpression(self.__id_column, ColumnExpr(None, None, self.__id_column)), SelectedExpression(self.__project_column, ColumnExpr(None, None, self.__project_column)), SelectedExpression( self.__timestamp_column, ColumnExpr(None, None, self.__timestamp_column), ), ]) for exp in minimal_query.get_all_expressions(): if exp.alias in ( self.__id_column, self.__project_column, self.__timestamp_column, ) and not (isinstance(exp, ColumnExpr) and exp.column_name == exp.alias): logger.warning( "Potential alias shadowing due to column splitter", extra={"expression": exp}, exc_info=True, ) minimal_columns = { (col.table_name, col.column_name) for col in minimal_query.get_all_ast_referenced_columns() } if len(total_columns) <= len(minimal_columns): return None # Ensures the AST minimal query is actually runnable on its own. if not minimal_query.validate_aliases(): return None legacy_references = set(minimal_query.get_all_referenced_columns()) ast_column_names = { c.column_name for c in minimal_query.get_all_ast_referenced_columns() } # Ensures the legacy minimal query (which does not expand alias references) # does not contain alias references we removed when creating minimal_query. if legacy_references - ast_column_names: metrics.increment("columns.skip_invalid_legacy_query") return None result = runner(minimal_query, request_settings) del minimal_query if not result.result["data"]: return None # Making a copy just in case runner returned None (which would drive the execution # strategy to ignore the result of this splitter and try the next one). query = copy.deepcopy(query) event_ids = list( set([event[self.__id_column] for event in result.result["data"]])) if len(event_ids) > settings.COLUMN_SPLIT_MAX_RESULTS: # We may be runing a query that is beyond clickhouse maximum query size, # so we cowardly abandon. metrics.increment( "column_splitter.intermediate_results_beyond_limit") return None query.add_conditions([(self.__id_column, "IN", event_ids)]) query.add_condition_to_ast( in_condition( None, ColumnExpr(None, None, self.__id_column), [LiteralExpr(None, e_id) for e_id in event_ids], )) query.set_offset(0) # TODO: This is technically wrong. Event ids are unique per project, not globally. # So, if the minimal query only returned the same event_id from two projects, we # would be underestimating the limit here. query.set_limit(len(event_ids)) project_ids = list( set([ event[self.__project_column] for event in result.result["data"] ])) _replace_condition( query, self.__project_column, "IN", project_ids, ) _replace_ast_condition( query, self.__project_column, "IN", literals_tuple(None, [LiteralExpr(None, p_id) for p_id in project_ids]), ) timestamps = [ event[self.__timestamp_column] for event in result.result["data"] ] _replace_condition( query, self.__timestamp_column, ">=", util.parse_datetime(min(timestamps)).isoformat(), ) _replace_ast_condition( query, self.__timestamp_column, ">=", LiteralExpr(None, util.parse_datetime(min(timestamps))), ) # We add 1 second since this gets translated to ('timestamp', '<', to_date) # and events are stored with a granularity of 1 second. _replace_condition( query, self.__timestamp_column, "<", (util.parse_datetime(max(timestamps)) + timedelta(seconds=1)).isoformat(), ) _replace_ast_condition( query, self.__timestamp_column, "<", LiteralExpr( None, (util.parse_datetime(max(timestamps)) + timedelta(seconds=1)), ), ) return runner(query, request_settings)
def test_events_promoted_boolean_context() -> None: columns = ColumnSet( [ ("device_charging", UInt(8, Modifier(nullable=True))), ("contexts", Nested([("key", String()), ("value", String())])), ] ) query = ClickhouseQuery( Table("events", columns), selected_columns=[ SelectedExpression( "contexts[device.charging]", FunctionCall( "contexts[device.charging]", "arrayElement", ( Column(None, None, "contexts.value"), FunctionCall( None, "indexOf", ( Column(None, None, "contexts.key"), Literal(None, "device.charging"), ), ), ), ), ) ], ) expected = ClickhouseQuery( Table("events", columns), selected_columns=[ SelectedExpression( "contexts[device.charging]", FunctionCall( "contexts[device.charging]", "if", ( binary_condition( ConditionFunctions.IN, FunctionCall( None, "toString", (Column(None, None, "device_charging"),), ), literals_tuple( None, [Literal(None, "1"), Literal(None, "True")] ), ), Literal(None, "True"), Literal(None, "False"), ), ), ) ], ) settings = HTTPQuerySettings() MappingColumnPromoter( {"contexts": {"device.charging": "device_charging"}}, cast_to_string=True ).process_query(query, settings) EventsPromotedBooleanContextsProcessor().process_query(query, settings) assert query.get_selected_columns() == expected.get_selected_columns()
def execute( self, query: Query, query_settings: QuerySettings, runner: SplitQueryRunner, ) -> Optional[QueryResult]: """ Split query in 2 steps if a large number of columns is being selected. - First query only selects event_id, project_id and timestamp. - Second query selects all fields for only those events. - Shrink the date range. """ limit = query.get_limit() if (limit is None or limit == 0 or query.get_groupby() or not query.get_selected_columns()): return None if limit > settings.COLUMN_SPLIT_MAX_LIMIT: metrics.increment("column_splitter.query_above_limit") return None # Do not split if there is already a = or IN condition on an ID column id_column_matcher = FunctionCall( Or([String(ConditionFunctions.EQ), String(ConditionFunctions.IN)]), ( Column(None, String(self.__id_column)), AnyExpression(), ), ) for expr in query.get_condition() or []: match = id_column_matcher.match(expr) if match: return None # We need to count the number of table/column name pairs # not the number of distinct Column objects in the query # so to avoid counting aliased columns multiple times. selected_columns = { (col.table_name, col.column_name) for col in query.get_columns_referenced_in_select() } if len(selected_columns) < settings.COLUMN_SPLIT_MIN_COLS: metrics.increment("column_splitter.main_query_min_threshold") return None minimal_query = copy.deepcopy(query) # TODO: provide the table alias name to this splitter if we ever use it # in joins. minimal_query.set_ast_selected_columns([ SelectedExpression( self.__id_column, ColumnExpr(self.__id_column, None, self.__id_column), ), SelectedExpression( self.__project_column, ColumnExpr(self.__project_column, None, self.__project_column), ), SelectedExpression( self.__timestamp_column, ColumnExpr(self.__timestamp_column, None, self.__timestamp_column), ), ]) for exp in minimal_query.get_all_expressions(): if exp.alias in ( self.__id_column, self.__project_column, self.__timestamp_column, ) and not (isinstance(exp, ColumnExpr) and exp.column_name == exp.alias): logger.warning( "Potential alias shadowing due to column splitter", extra={"expression": exp}, exc_info=True, ) # Ensures the AST minimal query is actually runnable on its own. if not minimal_query.validate_aliases(): return None # There is a Clickhouse bug where if functions in the ORDER BY clause are not in the SELECT, # they fail on distributed tables. For that specific case, skip the query splitter. for orderby in minimal_query.get_orderby(): if isinstance(orderby.expression, (FunctionCallExpr, CurriedFunctionCallExpr)): metrics.increment("column_splitter.orderby_has_a_function") return None result = runner(minimal_query, query_settings) del minimal_query if not result.result["data"]: metrics.increment("column_splitter.no_data_from_minimal_query") return None # Making a copy just in case runner returned None (which would drive the execution # strategy to ignore the result of this splitter and try the next one). query = copy.deepcopy(query) event_ids = list( set([event[self.__id_column] for event in result.result["data"]])) if len(event_ids) > settings.COLUMN_SPLIT_MAX_RESULTS: # We may be runing a query that is beyond clickhouse maximum query size, # so we cowardly abandon. metrics.increment( "column_splitter.intermediate_results_beyond_limit") return None query.add_condition_to_ast( in_condition( ColumnExpr(None, None, self.__id_column), [LiteralExpr(None, e_id) for e_id in event_ids], )) query.set_offset(0) query.set_limit(len(result.result["data"])) project_ids = list( set([ event[self.__project_column] for event in result.result["data"] ])) _replace_ast_condition( query, self.__project_column, "IN", literals_tuple(None, [LiteralExpr(None, p_id) for p_id in project_ids]), ) timestamps = [ event[self.__timestamp_column] for event in result.result["data"] ] _replace_ast_condition( query, self.__timestamp_column, ">=", LiteralExpr(None, util.parse_datetime(min(timestamps))), ) # We add 1 second since this gets translated to ('timestamp', '<', to_date) # and events are stored with a granularity of 1 second. _replace_ast_condition( query, self.__timestamp_column, "<", LiteralExpr( None, (util.parse_datetime(max(timestamps)) + timedelta(seconds=1)), ), ) return runner(query, query_settings)
def test_replace_expression() -> None: """ Create a query with the new AST and replaces a function with a different function replaces f1(...) with tag(f1) """ column1 = Column(None, "t1", "c1") column2 = Column(None, "t1", "c2") function_1 = FunctionCall("alias", "f1", (column1, column2)) function_2 = FunctionCall("alias", "f2", (column2,)) condition = binary_condition(ConditionFunctions.EQ, function_1, Literal(None, "1")) prewhere = binary_condition(ConditionFunctions.EQ, function_1, Literal(None, "2")) orderby = OrderBy(OrderByDirection.ASC, function_2) query = Query( Table("my_table", ColumnSet([])), selected_columns=[SelectedExpression("alias", function_1)], array_join=None, condition=condition, groupby=[function_1], having=None, prewhere=prewhere, order_by=[orderby], ) def replace(exp: Expression) -> Expression: if isinstance(exp, FunctionCall) and exp.function_name == "f1": return FunctionCall(exp.alias, "tag", (Literal(None, "f1"),)) return exp query.transform_expressions(replace) expected_query = Query( Table("my_table", ColumnSet([])), selected_columns=[ SelectedExpression( "alias", FunctionCall("alias", "tag", (Literal(None, "f1"),)) ) ], array_join=None, condition=binary_condition( ConditionFunctions.EQ, FunctionCall("alias", "tag", (Literal(None, "f1"),)), Literal(None, "1"), ), groupby=[FunctionCall("alias", "tag", (Literal(None, "f1"),))], prewhere=binary_condition( ConditionFunctions.EQ, FunctionCall("alias", "tag", (Literal(None, "f1"),)), Literal(None, "2"), ), having=None, order_by=[orderby], ) assert query.get_selected_columns() == expected_query.get_selected_columns() assert query.get_condition() == expected_query.get_condition() assert query.get_groupby() == expected_query.get_groupby() assert query.get_having() == expected_query.get_having() assert query.get_orderby() == expected_query.get_orderby() assert list(query.get_all_expressions()) == list( expected_query.get_all_expressions() )