def test_format_expressions(query_body: MutableMapping[str, Any], expected_query: Query) -> None:
    """
    Parses a raw query body against the events dataset and checks every
    AST clause against a hand-built expected query.

    The two Query objects cannot be compared with ``==`` directly: one
    carries only the AST while the other carries the AST plus the raw
    body, so each AST accessor is compared individually.
    """
    events = get_dataset("events")
    query = parse_query(query_body, events)

    # Compare clause by clause via unbound accessors so each clause gets
    # its own assertion failure message.
    clause_getters = (
        Query.get_selected_columns_from_ast,
        Query.get_groupby_from_ast,
        Query.get_condition_from_ast,
        Query.get_arrayjoin_from_ast,
        Query.get_having_from_ast,
        Query.get_orderby_from_ast,
    )
    for getter in clause_getters:
        assert getter(query) == getter(expected_query)
def __get_filter_tags(self, query: Query) -> List[str]:
    """
    Identifies the tag names we can apply the arrayFilter optimization on.
    Which means: if the tags_key column is in the select clause and there
    are one or more top level conditions on the tags_key column.
    """
    if not state.get_config("ast_tag_processor_enabled", 0):
        return []

    # The optimization only makes sense when tags_key is actually selected.
    selected = query.get_selected_columns_from_ast() or []
    references_tags_key = any(
        isinstance(node, Column) and node.column_name == "tags_key"
        for expression in selected
        for node in expression
    )
    if not references_tags_key:
        return []

    def tags_in(condition: Optional[Expression]) -> Optional[List[str]]:
        # None signals "found an OR somewhere in the tree, give up";
        # an empty list simply means no usable tag conditions.
        if not condition:
            return []
        for node in condition:
            if is_binary_condition(node, BooleanFunctions.OR):
                return None
        return self.__extract_top_level_tag_conditions(condition)

    where_tags = tags_in(query.get_condition_from_ast())
    if where_tags is None:
        # This means we found an OR. Cowardly we give up even though there
        # could be cases where this condition is still optimizable.
        return []

    having_tags = tags_in(query.get_having_from_ast())
    if having_tags is None:
        # Same as above
        return []

    return where_tags + having_tags
def __init__(
    self,
    query: Query,
    settings: RequestSettings,
) -> None:
    """
    Snapshots every clause of the Snuba query into private attributes so
    this object can be processed independently of the Snuba Query. Safe to
    copy references since the AST components are immutable.
    """
    # AST clauses, lifted out of the Snuba query verbatim.
    self.__selected_columns = query.get_selected_columns_from_ast()
    self.__condition = query.get_condition_from_ast()
    self.__groupby = query.get_groupby_from_ast()
    self.__having = query.get_having_from_ast()
    self.__orderby = query.get_orderby_from_ast()
    self.__data_source = query.get_data_source()
    self.__arrayjoin = query.get_arrayjoin_from_ast()

    # Pagination / sampling parameters.
    self.__granularity = query.get_granularity()
    self.__limit = query.get_limit()
    self.__limitby = query.get_limitby()
    self.__offset = query.get_offset()

    # HAVING filters aggregated rows, so it is meaningless without GROUP BY.
    if self.__having:
        assert self.__groupby, "found HAVING clause with no GROUP BY"

    # Clickhouse specific fields. Some are still in the Snuba
    # query and have to be moved.
    self.__turbo = settings.get_turbo()
    self.__final = query.get_final()
    self.__sample = query.get_sample()
    self.__hastotals = query.has_totals()
    # TODO: Pre where processing will become a step in Clickhouse Query
    # processing instead of being pulled from the Snuba Query
    self.__prewhere = query.get_prewhere_ast()

    self.__settings = settings
    # Lazily populated cache of the formatted SQL string.
    self.__formatted_query: Optional[str] = None
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    """
    Rewrites legacy-representation conditions on the nested column into
    LIKE / NOT LIKE conditions on the flattened column, when it is safe
    to do so. Mutates ``query`` in place via ``set_conditions``; bails out
    (leaving the query untouched) whenever a guard below fails.
    """
    conditions = query.get_conditions()
    if not conditions:
        return

    # Enable the processor only if we have enough data in the flattened
    # columns. Which have been deployed at BEGINNING_OF_TIME. If the query
    # starts earlier than that we do not apply the optimization.
    if self.__beginning_of_time:
        apply_optimization = False
        for condition in conditions:
            # Look for a lower-bound condition on a timestamp column:
            # [col, ">=" or ">", "datetime string"].
            if (
                is_condition(condition)
                and isinstance(condition[0], str)
                and condition[0] in self.__timestamp_cols
                and condition[1] in (">=", ">")
                and isinstance(condition[2], str)
            ):
                try:
                    start_ts = parse_datetime(condition[2])
                    if (start_ts - self.__beginning_of_time).total_seconds() > 0:
                        apply_optimization = True
                except Exception:
                    # We should not get here, it means the from timestamp is
                    # malformed. Returning here is just for safety.
                    logger.error(
                        "Cannot parse start date for NestedFieldOptimizer: %r",
                        condition,
                    )
                    return
        if not apply_optimization:
            return

    # Do not use flattened tags if tags are being unpacked anyway. In that
    # case using flattened tags only implies loading an additional column
    # thus making the query heavier and slower.
    if self.__has_tags(query.get_arrayjoin_from_ast()):
        return
    if query.get_groupby_from_ast():
        for expression in query.get_groupby_from_ast():
            if self.__has_tags(expression):
                return
    if self.__has_tags(query.get_having_from_ast()):
        return
    if query.get_orderby_from_ast():
        for orderby in query.get_orderby_from_ast():
            if self.__has_tags(orderby.expression):
                return

    # Partition conditions: optimizable key/value conditions are collected
    # into positive ("=") and negative ("!=") buckets; everything else is
    # kept as-is.
    new_conditions = []
    positive_like_expression: List[str] = []
    negative_like_expression: List[str] = []
    for c in conditions:
        keyvalue = self.__is_optimizable(c, self.__nested_col)
        if not keyvalue:
            new_conditions.append(c)
        else:
            # key=value fragment as it appears in the flattened column
            # (presumably the flattened format is "|k1=v1|k2=v2|..." —
            # TODO confirm against the column's writer).
            expression = f"{escape_field(keyvalue.nested_col_key)}={escape_field(keyvalue.value)}"
            if keyvalue.operand == Operand.EQ:
                positive_like_expression.append(expression)
            else:
                negative_like_expression.append(expression)

    if positive_like_expression:
        # Positive conditions "=" are all merged together in one LIKE
        # expression. Sorted for a deterministic pattern.
        positive_like_expression = sorted(positive_like_expression)
        like_formatted = f"%|{'|%|'.join(positive_like_expression)}|%"
        new_conditions.append(
            [self.__flattened_col, "LIKE", like_formatted])

    for expression in negative_like_expression:
        # Negative conditions "!=" cannot be merged together. We can still
        # transform them into NOT LIKE statements, but each condition has
        # to be one statement.
        not_like_formatted = f"%|{expression}|%"
        new_conditions.append(
            [self.__flattened_col, "NOT LIKE", not_like_formatted])

    query.set_conditions(new_conditions)
def test_replace_expression():
    """
    Create a query with the new AST and replaces a function
    with a different function replaces f1(...) with tag(f1)
    """
    t1_c1 = Column(None, "c1", "t1")
    t1_c2 = Column(None, "c2", "t1")
    f1_call = FunctionCall("alias", "f1", (t1_c1, t1_c2))
    f2_call = FunctionCall("alias", "f2", (t1_c2,))
    where_clause = binary_condition(
        None, ConditionFunctions.EQ, f1_call, Literal(None, "1")
    )
    order_clause = OrderBy(OrderByDirection.ASC, f2_call)

    query = Query(
        {},
        TableSource("my_table", ColumnSet([])),
        selected_columns=[f1_call],
        array_join=None,
        condition=where_clause,
        groupby=[f1_call],
        having=None,
        order_by=[order_clause],
    )

    def substitute(exp: Expression) -> Expression:
        # Rewrite only f1 calls; every other node passes through unchanged.
        if isinstance(exp, FunctionCall) and exp.function_name == "f1":
            return FunctionCall(exp.alias, "tag", (Literal(None, "f1"),))
        return exp

    query.transform_expressions(substitute)

    def tag_f1() -> FunctionCall:
        # What every f1(...) occurrence should have been rewritten into.
        return FunctionCall("alias", "tag", (Literal(None, "f1"),))

    expected_query = Query(
        {},
        TableSource("my_table", ColumnSet([])),
        selected_columns=[tag_f1()],
        array_join=None,
        condition=binary_condition(
            None, ConditionFunctions.EQ, tag_f1(), Literal(None, "1")
        ),
        groupby=[tag_f1()],
        having=None,
        order_by=[order_clause],
    )

    assert (
        query.get_selected_columns_from_ast()
        == expected_query.get_selected_columns_from_ast()
    )
    assert query.get_condition_from_ast() == expected_query.get_condition_from_ast()
    assert query.get_groupby_from_ast() == expected_query.get_groupby_from_ast()
    assert query.get_having_from_ast() == expected_query.get_having_from_ast()
    assert query.get_orderby_from_ast() == expected_query.get_orderby_from_ast()
    assert list(query.get_all_expressions()) == list(
        expected_query.get_all_expressions()
    )