def test_uuid_array_column_processor(
    unprocessed: Expression,
    expected: Expression,
    formatted_value: str,
) -> None:
    """FixedStringArrayColumnProcessor must leave the select clause untouched
    while rewriting the condition into the expected form, which formats to
    `formatted_value`."""
    query = Query(
        Table("transactions", ColumnSet([])),
        selected_columns=[
            SelectedExpression("column2", Column(None, None, "column2"))
        ],
        condition=unprocessed,
    )
    reference_query = Query(
        Table("transactions", ColumnSet([])),
        selected_columns=[
            SelectedExpression("column2", Column(None, None, "column2"))
        ],
        condition=expected,
    )

    processor = FixedStringArrayColumnProcessor({"column1", "column2"}, 32)
    processor.process_query(query, HTTPQuerySettings())

    # The select clause must come through unchanged.
    assert query.get_selected_columns() == [
        SelectedExpression(
            "column2",
            Column(None, None, "column2"),
        )
    ]
    assert reference_query.get_condition() == query.get_condition()

    condition = query.get_condition()
    assert condition is not None
    assert condition.accept(ClickhouseExpressionFormatter()) == formatted_value
def test_type_condition_optimizer() -> None:
    """TypeConditionOptimizer should replace the `type != 'transaction'`
    condition with a literal 1 while keeping the other AND leg intact."""
    other_condition = binary_condition(
        ConditionFunctions.EQ, Column(None, None, "col1"), Literal(None, "val1")
    )
    query = Query(
        Table("errors", ColumnSet([])),
        condition=binary_condition(
            BooleanFunctions.AND,
            binary_condition(
                ConditionFunctions.NEQ,
                Column(None, None, "type"),
                Literal(None, "transaction"),
            ),
            other_condition,
        ),
    )
    reference_query = Query(
        Table("errors", ColumnSet([])),
        condition=binary_condition(
            BooleanFunctions.AND, Literal(None, 1), other_condition
        ),
    )

    TypeConditionOptimizer().process_query(query, HTTPQuerySettings())

    assert reference_query.get_condition() == query.get_condition()
    condition = query.get_condition()
    assert condition is not None
    assert (
        condition.accept(ClickhouseExpressionFormatter())
        == "1 AND equals(col1, 'val1')"
    )
def test_uuid_array_column_processor(
    unprocessed: Expression,
    expected: Expression,
    formatted_value: str,
) -> None:
    """SliceOfMapOptimizer should rewrite the condition into `expected`,
    which formats to `formatted_value`."""
    query = Query(
        Table("transactions", ColumnSet([])),
        selected_columns=[
            SelectedExpression("column2", Column(None, None, "column2"))
        ],
        condition=unprocessed,
    )
    reference_query = Query(
        Table("transactions", ColumnSet([])),
        selected_columns=[
            SelectedExpression("column2", Column(None, None, "column2"))
        ],
        condition=expected,
    )

    SliceOfMapOptimizer().process_query(query, HTTPRequestSettings())

    assert reference_query.get_condition() == query.get_condition()
    condition = query.get_condition()
    assert condition is not None
    assert condition.accept(ClickhouseExpressionFormatter()) == formatted_value
def test_uuid_array_column_processor(
    unprocessed: Expression,
    expected: Expression,
    formatted_value: str,
) -> None:
    """UUIDArrayColumnProcessor should wrap the selected UUID array column in
    an arrayMap stripping dashes from each stringified element, and rewrite
    the condition into `expected`."""
    query = Query(
        Table("transactions", ColumnSet([])),
        selected_columns=[
            SelectedExpression("column2", Column(None, None, "column2"))
        ],
        condition=unprocessed,
    )
    reference_query = Query(
        Table("transactions", ColumnSet([])),
        selected_columns=[
            SelectedExpression("column2", Column(None, None, "column2"))
        ],
        condition=expected,
    )

    UUIDArrayColumnProcessor({"column1", "column2"}).process_query(
        query, HTTPRequestSettings()
    )

    # Each array element x becomes replaceAll(toString(x), '-', '').
    strip_dashes = Lambda(
        None,
        ("x",),
        FunctionCall(
            None,
            "replaceAll",
            (
                FunctionCall(None, "toString", (Argument(None, "x"),)),
                Literal(None, "-"),
                Literal(None, ""),
            ),
        ),
    )
    assert query.get_selected_columns() == [
        SelectedExpression(
            "column2",
            FunctionCall(
                None,
                "arrayMap",
                (strip_dashes, Column(None, None, "column2")),
            ),
        )
    ]
    assert reference_query.get_condition() == query.get_condition()
    condition = query.get_condition()
    assert condition is not None
    assert condition.accept(ClickhouseExpressionFormatter()) == formatted_value
def test_recursive_useless_condition(
    input_query: ClickhouseQuery,
    expected_query: ClickhouseQuery,
) -> None:
    """MappingOptimizer should simplify useless tag conditions in both the
    WHERE and the HAVING clauses; the condition is mirrored into HAVING so
    a single test exercises both code paths."""
    for q in (input_query, expected_query):
        q.set_ast_having(deepcopy(q.get_condition()))

    optimizer = MappingOptimizer(
        column_name="tags",
        hash_map_name="_tags_hash_map",
        killswitch="tags_hash_map_enabled",
    )
    optimizer.process_query(input_query, HTTPQuerySettings())

    assert input_query == expected_query
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    """Rewrite tag empty-string equality conditions into `has(tags.key, ...)`
    checks, gated behind the "empty-string-tag-condition" experiment so the
    rollout can be compared (50/50 outside tests).
    """

    def process_condition(exp: Expression) -> Expression:
        # Match only the condition shape this optimizer targets.
        result = CONDITION_PATTERN.match(exp)
        if result is not None:
            key_column = result.optional_string(KEY_COL_MAPPING_PARAM)
            if key_column == "tags.key":
                rhs = result.optional_string(KEY_MAPPING_PARAM)
                table_name = result.optional_string(TABLE_MAPPING_PARAM)
                replacement = FunctionCall(
                    exp.alias,
                    "has",
                    (Column(None, table_name, "tags.key"), Literal(None, rhs)),
                )
                # The pattern only matches FunctionCalls; narrow for mypy.
                assert isinstance(exp, FunctionCall)
                if exp.function_name == ConditionFunctions.EQ:
                    # Equality with '' means the tag is absent, so negate.
                    replacement = FunctionCall(exp.alias, "not", (replacement,))
                # Reuse an experiment decision already recorded on this query
                # so every matching condition in it is treated consistently.
                prev_value = query.get_experiment_value(
                    "empty-string-tag-condition"
                )
                if prev_value is not None:
                    return replacement if prev_value == "true" else exp
                # First match for this query: always apply under tests,
                # otherwise flip a coin, and record the outcome.
                if settings.TESTING or random.random() < 0.5:
                    query.add_experiment("empty-string-tag-condition", "true")
                    return replacement
                else:
                    query.add_experiment("empty-string-tag-condition", "false")
        # Non-matching expressions (and the "false" experiment branch) are
        # returned unchanged.
        return exp

    condition = query.get_condition()
    if condition is not None:
        query.set_ast_condition(condition.transform(process_condition))
def test_not_many_groups_to_exclude(query: ClickhouseQuery) -> None:
    """With the exclude limit above the number of replaced groups, the
    enforcer should add a notIn(group_id, ...) filter rather than FINAL."""
    state.set_config("max_group_ids_exclude", 5)
    set_project_exclude_groups(2, [100, 101, 102], ReplacerState.EVENTS)

    enforcer = PostReplacementConsistencyEnforcer("project_id", ReplacerState.EVENTS)
    enforcer.process_query(query, HTTPRequestSettings())

    group_filter = FunctionCall(
        None,
        "notIn",
        (
            FunctionCall(None, "assumeNotNull", (Column(None, None, "group_id"),)),
            FunctionCall(
                None,
                "tuple",
                (Literal(None, 100), Literal(None, 101), Literal(None, 102)),
            ),
        ),
    )
    assert query.get_condition() == FunctionCall(
        None,
        BooleanFunctions.AND,
        (group_filter, build_in("project_id", [2])),
    )
    assert not query.get_from_clause().final
def process_query(self, query: Query, query_settings: QuerySettings) -> None:
    """Apply the tags hash-map optimization to the WHERE and HAVING clauses.

    Each clause is first reduced and classified; if either clause is not
    optimizable, the query is left with the reduced clauses but the hash
    replacement is skipped entirely.
    """
    # Killswitch: the optimization can be disabled at runtime via config.
    if not get_config(self.__killswitch, 1):
        return
    condition, cond_class = self.__get_reduced_and_classified_query_clause(
        query.get_condition(), query
    )
    # The reduced clause is written back even if we later bail out.
    query.set_ast_condition(condition)
    if cond_class == ConditionClass.NOT_OPTIMIZABLE:
        return
    having_cond, having_cond_class = self.__get_reduced_and_classified_query_clause(
        query.get_having(), query
    )
    query.set_ast_having(having_cond)
    if having_cond_class == ConditionClass.NOT_OPTIMIZABLE:
        return
    # Only rewrite when at least one clause is actually optimizable.
    if not (
        cond_class == ConditionClass.OPTIMIZABLE
        or having_cond_class == ConditionClass.OPTIMIZABLE
    ):
        return
    metrics.increment("optimizable_query")
    query.add_experiment("tags_hashmap_applied", 1)
    if condition is not None:
        query.set_ast_condition(condition.transform(self.__replace_with_hash))
    if having_cond is not None:
        query.set_ast_having(having_cond.transform(self.__replace_with_hash))
def get_filtered_mapping_keys(query: Query, column_name: str) -> Sequence[str]:
    """
    Identifies the conditions we can apply the arrayFilter optimization on.
    Which means: if the arrayJoin is in the select clause, there are one or
    more top level AND condition on the arrayJoin and there is no OR
    condition in the query.
    """
    pattern = array_join_pattern(column_name)
    if not any(
        pattern.match(expr) is not None
        for selected in query.get_selected_columns() or []
        for expr in selected.expression
    ):
        # No arrayJoin on this column in the select clause: nothing to do.
        return []

    where = query.get_condition()
    where_keys = (
        _get_mapping_keys_in_condition(where, column_name)
        if where is not None
        else set()
    )
    if where_keys is None:
        # An OR was found. Cowardly we give up even though there could be
        # cases where this condition is still optimizable.
        return []

    having = query.get_having()
    having_keys = (
        _get_mapping_keys_in_condition(having, column_name)
        if having is not None
        else set()
    )
    if having_keys is None:
        # Same as above.
        return []

    return sorted(where_keys | having_keys)
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    """Apply the tags hash-map optimization to the WHERE and HAVING clauses.

    Each clause is classified first; if either clause contains conditions
    that cannot be optimized, the whole query is left untouched.
    """
    # Killswitch: the optimization can be disabled at runtime via config.
    if not get_config(self.__killswitch, 1):
        return
    cond_class = ConditionClass.IRRELEVANT
    condition = query.get_condition()
    if condition is not None:
        cond_class = self.__classify_combined_conditions(condition)
        if cond_class == ConditionClass.NOT_OPTIMIZABLE:
            return
    having_cond_class = ConditionClass.IRRELEVANT
    having_cond = query.get_having()
    if having_cond is not None:
        having_cond_class = self.__classify_combined_conditions(having_cond)
        if having_cond_class == ConditionClass.NOT_OPTIMIZABLE:
            return
    # Only rewrite when at least one clause is actually optimizable.
    if not (
        cond_class == ConditionClass.OPTIMIZABLE
        or having_cond_class == ConditionClass.OPTIMIZABLE
    ):
        return
    metrics.increment("optimizable_query")
    if condition is not None:
        query.set_ast_condition(condition.transform(self.__replace_with_hash))
    if having_cond is not None:
        query.set_ast_having(having_cond.transform(self.__replace_with_hash))
def test_multiple_not_too_many_excludes(
    query_with_multiple_group_ids: ClickhouseQuery,
) -> None:
    """
    Query is looking for multiple groups and there are not too many groups to
    exclude, but there are fewer groups queried for than replaced.
    """
    enforcer = PostReplacementConsistencyEnforcer("project_id", ReplacerState.ERRORS)
    set_project_exclude_groups(
        2,
        [100, 101, 102],
        ReplacerState.ERRORS,
        # Arbitrary replacement type, no impact on tests.
        ReplacementType.EXCLUDE_GROUPS,
    )
    enforcer._set_query_final(query_with_multiple_group_ids, True)
    state.set_config("max_group_ids_exclude", 5)

    enforcer.process_query(query_with_multiple_group_ids, HTTPQuerySettings())

    expected_condition = build_and(
        build_not_in("group_id", [101, 102]),
        build_and(build_in("project_id", [2]), build_in("group_id", [101, 102])),
    )
    assert query_with_multiple_group_ids.get_condition() == expected_condition
    assert not query_with_multiple_group_ids.get_from_clause().final
def test_single_too_many_exclude(
    query_with_single_group_id: ClickhouseQuery,
) -> None:
    """
    Query is looking for a group that has been replaced, and there are too
    many groups to exclude.
    """
    enforcer = PostReplacementConsistencyEnforcer("project_id", ReplacerState.ERRORS)
    set_project_exclude_groups(
        2,
        [100, 101, 102],
        ReplacerState.ERRORS,
        # Arbitrary replacement type, no impact on tests.
        ReplacementType.EXCLUDE_GROUPS,
    )
    enforcer._set_query_final(query_with_single_group_id, True)
    state.set_config("max_group_ids_exclude", 2)

    enforcer.process_query(query_with_single_group_id, HTTPQuerySettings())

    expected_condition = build_and(
        build_not_in("group_id", [101]),
        build_and(build_in("project_id", [2]), build_in("group_id", [101])),
    )
    assert query_with_single_group_id.get_condition() == expected_condition
    assert not query_with_single_group_id.get_from_clause().final
def process_query(self, query: Query, query_settings: QuerySettings) -> None:
    """Rewrite tag empty-string equality conditions into `has(tags.key, ...)`
    checks (negated for the equality form, since `tags[key] = ''` means the
    tag is absent)."""

    def replace_condition(exp: Expression) -> Expression:
        match = CONDITION_PATTERN.match(exp)
        if match is None:
            return exp
        if match.optional_string(KEY_COL_MAPPING_PARAM) != "tags.key":
            return exp
        tag_key = match.optional_string(KEY_MAPPING_PARAM)
        table_name = match.optional_string(TABLE_MAPPING_PARAM)
        has_call = FunctionCall(
            exp.alias,
            "has",
            (Column(None, table_name, "tags.key"), Literal(None, tag_key)),
        )
        # The pattern only matches FunctionCalls; narrow the type for mypy.
        assert isinstance(exp, FunctionCall)
        if exp.function_name == ConditionFunctions.EQ:
            return FunctionCall(exp.alias, "not", (has_call,))
        return has_call

    condition = query.get_condition()
    if condition is not None:
        query.set_ast_condition(condition.transform(replace_condition))
def get_filtered_mapping_keys(
    query: Query,
    extractors: Sequence[Extractor[T]],
    is_skippable_condition: Callable[[Expression], bool],
) -> Sequence[T]:
    """
    Identifies the conditions we can apply the arrayFilter optimization on.
    Which means: if the arrayJoin is in the select clause, there are one or
    more top level AND condition on the arrayJoin and there is no OR
    condition in the query.
    """

    def clause_keys(clause: Optional[Expression]) -> Optional[Set[T]]:
        # A missing clause contributes no keys; a None result from the
        # helper signals an OR was found in the clause.
        if clause is None:
            return set()
        return get_mapping_keys_in_condition(
            clause, extractors, is_skippable_condition
        )

    where_keys = clause_keys(query.get_condition())
    if where_keys is None:
        # This means we found an OR. Cowardly we give up even though there
        # could be cases where this condition is still optimizable.
        return []

    having_keys = clause_keys(query.get_having())
    if having_keys is None:
        # Same as above.
        return []

    return sorted(where_keys | having_keys)
def test_hexint_column_processor(unprocessed: Expression, formatted_value: str) -> None:
    """HexIntColumnProcessor should wrap the selected column in
    lower(hex(...)) and rewrite the condition to format as
    `formatted_value`."""
    query = Query(
        Table("transactions", ColumnSet([])),
        selected_columns=[
            SelectedExpression("column1", Column(None, None, "column1"))
        ],
        condition=unprocessed,
    )

    HexIntColumnProcessor({"column1"}).process_query(query, HTTPQuerySettings())

    assert query.get_selected_columns() == [
        SelectedExpression(
            "column1",
            FunctionCall(
                None,
                "lower",
                (FunctionCall(None, "hex", (Column(None, None, "column1"),)),),
            ),
        )
    ]
    condition = query.get_condition()
    assert condition is not None
    assert condition.accept(ClickhouseExpressionFormatter()) == formatted_value
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    """Check that every mandatory condition checker is satisfied by the
    query's WHERE or PREWHERE clause; either assert (when enforcement is
    enabled via config) or log the missing checkers."""
    unsatisfied = set(self.__condition_checkers)

    def scan_clause(clause: Expression) -> None:
        # Checkers only inspect top-level AND legs of the clause.
        for part in get_first_level_and_conditions(clause):
            for checker in self.__condition_checkers:
                if checker in unsatisfied and checker.check(part):
                    unsatisfied.remove(checker)

    where = query.get_condition()
    if where is not None:
        scan_clause(where)
    prewhere = query.get_prewhere_ast()
    if prewhere is not None:
        scan_clause(prewhere)

    missing_ids = {checker.get_id() for checker in unsatisfied}
    if get_config("mandatory_condition_enforce", 0):
        assert (
            not unsatisfied
        ), f"Missing mandatory columns in query. Missing {missing_ids}"
    else:
        if unsatisfied:
            logger.error(
                "Query is missing mandatory columns",
                extra={"missing_checkers": missing_ids},
            )
def test_without_turbo_without_projects_needing_final(
    query: ClickhouseQuery,
) -> None:
    """With no replacer state, only the project filter is added and FINAL is
    not forced."""
    enforcer = PostReplacementConsistencyEnforcer("project_id", None)
    enforcer.process_query(query, HTTPQuerySettings())

    assert query.get_condition() == build_in("project_id", [2])
    assert not query.get_from_clause().final
def test_not_many_groups_to_exclude(query: ClickhouseQuery) -> None:
    """With the exclude limit above the number of replaced groups, the
    enforcer should add a notIn(group_id, ...) filter rather than FINAL."""
    state.set_config("max_group_ids_exclude", 5)
    set_project_exclude_groups(
        2,
        [100, 101, 102],
        ReplacerState.ERRORS,
        # Arbitrary replacement type, no impact on tests.
        ReplacementType.EXCLUDE_GROUPS,
    )

    enforcer = PostReplacementConsistencyEnforcer("project_id", ReplacerState.ERRORS)
    enforcer.process_query(query, HTTPQuerySettings())

    group_filter = FunctionCall(
        None,
        "notIn",
        (
            FunctionCall(None, "assumeNotNull", (Column(None, None, "group_id"),)),
            FunctionCall(
                None,
                "tuple",
                (Literal(None, 100), Literal(None, 101), Literal(None, 102)),
            ),
        ),
    )
    assert query.get_condition() == build_and(
        group_filter, build_in("project_id", [2])
    )
    assert not query.get_from_clause().final
def test_tags_hash_map(
    query: ClickhouseQuery,
    expected_condition: Expression,
) -> None:
    """MappingOptimizer should rewrite tag conditions into hash-map
    lookups when the killswitch config is enabled."""
    set_config("tags_hash_map_enabled", 1)
    optimizer = MappingOptimizer(
        column_name="tags",
        hash_map_name="_tags_hash_map",
        killswitch="tags_hash_map_enabled",
    )
    optimizer.process_query(query, HTTPRequestSettings())

    assert query.get_condition() == expected_condition
def query_verifier(query: Query, settings: QuerySettings, reader: Reader) -> None:
    """Verify the optimized query: no `arrayElement` tag access may remain
    and at least one `has(_tags_hash_map, ...)` condition must be present.

    Fix: `query.get_condition()` can return None; previously this raised an
    opaque AttributeError on `None.accept(...)`. Assert explicitly instead,
    consistent with the other checks in this module.
    """

    class ConditionVisitor(NoopVisitor):
        def __init__(self) -> None:
            # Set to True when a has(_tags_hash_map, ...) call is seen.
            self.found_hashmap_condition = False

        def visit_function_call(self, exp: FunctionCall) -> None:
            # The optimizer must have removed every arrayElement tag access.
            assert exp.function_name != "arrayElement"
            if (
                exp.function_name == "has"
                and isinstance(exp.parameters[0], Column)
                and exp.parameters[0].column_name == "_tags_hash_map"
            ):
                self.found_hashmap_condition = True
            return super().visit_function_call(exp)

    visitor = ConditionVisitor()
    condition = query.get_condition()
    assert condition is not None
    condition.accept(visitor)
    assert visitor.found_hashmap_condition
def test_tags_processor(
    query_body: MutableMapping[str, Any], expected_query: ClickhouseQuery
) -> None:
    """
    Tests the whole processing in some notable cases.
    """
    processed = parse_and_process(query_body)

    assert processed.get_selected_columns() == expected_query.get_selected_columns()
    assert processed.get_condition() == expected_query.get_condition()
    assert processed.get_having() == expected_query.get_having()
def test_too_many_groups_to_exclude(query: ClickhouseQuery) -> None:
    """When more groups were replaced than the exclude limit allows, the
    enforcer falls back to querying the table with FINAL."""
    state.set_config("max_group_ids_exclude", 2)
    set_project_exclude_groups(2, [100, 101, 102], ReplacerState.EVENTS)

    enforcer = PostReplacementConsistencyEnforcer("project_id", ReplacerState.EVENTS)
    enforcer.process_query(query, HTTPRequestSettings())

    assert query.get_condition() == build_in("project_id", [2])
    assert query.get_from_clause().final
def test_without_turbo_with_projects_needing_final(
    query: ClickhouseQuery,
) -> None:
    """A project flagged as needing FINAL forces FINAL on the query."""
    set_project_needs_final(2, ReplacerState.EVENTS)

    enforcer = PostReplacementConsistencyEnforcer("project_id", ReplacerState.EVENTS)
    enforcer.process_query(query, HTTPRequestSettings())

    assert query.get_condition() == build_in("project_id", [2])
    assert query.get_from_clause().final
def test_translation(
    mappers: TranslationMappers, query: SnubaQuery, expected: ClickhouseQuery
) -> None:
    """Translate the logical query and compare each clause against the
    expected Clickhouse query."""
    translated = QueryTranslator(mappers).translate(query)

    # TODO: consider providing an __eq__ method to the Query class. Or turn
    # it into a dataclass.
    assert expected.get_selected_columns() == translated.get_selected_columns()
    assert expected.get_groupby() == translated.get_groupby()
    assert expected.get_condition() == translated.get_condition()
    assert expected.get_arrayjoin() == translated.get_arrayjoin()
    assert expected.get_having() == translated.get_having()
    assert expected.get_orderby() == translated.get_orderby()
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    """Transform all expressions in the query, giving the condition special
    treatment: the optimizable-condition rewrite is attempted first, and the
    generic expression transformation is only applied when that rewrite was
    a no-op.
    """
    # Transform everything except the condition, which is handled below.
    query.transform_expressions(
        self._process_expressions, skip_transform_condition=True
    )
    condition = query.get_condition()
    if condition is not None:
        processed = condition.transform(self.__process_optimizable_condition)
        # If the optimizable rewrite changed nothing, fall back to the
        # generic transformation on the original condition.
        if processed == condition:
            processed = condition.transform(self._process_expressions)
        query.set_ast_condition(processed)
def test_spans_processor(
    query: ClickhouseQuery,
    expected_selected_columns: List[SelectedExpression],
    expected_conditions: Optional[Expression],
) -> None:
    """Run the bloom-filter and array-join optimizers over a spans query and
    compare the resulting select clause and condition."""
    query_settings = HTTPQuerySettings()
    for processor in (
        BloomFilterOptimizer("spans", ["op", "group"], ["exclusive_time"]),
        ArrayJoinOptimizer("spans", ["op", "group"], ["exclusive_time"]),
    ):
        processor.process_query(query, query_settings)

    assert query.get_selected_columns() == expected_selected_columns
    assert query.get_condition() == expected_conditions
def test_without_turbo_with_projects_needing_final(
    query: ClickhouseQuery,
) -> None:
    """A project flagged as needing FINAL forces FINAL on the query."""
    set_project_needs_final(
        2,
        ReplacerState.ERRORS,
        # Arbitrary replacement type, no impact on tests.
        ReplacementType.EXCLUDE_GROUPS,
    )

    enforcer = PostReplacementConsistencyEnforcer("project_id", ReplacerState.ERRORS)
    enforcer.process_query(query, HTTPQuerySettings())

    assert query.get_condition() == build_in("project_id", [2])
    assert query.get_from_clause().final
def test_too_many_groups_to_exclude(query: ClickhouseQuery) -> None:
    """When more groups were replaced than the exclude limit allows, the
    enforcer falls back to querying the table with FINAL."""
    state.set_config("max_group_ids_exclude", 2)
    set_project_exclude_groups(
        2,
        [100, 101, 102],
        ReplacerState.ERRORS,
        # Arbitrary replacement type, no impact on tests.
        ReplacementType.EXCLUDE_GROUPS,
    )

    enforcer = PostReplacementConsistencyEnforcer("project_id", ReplacerState.ERRORS)
    enforcer.process_query(query, HTTPQuerySettings())

    assert query.get_condition() == build_in("project_id", [2])
    assert query.get_from_clause().final
def process_query(self, query: Query, query_settings: QuerySettings) -> None:
    """Transform all expressions in the query, giving the condition special
    treatment: conditions containing unoptimizable parts get the generic
    transformation; otherwise the optimizable rewrite is attempted first and
    the generic transformation is only applied when that rewrite was a no-op.
    """
    # Transform everything except the condition, which is handled below.
    query.transform_expressions(
        self._process_expressions, skip_transform_condition=True
    )
    condition = query.get_condition()
    if condition is not None:
        if self.__contains_unoptimizable_condition(condition):
            # Cannot optimize: apply only the generic transformation.
            processed = condition.transform(self._process_expressions)
        else:
            processed = condition.transform(
                self.__process_optimizable_condition
            )
            # If the optimizable rewrite changed nothing, fall back to the
            # generic transformation.
            if condition == processed:
                processed = processed.transform(self._process_expressions)
        query.set_ast_condition(processed)
def test_no_groups_too_many_excludes(query: ClickhouseQuery) -> None:
    """
    Query has no groups, and too many to exclude.
    """
    enforcer = PostReplacementConsistencyEnforcer("project_id", ReplacerState.ERRORS)
    set_project_exclude_groups(
        2,
        [100, 101, 102],
        ReplacerState.ERRORS,
        # Arbitrary replacement type, no impact on tests.
        ReplacementType.EXCLUDE_GROUPS,
    )
    enforcer._set_query_final(query, True)
    state.set_config("max_group_ids_exclude", 1)

    enforcer.process_query(query, HTTPQuerySettings())

    assert query.get_condition() == build_in("project_id", [2])
    assert query.get_from_clause().final