def __init__(self, uuid_columns: Set[str]) -> None:
    """
    Prepare the patterns used to recognise conditions that involve one of
    the columns named in ``uuid_columns``.

    Two patterns are exposed: ``uuid_in_condition`` for IN / NOT IN calls
    against a ``tuple(...)`` and ``uuid_condition`` for every other function
    listed in FUNCTION_TO_OPERATOR.
    """
    self.__unique_uuid_columns = uuid_columns
    # formatted_uuid_pattern() is invoked below; keep this assignment first
    # since that method appears to read this attribute.
    self.__uuid_column_match = Or([String(u_col) for u_col in uuid_columns])

    membership_operators = Or(
        (String(ConditionFunctions.IN), String(ConditionFunctions.NOT_IN))
    )
    tuple_operand = Param("params", FunctionCallMatch(String("tuple"), None))
    self.uuid_in_condition = FunctionCallMatch(
        membership_operators,
        (self.formatted_uuid_pattern(), tuple_operand),
    )

    # Every known function except the two membership ones handled above.
    binary_operators = Or(
        [
            String(op)
            for op in FUNCTION_TO_OPERATOR
            if op not in (ConditionFunctions.IN, ConditionFunctions.NOT_IN)
        ]
    )
    # Each side is either a literal or the formatted column; the suffix
    # keeps the captured parameter names of the two sides distinct.
    first_operand = Or(
        (
            Param("literal_0", LiteralMatch(AnyOptionalString())),
            self.formatted_uuid_pattern("_0"),
        )
    )
    second_operand = Or(
        (
            Param("literal_1", LiteralMatch(AnyOptionalString())),
            self.formatted_uuid_pattern("_1"),
        )
    )
    self.uuid_condition = FunctionCallMatch(
        binary_operators, (first_operand, second_operand)
    )

    self.formatted: Optional[str] = None
def _get_date_range(query: Query) -> Optional[int]:
    """
    Best-effort estimate of the number of days covered by the query.

    While iterating over the query condition, the first expression that
    compares a column with a datetime literal through ConditionFunctions.GT
    or ConditionFunctions.GTE selects the column; ``get_time_range`` is then
    asked for the bounds on that column.

    Returns None when the query has no condition, when nothing matches, or
    when either bound is missing.
    """
    where_clause = query.get_condition_from_ast()
    if where_clause is None:
        return None

    datetime_comparison = FunctionCall(
        Or([String(ConditionFunctions.GT), String(ConditionFunctions.GTE)]),
        (Column(None, Param("col_name", Any(str))), Literal(Any(datetime))),
    )

    for expression in where_clause:
        found = datetime_comparison.match(expression)
        if found is None:
            continue
        # Only the first matching column is considered.
        lower, upper = get_time_range(query, found.string("col_name"))
        if lower is not None and upper is not None:
            return (upper - lower).days
        return None

    return None
def condition_pattern(
    operators: Set[str],
    lhs_pattern: Pattern[Expression],
    rhs_pattern: Pattern[Expression],
    commutative: bool,
) -> Pattern[Expression]:
    """
    Builds a pattern for a two-operand function call whose name is one of
    ``operators``.

    When ``commutative`` is set the returned pattern accepts the operands in
    either order, otherwise only ``(lhs_pattern, rhs_pattern)`` is accepted.
    """

    def _ordered(
        first: Pattern[Expression], second: Pattern[Expression]
    ) -> FunctionCallPattern:
        # A fresh operator pattern is built for every call pattern.
        return FunctionCallPattern(
            Or([String(op) for op in operators]), (first, second)
        )

    if not commutative:
        return _ordered(lhs_pattern, rhs_pattern)

    return Or(
        [
            _ordered(lhs_pattern, rhs_pattern),
            _ordered(rhs_pattern, lhs_pattern),
        ]
    )
def get_time_range_estimate(
    query: ProcessableQuery[Table],
) -> Tuple[Optional[datetime], Optional[datetime]]:
    """
    Best-effort guess of the time range covered by the query.

    The first expression found while iterating over the query condition that
    compares a column with a datetime literal (ConditionFunctions.GT or
    ConditionFunctions.GTE) selects the column, and ``get_time_range`` is
    used to resolve its bounds. ``(None, None)`` is returned when the query
    has no condition or nothing matches.
    """
    datetime_comparison = FunctionCall(
        Or([String(ConditionFunctions.GT), String(ConditionFunctions.GTE)]),
        (Column(None, Param("col_name", Any(str))), Literal(Any(datetime))),
    )

    where_clause = query.get_condition()
    if where_clause is None:
        return None, None

    for expression in where_clause:
        found = datetime_comparison.match(expression)
        if found is not None:
            # Stop at the first match.
            lower, upper = get_time_range(query, found.string("col_name"))
            return lower, upper

    return None, None
def get_time_range_expressions(
    conditions: Sequence[Expression],
    timestamp_field: str,
    table_name: Optional[str] = None,
) -> Tuple[
    Optional[Tuple[datetime, FunctionCallExpr]],
    Optional[Tuple[datetime, FunctionCallExpr]],
]:
    """
    Finds the tightest bounds on ``timestamp_field`` among ``conditions``.

    Only conditions of the form ``<timestamp column> >= <datetime literal>``
    and ``<timestamp column> < <datetime literal>`` are considered. When
    ``table_name`` is provided (and non empty) the column must belong to
    that table.

    Returns a pair ``(max_lower_bound, min_upper_bound)``. Each element is
    either None (no such condition found) or a tuple made of the datetime
    and the condition expression it was taken from.
    """
    lower_bound_function = OPERATOR_TO_FUNCTION[">="]
    upper_bound_function = OPERATOR_TO_FUNCTION["<"]
    table_match = String(table_name) if table_name else None

    # The pattern does not depend on the condition being inspected, so it
    # is built once instead of once per condition.
    bound_pattern = FunctionCall(
        Param(
            "operator",
            Or([String(lower_bound_function), String(upper_bound_function)]),
        ),
        (
            Column(table_match, String(timestamp_field)),
            Literal(Param("timestamp", Any(datetime))),
        ),
    )

    max_lower_bound: Optional[Tuple[datetime, FunctionCallExpr]] = None
    min_upper_bound: Optional[Tuple[datetime, FunctionCallExpr]] = None

    for c in conditions:
        match = bound_pattern.match(c)
        if match is None:
            continue

        timestamp = cast(datetime, match.scalar("timestamp"))
        # Narrows the type for the checker: the pattern is a function call.
        assert isinstance(c, FunctionCallExpr)

        if match.string("operator") == lower_bound_function:
            if max_lower_bound is None or timestamp > max_lower_bound[0]:
                max_lower_bound = (timestamp, c)
        else:
            if min_upper_bound is None or timestamp < min_upper_bound[0]:
                min_upper_bound = (timestamp, c)

    return (max_lower_bound, min_upper_bound)
def test_accessors() -> None:
    """
    Checks that parameters captured inside a nested function call pattern
    can be read back through ``expression`` and ``scalar``.
    """
    first_arg_pattern = FunctionCall(
        String("f"), (Column(None, String("my_col")),)
    )
    second_arg_pattern = Param(
        "second_function",
        FunctionCall(Param("second_function_name", Any(str)), None),
    )
    func = FunctionCall(
        String("f_name"), (first_arg_pattern, second_arg_pattern)
    )

    matched_expression = FunctionCallExpr(
        "irrelevant",
        "f_name",
        (
            FunctionCallExpr(None, "f", (ColumnExpr(None, None, "my_col"),)),
            FunctionCallExpr(None, "second_name", tuple()),
        ),
    )
    result = func.match(matched_expression)

    assert result is not None
    expected_second_function = FunctionCallExpr(None, "second_name", tuple())
    assert result.expression("second_function") == expected_second_function
    assert result.scalar("second_function_name") == "second_name"
def replace_condition(expression: Expression) -> Expression:
    """
    Swaps the second operand of ``<field> <operator> <anything>`` with
    ``new_operand``; any other expression is returned untouched.

    ``operator``, ``field`` and ``new_operand`` are not defined here; they
    are expected to come from the enclosing scope.
    """
    target = FunctionCall(
        String(OPERATOR_TO_FUNCTION[operator]),
        (Param("column", Column(None, String(field))), AnyExpression()),
    )
    found = target.match(expression)
    if found is None:
        return expression

    # Keep the matched column as it is, only the other operand changes.
    return replace(
        expression, parameters=(found.expression("column"), new_operand)
    )
def __set_condition_pattern(
    lhs: Pattern[Expression], operator: str
) -> FunctionCallPattern:
    """
    Builds the pattern ``operator(lhs, tuple(...))``.

    The left operand is captured as the ``lhs`` parameter and the
    ``tuple`` function call as the ``tuple`` parameter.
    """
    left_operand = Param("lhs", lhs)
    tuple_operand = Param("tuple", FunctionCallPattern(String("tuple"), None))
    return FunctionCallPattern(String(operator), (left_operand, tuple_operand))
def replace_exp(exp: Expression) -> Expression:
    """
    Replaces ``notEquals(type, 'transaction')`` with the literal ``1``.

    Any expression that does not match that exact shape is returned as is.
    """
    matcher = FunctionCall(
        String("notEquals"),
        (Column(None, String("type")), Literal(String("transaction"))),
    )
    # A failed match is signalled with None. Compare against None explicitly
    # instead of relying on the truthiness of the result object, which
    # captures no parameter here.
    if matcher.match(exp) is not None:
        return LiteralExpr(None, 1)
    return exp
def get_project_ids_in_condition(
    condition: Expression) -> Optional[Set[int]]:
    """
    Extracts the project ids referenced by an expression.

    Returns None if no condition on the project column is found. Returns
    an empty set if conflicting project id conditions are found (the
    intersection of the two sides of an AND is empty).

    ``project_column`` is not defined here; it is expected to come from
    the enclosing scope.
    """
    # Case 1: project_column = <int literal>
    equality = FunctionCall(
        None,
        String(ConditionFunctions.EQ),
        (
            Column(column_name=String(project_column)),
            Literal(value=Param("project_id", Any(int))),
        ),
    ).match(condition)
    if equality is not None:
        return {equality.integer("project_id")}

    # Case 2: project_column IN tuple(...); non-integer members are ignored.
    membership = is_in_condition_pattern(
        Column(column_name=String(project_column))).match(condition)
    if membership is not None:
        projects = membership.expression("tuple")
        assert isinstance(projects, FunctionCallExpr)
        return {
            lit.value
            for lit in projects.parameters
            if isinstance(lit, LiteralExpr) and isinstance(lit.value, int)
        }

    # Case 3: AND / OR of two sub-conditions, each inspected recursively.
    combination = FunctionCall(
        None,
        Param(
            "operator",
            Or([String(BooleanFunctions.AND), String(BooleanFunctions.OR)]),
        ),
        (Param("lhs", AnyExpression()), Param("rhs", AnyExpression())),
    ).match(condition)
    if combination is None:
        return None

    lhs_projects = get_project_ids_in_condition(
        combination.expression("lhs"))
    rhs_projects = get_project_ids_in_condition(
        combination.expression("rhs"))

    # NOTE(review): when only one side constrains the project, that side is
    # returned for OR as well as for AND -- confirm this is intended.
    if lhs_projects is None:
        return rhs_projects
    if rhs_projects is None:
        return lhs_projects

    if combination.string("operator") == BooleanFunctions.AND:
        return lhs_projects & rhs_projects
    return lhs_projects | rhs_projects
def __init__(
    self,
    column_name: str,
    key_names: Sequence[str],
    val_names: Sequence[str],
):
    """
    Initializes the parent class and prepares the pattern
    ``arrayJoin(<column>)`` where the column is any of ``self.all_columns``.
    The matched column name is captured as the ``col`` parameter.
    """
    super().__init__(column_name, key_names, val_names)

    any_known_column = Or([String(column) for column in self.all_columns])
    joined_column = Column(column_name=Param("col", any_known_column))
    self.__array_join_pattern = FunctionCall(
        String("arrayJoin"), (joined_column,)
    )
def extract_granularity_from_query(query: Query, column: str) -> Optional[int]:
    """
    Extracts the `granularity` from the `groupby` statement of the query.

    The matches are essentially the reverse of
    `TimeSeriesProcessor.__group_time_function`: either a rounding function
    applied to ``column`` (looked up in GRANULARITY_MAPPING) or the
    arithmetic form
    ``toDateTime(multiply(intDiv(toUInt32(column), g), g), <str>)``.

    Returns None when no group by expression matches.
    """
    column_match = ColumnMatch(None, String(column))

    # Form 1: a rounding function applied directly to the column.
    rounding_functions = Or(
        [
            String("toStartOfHour"),
            String("toStartOfMinute"),
            String("toStartOfDay"),
            String("toDate"),
        ]
    )
    fn_match = FunctionCallMatch(
        Param("time_fn", rounding_functions),
        (column_match,),
        with_optionals=True,
    )

    # Form 2: integer division followed by multiplication, built inside out.
    as_uint32 = FunctionCallMatch(String("toUInt32"), (column_match,))
    divided = FunctionCallMatch(
        String("intDiv"),
        (as_uint32, LiteralMatch(Param("granularity", Any(int)))),
    )
    multiplied = FunctionCallMatch(
        String("multiply"),
        (divided, LiteralMatch(Param("granularity", Any(int)))),
    )
    expr_match = FunctionCallMatch(
        String("toDateTime"), (multiplied, LiteralMatch(Any(str)))
    )

    for top_level in query.get_groupby():
        for node in top_level:
            by_function = fn_match.match(node)
            if by_function is not None:
                return GRANULARITY_MAPPING[by_function.string("time_fn")]

            by_arithmetic = expr_match.match(node)
            if by_arithmetic is not None:
                return by_arithmetic.integer("granularity")

    return None
def process_query(self, query: Query, query_settings: QuerySettings) -> None:
    """
    Reports ``uniq`` calls used in the HAVING clause that the matcher does
    not find among the selected columns.

    Depending on the ``throw_on_uniq_select_and_having`` config (default
    False) a MismatchedAggregationException is either raised or logged as
    a warning. Nothing happens when the query has no HAVING clause or the
    HAVING clause contains no ``uniq`` call.
    """
    having_clause = query.get_having()
    if not having_clause:
        return None

    uniq_matcher = Param("function", FunctionCallMatch(String("uniq")))
    found_functions = []
    for exp in having_clause:
        match = uniq_matcher.match(exp)
        if match is not None:
            found_functions.append(match.expression("function"))

    # The list is never None, so test for emptiness: with no uniq call in
    # HAVING there is nothing to verify.
    if not found_functions:
        return None

    matcher = _ExpressionOrAliasMatcher(found_functions)
    for col in query.get_selected_columns():
        col.expression.accept(matcher)

    if all(matcher.found_expressions):
        return None

    should_throw = get_config("throw_on_uniq_select_and_having", False)
    error = MismatchedAggregationException(
        "Aggregation is in HAVING clause but not SELECT", query=str(query))
    if should_throw:
        raise error

    # No exception is being handled here, so exc_info=True would log no
    # exception details. Pass the exception instance explicitly.
    logging.warning(
        "Aggregation is in HAVING clause but not SELECT",
        exc_info=error,
        extra=cast(Dict[str, Any], error.to_dict()),
    )
def formatted_uuid_pattern(self, suffix: str = "") -> FunctionCallMatch:
    """
    Builds the pattern ``replaceAll(toString(<uuid column>), ...)``.

    The matched column is captured under the parameter name
    ``"formatted_uuid_column" + suffix`` so that several instances of the
    pattern can coexist in one match. The outer call is built with
    ``with_optionals=True``.
    """
    uuid_column = Param(
        "formatted_uuid_column" + suffix,
        ColumnMatch(None, self.__uuid_column_match),
    )
    stringified = FunctionCallMatch(String("toString"), (uuid_column,))
    return FunctionCallMatch(
        String("replaceAll"), (stringified,), with_optionals=True
    )
def is_skippable_condition(conditions: Expression) -> bool:
    """
    A condition composed of a bunch of has(column, ...) conditions OR'ed
    together can be ignored when looking for filter keys because these are
    the conditions used for the bloom filter index on the array column.

    Returns True when, for at least one of ``column_names`` (taken from the
    enclosing scope), every first level OR condition is a
    ``has(<column>, <str literal>)`` call.
    """
    # The decomposition does not depend on the column, so compute it once.
    # list() keeps it reusable across columns even if an iterator is returned.
    or_conditions = list(get_first_level_or_conditions(conditions))

    for column_name in column_names:
        has_pattern = FunctionCall(
            String("has"),
            (Column(column_name=String(column_name)), Literal(Any(str))),
        )
        # A failed match is None; compare explicitly rather than relying on
        # the truthiness of the result object.
        if all(has_pattern.match(c) is not None for c in or_conditions):
            return True
    return False
def array_join_pattern(*column_names: str) -> FunctionCall:
    """
    Builds the array join pattern for the given columns.

    A single column yields the pattern returned by ``_array_join_pattern``
    directly; any other number of columns yields a ``tuple(...)`` call whose
    parameters are the per-column patterns, in the order given.
    """
    if len(column_names) != 1:
        members = tuple(_array_join_pattern(name) for name in column_names)
        return FunctionCall(String("tuple"), members)

    (only_column,) = column_names
    return _array_join_pattern(only_column)
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    """
    Rewrites ``toString(<column>)`` for the device_simulator, device_online
    and device_charging columns into a ``multiIf`` producing '', 'True' or
    'False'. The alias of the original expression moves to the ``multiIf``.
    """
    # Only the promoted contexts matter here, hence the original nested
    # expression does not need to be matched.
    promoted_column = Or(
        [
            String("device_simulator"),
            String("device_online"),
            String("device_charging"),
        ]
    )
    matcher = FunctionCall(String("toString"), (Column(None, promoted_column),))

    def replace_exp(exp: Expression) -> Expression:
        if matcher.match(exp) is None:
            return exp

        # The inner copy loses the alias, which is kept on the outer call.
        inner = replace(exp, alias=None)
        is_empty = binary_condition(
            None, ConditionFunctions.EQ, inner, Literal(None, "")
        )
        is_true = binary_condition(
            None,
            ConditionFunctions.IN,
            inner,
            literals_tuple(None, [Literal(None, "1"), Literal(None, "True")]),
        )
        return FunctionCallExpr(
            exp.alias,
            "multiIf",
            (
                is_empty,
                Literal(None, ""),
                is_true,
                Literal(None, "True"),
                Literal(None, "False"),
            ),
        )

    query.transform_expressions(replace_exp)
def get_time_range(
    query: Query, timestamp_field: str
) -> Tuple[Optional[datetime], Optional[datetime]]:
    """
    Finds the minimal time range for this query. Which means, it finds
    the >= timestamp condition with the highest datetime literal and
    the < timestamp condition with the smallest and returns the interval
    in the form of a tuple of datetimes (None for a bound that is not
    found).

    It only looks into first level AND conditions since, if the timestamp
    is nested in an OR we cannot say anything on how that compares to the
    other timestamp conditions.
    """
    condition_clause = query.get_condition_from_ast()
    if not condition_clause:
        return (None, None)

    lower_bound_function = OPERATOR_TO_FUNCTION[">="]
    upper_bound_function = OPERATOR_TO_FUNCTION["<"]
    # The pattern is the same for every condition: build it only once.
    bound_pattern = FunctionCall(
        None,
        Param(
            "operator",
            Or([String(lower_bound_function), String(upper_bound_function)]),
        ),
        (
            Column(None, None, String(timestamp_field)),
            Literal(None, Param("timestamp", Any(datetime))),
        ),
    )

    max_lower_bound: Optional[datetime] = None
    min_upper_bound: Optional[datetime] = None
    for c in get_first_level_and_conditions(condition_clause):
        match = bound_pattern.match(c)
        if match is None:
            continue

        timestamp = cast(datetime, match.scalar("timestamp"))
        if match.string("operator") == lower_bound_function:
            if max_lower_bound is None or timestamp > max_lower_bound:
                max_lower_bound = timestamp
        else:
            if min_upper_bound is None or timestamp < min_upper_bound:
                min_upper_bound = timestamp

    return (max_lower_bound, min_upper_bound)
def extractor(condition: Expression) -> Set[str]:
    """
    Returns the string literal compared for equality with ``key_pattern``
    (taken from the enclosing scope) as a one element set, or an empty set
    when the condition does not have that shape.
    """
    equality = FunctionCall(
        String(ConditionFunctions.EQ),
        (key_pattern, Literal(Param("key", Any(str)))),
    )
    found = equality.match(condition)
    return set() if found is None else {found.string("key")}
def test_binary_match() -> None:
    """
    ``condition_pattern`` must accept swapped operands only when it is
    built as commutative.
    """
    c1 = binary_condition(
        ConditionFunctions.EQ,
        Column(None, "table1", "column1"),
        Literal(None, "test"),
    )
    column_side = ColumnPattern(String("table1"), String("column1"))
    literal_side = LiteralPattern(String("test"))

    def attempt(first, second, commutative):  # type: ignore
        # Matches c1 against an equality pattern with the given operand order.
        return condition_pattern(
            {ConditionFunctions.EQ}, first, second, commutative
        ).match(c1)

    # Operands in the same order as the condition: always matches.
    assert attempt(column_side, literal_side, True) is not None
    assert attempt(column_side, literal_side, False) is not None
    # Operands swapped: matches only when commutative.
    assert attempt(literal_side, column_side, True) is not None
    assert attempt(literal_side, column_side, False) is None
def __init__(self, column_name: str, hash_map_name: str, killswitch: str) -> None:
    """
    Stores the three settings and builds the ``equals`` pattern kept in
    ``__optimizable_pattern``.

    The pattern compares ``mapping_pattern`` (defined outside this method),
    either bare or wrapped as ``ifNull(<mapping>, '')``, with a string
    literal captured as ``right_hand_side``.
    """
    self.__column_name = column_name
    self.__hash_map_name = hash_map_name
    self.__killswitch = killswitch

    if_null_mapping = FunctionCall(
        function_name=String("ifNull"),
        parameters=(mapping_pattern, Literal(String(""))),
    )
    left_hand_side = Or([mapping_pattern, if_null_mapping])
    right_hand_side = Param("right_hand_side", Literal(Any(str)))

    # TODO: Add the support for IN conditions.
    self.__optimizable_pattern = FunctionCall(
        function_name=String("equals"),
        parameters=(left_hand_side, right_hand_side),
    )
def __init__(self, array_columns: Sequence[str]):
    """
    Prepares the pattern ``equals(has(<column>, <str literal>), 1)`` where
    the column is any of ``array_columns``. The inner ``has`` call is
    captured as the ``has`` parameter.
    """
    any_array_column = Column(
        column_name=Or([String(column) for column in array_columns])
    )
    has_call = FunctionCall(
        String("has"), (any_array_column, Literal(Any(str)))
    )
    self.__array_has_pattern = FunctionCall(
        String("equals"), (Param("has", has_call), Literal(Integer(1)))
    )
def build_match(
    col: str, ops: Sequence[str], param_type: Any, alias: Optional[str] = None
) -> Or[Expression]:
    """
    Builds a pattern for conditions on column ``col``.

    Two shapes are accepted: ``<op>(col, <literal>)`` for any ``op`` in
    ``ops``, and ``IN(col, array(...) | tuple(...))`` where every member is
    a literal. Literal values must satisfy ``AnyPattern(param_type)``. The
    column is captured as ``column`` and the other operand as ``rhs``.

    When ``alias`` is None any (or no) table name is accepted, otherwise
    the table name must equal ``alias``.
    """
    alias_match = AnyOptionalString() if alias is None else String(alias)
    column_match = Param("column", ColumnPattern(alias_match, String(col)))

    scalar_comparison = FunctionCallPattern(
        Or([String(op) for op in ops]),
        (column_match, Param("rhs", LiteralPattern(AnyPattern(param_type)))),
    )

    # The IN condition has to be checked separately since each parameter
    # has to be checked individually.
    collection = FunctionCallPattern(
        Or([String("array"), String("tuple")]),
        all_parameters=LiteralPattern(AnyPattern(param_type)),
    )
    in_comparison = FunctionCallPattern(
        String(ConditionFunctions.IN),
        (column_match, Param("rhs", collection)),
    )

    return Or([scalar_comparison, in_comparison])
def __init__(self, column_name: str, hash_map_name: str, killswitch: str) -> None:
    """
    Stores the three settings and builds the patterns used by this
    instance:

    - ``__optimizable_pattern``: ``equals(<mapping>, <str literal>)``
    - ``__tag_exists_patterns``: ``notEquals(<mapping>, <str literal>)`` and
      ``has(<column_name>.key, <str literal>)``

    ``<mapping>`` is ``mapping_pattern`` (defined outside this method),
    either bare or wrapped as ``ifNull(<mapping>, '')``.
    """
    self.__column_name = column_name
    self.__hash_map_name = hash_map_name
    self.__killswitch = killswitch

    def mapping_or_if_null():  # type: ignore
        # Builds a fresh left hand side pattern on every call.
        return Or(
            [
                mapping_pattern,
                FunctionCall(
                    function_name=String("ifNull"),
                    parameters=(mapping_pattern, Literal(String(""))),
                ),
            ]
        )

    # TODO: Add the support for IN conditions.
    self.__optimizable_pattern = FunctionCall(
        function_name=String("equals"),
        parameters=(
            mapping_or_if_null(),
            Param("right_hand_side", Literal(Any(str))),
        ),
    )

    not_equals_pattern = FunctionCall(
        function_name=String("notEquals"),
        parameters=(
            mapping_or_if_null(),
            Param("right_hand_side", Literal(Any(str))),
        ),
    )
    key_column = ColumnMatcher(
        Param(TABLE_MAPPING_PARAM, AnyOptionalString()),
        Param(VALUE_COL_MAPPING_PARAM, String(f"{column_name}.key")),
    )
    has_key_pattern = FunctionCall(
        function_name=String("has"),
        parameters=(key_column, Literal(Param(KEY_MAPPING_PARAM, Any(str)))),
    )
    self.__tag_exists_patterns = [not_equals_pattern, has_key_pattern]
def extractor(condition: Expression) -> Set[Tuple[str, ...]]:
    """
    Extracts the string literals of the ``tuple(...)`` compared for equality
    with ``key_pattern`` (taken from the enclosing scope).

    Returns a set holding one tuple with the string members, in order
    (non-string members are skipped), or an empty set when the condition
    does not have that shape.
    """
    tuple_equality = FunctionCall(
        String(ConditionFunctions.EQ),
        (key_pattern, Param("tuple", FunctionCall(String("tuple"), None))),
    )
    found = tuple_equality.match(condition)
    if found is None:
        return set()

    function = found.expression("tuple")
    if isinstance(function, FunctionCallExpr) and function.function_name == "tuple":
        keys = []
        for param in function.parameters:
            if isinstance(param, LiteralExpr) and isinstance(param.value, str):
                keys.append(param.value)
        return {tuple(keys)}

    return set()
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    """
    Rewrites ``arrayElement(contexts.value, indexOf(contexts.key, <key>))``
    for the device.simulator, device.online and device.charging keys into
    ``if(<expr> IN ('1', 'True'), 'True', 'False')``. The alias of the
    original expression moves to the ``if`` call.
    """
    matcher = FunctionCall(
        String("arrayElement"),
        (
            Column(
                None,
                String("contexts.value"),
            ),
            FunctionCall(
                String("indexOf"),
                (
                    Column(None, String("contexts.key")),
                    Literal(
                        Or([
                            String("device.simulator"),
                            String("device.online"),
                            String("device.charging"),
                        ]),
                    ),
                ),
            ),
        ),
    )

    def process_column(exp: Expression) -> Expression:
        # A failed match is None. Compare explicitly instead of relying on
        # the truthiness of the result object, which captures no parameter.
        if matcher.match(exp) is None:
            return exp

        # The inner copy loses the alias, which is kept on the outer call.
        inner = replace(exp, alias=None)
        return FunctionCallExpr(
            exp.alias,
            "if",
            (
                binary_condition(
                    ConditionFunctions.IN,
                    inner,
                    literals_tuple(
                        None,
                        [
                            LiteralExpr(None, "1"),
                            LiteralExpr(None, "True")
                        ],
                    ),
                ),
                LiteralExpr(None, "True"),
                LiteralExpr(None, "False"),
            ),
        )

    query.transform_expressions(process_column)
def test_not_in_condition() -> None:
    """
    A NOT IN condition must be recognised by ``is_not_in_condition`` and its
    operands exposed by ``is_not_in_condition_pattern`` as ``lhs`` and
    ``tuple``.
    """

    def tags_key_column():  # type: ignore
        return Column(None, None, "tags_key")

    def expected_values():  # type: ignore
        return literals_tuple(None, [Literal(None, "t1"), Literal(None, "t2")])

    not_in_condition = binary_condition(
        ConditionFunctions.NOT_IN, tags_key_column(), expected_values()
    )
    assert is_not_in_condition(not_in_condition)

    pattern = is_not_in_condition_pattern(
        ColumnPattern(None, String("tags_key"))
    )
    match = pattern.match(not_in_condition)
    assert match is not None
    assert match.expression("tuple") == expected_values()
    assert match.expression("lhs") == tags_key_column()
def _get_mapping_keys_in_condition(
    condition: Expression, column_name: str
) -> Optional[Set[str]]:
    """
    Finds the top level conditions that include filter based on the
    arrayJoin. This is meant to be used to find the keys the query is
    filtering the arrayJoin on.
    We can only apply the arrayFilter optimization to arrayJoin conditions
    that are not in OR with other columns. To simplify the problem, we only
    consider those conditions that are included in the first level of the
    query:
    [['tagskey' '=' 'a'],['col' '=' 'b'],['col2' '=' 'c']] works
    [[['tagskey' '=' 'a'], ['col2' '=' 'b']], ['tagskey' '=' 'c']] does not

    If we encounter an OR condition we return None, which means we cannot
    safely apply the optimization. Empty set means we did not find any
    suitable arrayJoin for optimization in this condition but that does
    not disqualify the whole query in the way the OR condition does.
    """
    # Neither pattern depends on the condition being inspected, so both are
    # built once rather than once per first level condition.
    equality_pattern = FunctionCall(
        None,
        String(ConditionFunctions.EQ),
        (array_join_pattern(column_name), Literal(None, Param("key", Any(str)))),
    )
    in_pattern = is_in_condition_pattern(array_join_pattern(column_name))

    keys_found: Set[str] = set()
    for c in get_first_level_and_conditions(condition):
        if is_binary_condition(c, BooleanFunctions.OR):
            return None

        match = equality_pattern.match(c)
        if match is not None:
            keys_found.add(match.string("key"))

        match = in_pattern.match(c)
        if match is not None:
            function = match.expression("tuple")
            assert isinstance(function, FunctionCallExpr)
            # Only string literals of the tuple count as keys.
            keys_found |= {
                lit.value
                for lit in function.parameters
                if isinstance(lit, LiteralExpr) and isinstance(lit.value, str)
            }

    return keys_found
def build_match(
    col: str,
    ops: Sequence[str],
    param_type: Any,
    alias: Optional[str] = None,
    key: Optional[str] = None,
) -> Or[Expression]:
    """
    Builds a pattern for conditions on column ``col`` or, when ``key`` is
    given, on the subscriptable reference ``col[key]``.

    Two shapes are accepted: ``<op>(<column>, <literal>)`` for any ``op`` in
    ``ops``, and ``IN(<column>, array(...) | tuple(...))`` where every
    member is a literal. Literal values must satisfy
    ``AnyPattern(param_type)``. The column side is captured as ``column``
    and the other operand as ``rhs``.

    When ``alias`` is None any (or no) table name is accepted, otherwise
    the table name must equal ``alias``.
    """
    alias_match = AnyOptionalString() if alias is None else String(alias)

    pattern: Union[ColumnPattern, SubscriptableReferencePattern]
    if key is None:
        pattern = ColumnPattern(table_name=alias_match, column_name=String(col))
    else:
        pattern = SubscriptableReferencePattern(
            table_name=alias_match, column_name=String(col), key=String(key)
        )
    column_match = Param("column", pattern)

    scalar_comparison = FunctionCallPattern(
        Or([String(op) for op in ops]),
        (column_match, Param("rhs", LiteralPattern(AnyPattern(param_type)))),
    )

    # The IN condition has to be checked separately since each parameter
    # has to be checked individually.
    collection = FunctionCallPattern(
        Or([String("array"), String("tuple")]),
        all_parameters=LiteralPattern(AnyPattern(param_type)),
    )
    in_comparison = FunctionCallPattern(
        String(ConditionFunctions.IN),
        (column_match, Param("rhs", collection)),
    )

    return Or([scalar_comparison, in_comparison])
def __init__(self, columns: Set[str]):
    """
    Prepares the matchers for conditions on any of ``columns``.

    ``__condition_matcher`` accepts a column compared with a string literal
    (in either order) through any function of FUNCTION_TO_OPERATOR other
    than IN / NOT IN, plus ``has(<column>, <literal>)``.
    ``__in_condition_matcher`` accepts IN / NOT IN between a column and a
    ``tuple(...)`` made of literals. Captured parameters are ``operator``,
    ``col``, ``literal`` and ``tuple``.
    """
    self.columns = columns

    col = Param(
        "col", ColumnMatch(None, Or([String(name) for name in columns]))
    )
    literal = Param("literal", LiteralMatch(AnyMatch(str)))

    membership_functions = (ConditionFunctions.IN, ConditionFunctions.NOT_IN)
    operator = Param(
        "operator",
        Or(
            [
                String(op)
                for op in FUNCTION_TO_OPERATOR
                if op not in membership_functions
            ]
        ),
    )
    has_operator = Param("operator", String("has"))
    self.__condition_matcher = Or(
        [
            FunctionCallMatch(operator, (literal, col)),
            FunctionCallMatch(operator, (col, literal)),
            FunctionCallMatch(has_operator, (col, literal)),
        ]
    )

    in_operators = Param(
        "operator",
        Or((String(ConditionFunctions.IN), String(ConditionFunctions.NOT_IN))),
    )
    literals_tuple_match = Param(
        "tuple",
        FunctionCallMatch(String("tuple"), all_parameters=LiteralMatch()),
    )
    self.__in_condition_matcher = FunctionCallMatch(
        in_operators, (col, literals_tuple_match)
    )