示例#1
0
def test_accessors() -> None:
    """
    Checks that values captured by ``Param`` patterns nested inside a
    function call pattern can be read back from the match result, both
    as a whole expression and as a scalar.
    """
    first_argument_pattern = FunctionCall(
        String("f"), (Column(None, String("my_col")),)
    )
    # Captures the whole second argument as well as its function name.
    second_argument_pattern = Param(
        "second_function",
        FunctionCall(Param("second_function_name", Any(str)), None),
    )
    func = FunctionCall(
        String("f_name"), (first_argument_pattern, second_argument_pattern)
    )

    expression_under_test = FunctionCallExpr(
        "irrelevant",
        "f_name",
        (
            FunctionCallExpr(None, "f", (ColumnExpr(None, None, "my_col"),)),
            FunctionCallExpr(None, "second_name", tuple()),
        ),
    )
    result = func.match(expression_under_test)

    assert result is not None

    expected_second_function = FunctionCallExpr(None, "second_name", tuple())
    assert result.expression("second_function") == expected_second_function
    assert result.scalar("second_function_name") == "second_name"
示例#2
0
def _get_date_range(query: Query) -> Optional[int]:
    """
    Best-effort estimate of the number of days covered by the query.

    The first column found in a ``ConditionFunctions.GT`` or
    ``ConditionFunctions.GTE`` comparison against a datetime literal is
    used as the time column, and its range is resolved through
    ``get_time_range``. Returns None when the query has no condition, when
    no such comparison exists, or when either end of the range is unknown.
    """
    comparison_functions = Or(
        [String(ConditionFunctions.GT), String(ConditionFunctions.GTE)]
    )
    pattern = FunctionCall(
        comparison_functions,
        (Column(None, Param("col_name", Any(str))), Literal(Any(datetime))),
    )

    where_clause = query.get_condition_from_ast()
    if where_clause is None:
        return None

    for sub_expression in where_clause:
        found = pattern.match(sub_expression)
        if found is None:
            continue

        # Only the first matching column is considered.
        range_start, range_end = get_time_range(query, found.string("col_name"))
        if range_start is None or range_end is None:
            return None
        return (range_end - range_start).days

    return None
示例#3
0
def get_time_range_expressions(
    conditions: Sequence[Expression],
    timestamp_field: str,
    table_name: Optional[str] = None,
) -> Tuple[Optional[Tuple[datetime, FunctionCallExpr]], Optional[Tuple[
        datetime, FunctionCallExpr]], ]:
    """
    Finds the tightest time bounds expressed by the given conditions.

    Each condition is matched against ``timestamp_field >= <datetime>`` and
    ``timestamp_field < <datetime>`` (optionally restricted to
    ``table_name``). Among the matches, the ``>=`` condition with the highest
    datetime and the ``<`` condition with the lowest datetime are kept.

    :param conditions: The conditions to inspect; they are not traversed
        recursively.
    :param timestamp_field: Name of the column the bounds apply to.
    :param table_name: When set (and non-empty) only columns qualified with
        this table name are matched.
    :return: A ``(lower, upper)`` pair. Each element is either None, when no
        such bound was found, or a ``(datetime, condition)`` tuple holding
        the bound and the expression that defines it.
    """
    lower_bound_function = OPERATOR_TO_FUNCTION[">="]
    table_match = String(table_name) if table_name else None

    # The pattern does not depend on the condition being inspected, so it is
    # built once rather than on every loop iteration.
    bound_pattern = FunctionCall(
        Param(
            "operator",
            Or([
                String(lower_bound_function),
                String(OPERATOR_TO_FUNCTION["<"]),
            ]),
        ),
        (
            Column(table_match, String(timestamp_field)),
            Literal(Param("timestamp", Any(datetime))),
        ),
    )

    max_lower_bound: Optional[Tuple[datetime, FunctionCallExpr]] = None
    min_upper_bound: Optional[Tuple[datetime, FunctionCallExpr]] = None
    for c in conditions:
        match = bound_pattern.match(c)
        if match is None:
            continue

        timestamp = cast(datetime, match.scalar("timestamp"))
        # The pattern only matches function calls; this narrows the type.
        assert isinstance(c, FunctionCallExpr)
        if match.string("operator") == lower_bound_function:
            if max_lower_bound is None or timestamp > max_lower_bound[0]:
                max_lower_bound = (timestamp, c)
        elif min_upper_bound is None or timestamp < min_upper_bound[0]:
            min_upper_bound = (timestamp, c)

    return (max_lower_bound, min_upper_bound)
示例#4
0
def get_time_range_estimate(
    query: ProcessableQuery[Table],
) -> Tuple[Optional[datetime], Optional[datetime]]:
    """
    Best-effort estimate of the time range covered by the query.

    The first column found in a ``ConditionFunctions.GT`` or
    ``ConditionFunctions.GTE`` comparison against a datetime literal is
    used as the time column, and the result of ``get_time_range`` for that
    column is returned. ``(None, None)`` is returned when the query has no
    condition or no such comparison is found.
    """
    comparison_functions = Or(
        [String(ConditionFunctions.GT), String(ConditionFunctions.GTE)]
    )
    pattern = FunctionCall(
        comparison_functions,
        (Column(None, Param("col_name", Any(str))), Literal(Any(datetime))),
    )

    where_clause = query.get_condition()
    if where_clause is None:
        return None, None

    for sub_expression in where_clause:
        found = pattern.match(sub_expression)
        if found is not None:
            # Only the first matching column is considered.
            range_start, range_end = get_time_range(
                query, found.string("col_name")
            )
            return range_start, range_end

    return None, None
 def __init__(self, uuid_columns: Set[str]) -> None:
     """
     Stores the UUID column names and prepares the condition matchers.

     ``uuid_in_condition`` matches ``ConditionFunctions.IN`` /
     ``ConditionFunctions.NOT_IN`` calls whose first argument matches
     ``formatted_uuid_pattern()`` and whose second argument is a ``tuple``
     function call (captured as ``params``).

     ``uuid_condition`` matches every other function listed in
     ``FUNCTION_TO_OPERATOR`` with two arguments, each being either a
     literal (captured as ``literal_0`` / ``literal_1``) or an expression
     matching ``formatted_uuid_pattern`` with the ``_0`` / ``_1`` suffix.
     """
     self.__unique_uuid_columns = uuid_columns
     # Set before formatted_uuid_pattern() is called below.
     self.__uuid_column_match = Or([String(u_col) for u_col in uuid_columns])

     membership_functions = Or(
         (String(ConditionFunctions.IN), String(ConditionFunctions.NOT_IN))
     )
     in_condition_arguments = (
         self.formatted_uuid_pattern(),
         Param("params", FunctionCallMatch(String("tuple"), None)),
     )
     self.uuid_in_condition = FunctionCallMatch(
         membership_functions, in_condition_arguments
     )

     excluded_functions = (ConditionFunctions.IN, ConditionFunctions.NOT_IN)
     comparison_functions = Or(
         [
             String(op)
             for op in FUNCTION_TO_OPERATOR
             if op not in excluded_functions
         ]
     )
     first_operand = Or(
         (
             Param("literal_0", LiteralMatch(AnyOptionalString())),
             self.formatted_uuid_pattern("_0"),
         )
     )
     second_operand = Or(
         (
             Param("literal_1", LiteralMatch(AnyOptionalString())),
             self.formatted_uuid_pattern("_1"),
         )
     )
     self.uuid_condition = FunctionCallMatch(
         comparison_functions, (first_operand, second_operand)
     )

     self.formatted: Optional[str] = None
示例#6
0
    def replace_condition(expression: Expression) -> Expression:
        """
        Swaps the right hand side of a ``field <operator> <anything>``
        condition with ``new_operand``; any other expression is returned
        unchanged.
        """
        condition_pattern = FunctionCall(
            String(OPERATOR_TO_FUNCTION[operator]),
            (Param("column", Column(None, String(field))), AnyExpression()),
        )
        found = condition_pattern.match(expression)
        if found is None:
            return expression

        # Keep the matched column as is and only replace the second operand.
        return replace(
            expression, parameters=(found.expression("column"), new_operand)
        )
    def extractor(condition: Expression) -> Set[str]:
        """
        Returns the string literal compared for equality
        (``ConditionFunctions.EQ``) with ``key_pattern`` as a one-element
        set, or an empty set when the condition has a different shape.
        """
        equality_pattern = FunctionCall(
            String(ConditionFunctions.EQ),
            (key_pattern, Literal(Param("key", Any(str)))),
        )
        found = equality_pattern.match(condition)
        return set() if found is None else {found.string("key")}
示例#8
0
        def replace_exp(exp: Expression) -> Expression:
            """
            Replaces ``notEquals(type, 'transaction')`` with the literal
            ``1``; any other expression is returned unchanged.
            """
            matcher = FunctionCall(
                String("notEquals"),
                (Column(None, String("type")), Literal(String("transaction"))),
            )

            # A successful match is signalled by a non-None result. This
            # pattern captures no parameters, so the check must not depend on
            # the truthiness of the (empty) match result.
            if matcher.match(exp) is not None:
                return LiteralExpr(None, 1)

            return exp
示例#9
0
def extract_granularity_from_query(query: Query, column: str) -> Optional[int]:
    """
    Extracts the `granularity` from the `groupby` statement of the query.
    The matches are essentially the reverse of
    `TimeSeriesProcessor.__group_time_function`.

    Two shapes are recognized on `column`:
    - one of the `toStartOfHour`, `toStartOfMinute`, `toStartOfDay` or
      `toDate` functions, whose granularity is looked up in
      `GRANULARITY_MAPPING`;
    - `toDateTime(multiply(intDiv(toUInt32(column), N), N), <str>)`, whose
      granularity is the integer literal captured as `granularity`.

    Returns None when no group by expression has either shape.
    """
    groupby = query.get_groupby()

    column_match = ColumnMatch(None, String(column))

    time_functions = Or(
        [
            String("toStartOfHour"),
            String("toStartOfMinute"),
            String("toStartOfDay"),
            String("toDate"),
        ]
    )
    fn_match = FunctionCallMatch(
        Param("time_fn", time_functions),
        (column_match,),
        with_optionals=True,
    )

    # Built inside out: toDateTime(multiply(intDiv(toUInt32(col), N), N), str)
    column_as_uint = FunctionCallMatch(String("toUInt32"), (column_match,))
    bucket_index = FunctionCallMatch(
        String("intDiv"),
        (column_as_uint, LiteralMatch(Param("granularity", Any(int)))),
    )
    bucket_start = FunctionCallMatch(
        String("multiply"),
        (bucket_index, LiteralMatch(Param("granularity", Any(int)))),
    )
    expr_match = FunctionCallMatch(
        String("toDateTime"),
        (bucket_start, LiteralMatch(Any(str))),
    )

    for top_expr in groupby:
        for expr in top_expr:
            time_fn_result = fn_match.match(expr)
            if time_fn_result is not None:
                return GRANULARITY_MAPPING[time_fn_result.string("time_fn")]

            arithmetic_result = expr_match.match(expr)
            if arithmetic_result is not None:
                return arithmetic_result.integer("granularity")

    return None
示例#10
0
    def process_query(self, query: Query,
                      request_settings: RequestSettings) -> None:
        """
        Normalizes the values read from a fixed set of device contexts.

        Every ``arrayElement(contexts.value, indexOf(contexts.key, <key>))``
        expression, where ``<key>`` is one of ``device.simulator``,
        ``device.online`` or ``device.charging``, is wrapped into an ``if``
        call that yields ``'True'`` when the value is in ``('1', 'True')``
        (a ``ConditionFunctions.IN`` condition) and ``'False'`` otherwise.
        The alias of the original expression is moved onto the ``if`` call.

        :param query: The query whose expressions are transformed in place.
        :param request_settings: Not used by this processor.
        """
        matcher = FunctionCall(
            String("arrayElement"),
            (
                Column(
                    None,
                    String("contexts.value"),
                ),
                FunctionCall(
                    String("indexOf"),
                    (
                        Column(None, String("contexts.key")),
                        Literal(
                            Or([
                                String("device.simulator"),
                                String("device.online"),
                                String("device.charging"),
                            ]), ),
                    ),
                ),
            ),
        )

        def process_column(exp: Expression) -> Expression:
            # A successful match is signalled by a non-None result. This
            # pattern captures no parameters, so the check must not depend on
            # the truthiness of the (empty) match result.
            if matcher.match(exp) is None:
                return exp

            # The alias belongs to the outer expression after wrapping.
            inner = replace(exp, alias=None)
            return FunctionCallExpr(
                exp.alias,
                "if",
                (
                    binary_condition(
                        ConditionFunctions.IN,
                        inner,
                        literals_tuple(
                            None,
                            [
                                LiteralExpr(None, "1"),
                                LiteralExpr(None, "True")
                            ],
                        ),
                    ),
                    LiteralExpr(None, "True"),
                    LiteralExpr(None, "False"),
                ),
            )

        query.transform_expressions(process_column)
示例#11
0
 def __init__(
     self,
     column_name: str,
     key_names: Sequence[str],
     val_names: Sequence[str],
 ):
     """
     Initializes the parent class and prepares the pattern that matches
     ``arrayJoin(<col>)`` for any column listed in ``self.all_columns``;
     the matched column name is captured as ``col``.
     """
     super().__init__(column_name, key_names, val_names)

     any_known_column = Or([String(column) for column in self.all_columns])
     joined_column = Column(column_name=Param("col", any_known_column))
     self.__array_join_pattern = FunctionCall(
         String("arrayJoin"), (joined_column,)
     )
示例#12
0
    def get_project_ids_in_condition(
            condition: Expression) -> Optional[Set[int]]:
        """
        Extracts the project ids referenced by an expression.

        Returns None if no project id condition is found. Returns an empty
        set if conflicting project id conditions are found.

        Three shapes are recognized: an equality between the project column
        and an integer literal, an IN condition on the project column, and
        a boolean AND / OR whose operands are inspected recursively.
        """
        equality_match = FunctionCall(
            None,
            String(ConditionFunctions.EQ),
            (
                Column(column_name=String(project_column)),
                Literal(value=Param("project_id", Any(int))),
            ),
        ).match(condition)
        if equality_match is not None:
            return {equality_match.integer("project_id")}

        in_match = is_in_condition_pattern(
            Column(column_name=String(project_column))).match(condition)
        if in_match is not None:
            projects = in_match.expression("tuple")
            assert isinstance(projects, FunctionCallExpr)
            # Non-integer literals and non-literal parameters are ignored.
            return {
                lit.value
                for lit in projects.parameters
                if isinstance(lit, LiteralExpr) and isinstance(lit.value, int)
            }

        boolean_operator = Param(
            "operator",
            Or([String(BooleanFunctions.AND), String(BooleanFunctions.OR)]),
        )
        boolean_match = FunctionCall(
            None,
            boolean_operator,
            (Param("lhs", AnyExpression()), Param("rhs", AnyExpression())),
        ).match(condition)
        if boolean_match is None:
            return None

        lhs_projects = get_project_ids_in_condition(
            boolean_match.expression("lhs"))
        rhs_projects = get_project_ids_in_condition(
            boolean_match.expression("rhs"))

        # A side without project conditions does not constrain the result.
        if lhs_projects is None:
            return rhs_projects
        if rhs_projects is None:
            return lhs_projects

        if boolean_match.string("operator") == BooleanFunctions.AND:
            return lhs_projects & rhs_projects
        return lhs_projects | rhs_projects
示例#13
0
class ArrayHasOptimizer(QueryProcessor):
    """
    Rewrites ``equals(has(<array column>, '<string>'), 1)`` into the bare
    ``has(<array column>, '<string>')`` call for the configured columns.
    """

    def __init__(self, array_columns: Sequence[str]):
        """
        :param array_columns: Names of the columns the rewrite applies to.
        """
        any_array_column = Or([String(column) for column in array_columns])
        has_call = FunctionCall(
            String("has"),
            (Column(column_name=any_array_column), Literal(Any(str))),
        )
        # The inner has(...) call is captured so it can replace the equality.
        self.__array_has_pattern = FunctionCall(
            String("equals"),
            (Param("has", has_call), Literal(Integer(1))),
        )

    def process_query(self, query: Query, query_settings: QuerySettings) -> None:
        """
        Applies the rewrite to every expression of ``query``;
        ``query_settings`` is not used.
        """

        def replace_expression(expr: Expression) -> Expression:
            found = self.__array_has_pattern.match(expr)
            # Expressions that are not the targeted has condition are kept.
            return expr if found is None else found.expression("has")

        query.transform_expressions(replace_expression)
 def is_skippable_condition(conditions: Expression) -> bool:
     """
     A condition composed of a bunch of has(column, ...) conditions OR'ed together
     can be ignored when looking for filter keys because these are the conditions
     used for the bloom filter index on the array column.

     Returns True when, for at least one of ``column_names``, every first
     level OR operand of ``conditions`` is a ``has(<column>, '<string>')``
     call on that column.
     """
     # The operands do not depend on the column, so they are computed once
     # and materialized to allow iterating over them for every column.
     or_operands = tuple(get_first_level_or_conditions(conditions))

     for column_name in column_names:
         has_pattern = FunctionCall(
             String("has"),
             (Column(column_name=String(column_name)), Literal(Any(str))),
         )
         # A successful match is signalled by a non-None result; the pattern
         # captures no parameters so its truthiness is not relied upon.
         if all(has_pattern.match(c) is not None for c in or_operands):
             return True
     return False
def array_join_pattern(*column_names: str) -> FunctionCall:
    """
    Builds the pattern matching the arrayJoin of the given columns.

    A single column yields the plain ``arrayJoin(<column>)`` pattern; any
    other number of columns yields a ``tuple(...)`` call pattern holding one
    ``arrayJoin`` pattern per column, in the given order.
    """
    per_column_patterns = tuple(
        _array_join_pattern(column_name) for column_name in column_names
    )
    if len(per_column_patterns) == 1:
        return per_column_patterns[0]

    return FunctionCall(String("tuple"), per_column_patterns)
示例#16
0
def get_time_range(
        query: Query,
        timestamp_field: str) -> Tuple[Optional[datetime], Optional[datetime]]:
    """
    Finds the minimal time range for this query. Which means, it finds
    the >= timestamp condition with the highest datetime literal and
    the < timestamp condition with the smallest and returns the interval
    as a ``(lower, upper)`` tuple of datetimes. It only looks into first
    level AND conditions since, if the timestamp is nested in an OR we
    cannot say anything on how that compares to the other timestamp
    conditions.

    :param query: The query whose condition is inspected.
    :param timestamp_field: Name of the column the bounds apply to.
    :return: ``(lower, upper)``; an element is None when the corresponding
        bound is not found or when the query has no condition.
    """

    condition_clause = query.get_condition_from_ast()
    if not condition_clause:
        return (None, None)

    lower_bound_function = OPERATOR_TO_FUNCTION[">="]
    # The pattern does not depend on the condition being inspected, so it is
    # built once rather than on every loop iteration.
    bound_pattern = FunctionCall(
        None,
        Param(
            "operator",
            Or([
                String(lower_bound_function),
                String(OPERATOR_TO_FUNCTION["<"]),
            ]),
        ),
        (
            Column(None, None, String(timestamp_field)),
            Literal(None, Param("timestamp", Any(datetime))),
        ),
    )

    max_lower_bound: Optional[datetime] = None
    min_upper_bound: Optional[datetime] = None
    for c in get_first_level_and_conditions(condition_clause):
        match = bound_pattern.match(c)
        if match is None:
            continue

        timestamp = cast(datetime, match.scalar("timestamp"))
        if match.string("operator") == lower_bound_function:
            if max_lower_bound is None or timestamp > max_lower_bound:
                max_lower_bound = timestamp
        elif min_upper_bound is None or timestamp < min_upper_bound:
            min_upper_bound = timestamp

    return (max_lower_bound, min_upper_bound)
def _get_mapping_keys_in_condition(
    condition: Expression, column_name: str
) -> Optional[Set[str]]:
    """
    Finds the top level conditions that include filter based on the arrayJoin.
    This is meant to be used to find the keys the query is filtering the arrayJoin
    on.
    We can only apply the arrayFilter optimization to arrayJoin conditions
    that are not in OR with other columns. To simplify the problem, we only
    consider those conditions that are included in the first level of the query:
    [['tagskey' '=' 'a'],['col' '=' 'b'],['col2' '=' 'c']]  works
    [[['tagskey' '=' 'a'], ['col2' '=' 'b']], ['tagskey' '=' 'c']] does not

    If we encounter an OR condition we return None, which means we cannot
    safely apply the optimization. Empty set means we did not find any
    suitable arrayJoin for optimization in this condition but that does
    not disqualify the whole query in the way the OR condition does.

    :param condition: The condition to inspect.
    :param column_name: The column the arrayJoin is applied to.
    :return: The string keys found in equality and IN conditions on the
        arrayJoin, or None if a first level OR condition is present.
    """
    # Neither pattern depends on the condition being inspected, so both are
    # built once rather than on every loop iteration.
    equality_pattern = FunctionCall(
        None,
        String(ConditionFunctions.EQ),
        (array_join_pattern(column_name), Literal(None, Param("key", Any(str)))),
    )
    in_pattern = is_in_condition_pattern(array_join_pattern(column_name))

    keys_found: Set[str] = set()

    for c in get_first_level_and_conditions(condition):
        if is_binary_condition(c, BooleanFunctions.OR):
            return None

        match = equality_pattern.match(c)
        if match is not None:
            keys_found.add(match.string("key"))

        match = in_pattern.match(c)
        if match is not None:
            function = match.expression("tuple")
            assert isinstance(function, FunctionCallExpr)
            # Non-string literals and non-literal parameters are ignored.
            keys_found |= {
                lit.value
                for lit in function.parameters
                if isinstance(lit, LiteralExpr) and isinstance(lit.value, str)
            }

    return keys_found
示例#18
0
    def __init__(self, column_name: str, hash_map_name: str,
                 killswitch: str) -> None:
        """
        Stores the configuration and prepares the pattern for the
        conditions this processor handles: ``equals`` between either
        ``mapping_pattern`` or ``ifNull(mapping_pattern, '')`` and a string
        literal, which is captured as ``right_hand_side``.
        """
        self.__column_name = column_name
        self.__hash_map_name = hash_map_name
        self.__killswitch = killswitch

        if_null_mapping = FunctionCall(
            function_name=String("ifNull"),
            parameters=(mapping_pattern, Literal(String(""))),
        )
        left_hand_side = Or([mapping_pattern, if_null_mapping])
        right_hand_side = Param("right_hand_side", Literal(Any(str)))

        # TODO: Add the support for IN conditions.
        self.__optimizable_pattern = FunctionCall(
            function_name=String("equals"),
            parameters=(left_hand_side, right_hand_side),
        )
示例#19
0
 def __init__(self, array_columns: Sequence[str]):
     """
     Prepares the pattern matching ``equals(has(<column>, '<string>'), 1)``
     for any of ``array_columns``; the inner ``has`` call is captured as
     ``has``.
     """
     any_array_column = Or([String(column) for column in array_columns])
     has_call = FunctionCall(
         String("has"),
         (Column(column_name=any_array_column), Literal(Any(str))),
     )
     self.__array_has_pattern = FunctionCall(
         String("equals"),
         (Param("has", has_call), Literal(Integer(1))),
     )
    def extractor(condition: Expression) -> Set[Tuple[str, ...]]:
        """
        Returns the string literals of the ``tuple(...)`` call compared for
        equality (``ConditionFunctions.EQ``) with ``key_pattern``, as a set
        holding a single tuple. An empty set is returned when the condition
        has a different shape.
        """
        equality_pattern = FunctionCall(
            String(ConditionFunctions.EQ),
            (key_pattern, Param("tuple", FunctionCall(String("tuple"), None))),
        )
        found = equality_pattern.match(condition)
        if found is None:
            return set()

        function = found.expression("tuple")
        is_tuple_call = (
            isinstance(function, FunctionCallExpr)
            and function.function_name == "tuple"
        )
        if not is_tuple_call:
            return set()

        # Non-string literals and non-literal parameters are dropped.
        string_values = tuple(
            param.value
            for param in function.parameters
            if isinstance(param, LiteralExpr) and isinstance(param.value, str)
        )
        return {string_values}
示例#21
0
    def process_query(self, query: Query, request_settings: RequestSettings) -> None:
        """
        Normalizes ``toString(<column>)`` for the ``device_simulator``,
        ``device_online`` and ``device_charging`` columns.

        Each matching expression is wrapped into a ``multiIf`` call yielding
        ``''`` when the value equals ``''``, ``'True'`` when the value is in
        ``('1', 'True')`` and ``'False'`` otherwise. The alias of the
        original expression is moved onto the ``multiIf`` call.
        ``request_settings`` is not used.
        """
        # We care only of promoted contexts, so we do not need to match
        # the original nested expression.
        promoted_columns = Or(
            [
                String("device_simulator"),
                String("device_online"),
                String("device_charging"),
            ]
        )
        matcher = FunctionCall(
            String("toString"),
            (Column(None, promoted_columns),),
        )

        def replace_exp(exp: Expression) -> Expression:
            if matcher.match(exp) is None:
                return exp

            # The alias belongs to the outer expression after wrapping.
            inner = replace(exp, alias=None)
            is_empty = binary_condition(
                None, ConditionFunctions.EQ, inner, Literal(None, "")
            )
            is_true = binary_condition(
                None,
                ConditionFunctions.IN,
                inner,
                literals_tuple(None, [Literal(None, "1"), Literal(None, "True")]),
            )
            return FunctionCallExpr(
                exp.alias,
                "multiIf",
                (
                    is_empty,
                    Literal(None, ""),
                    is_true,
                    Literal(None, "True"),
                    Literal(None, "False"),
                ),
            )

        query.transform_expressions(replace_exp)
示例#22
0
    def __init__(self, columns: Set[str]):
        """
        Stores the column names and prepares two matchers:

        - the condition matcher, for a comparison between one of the columns
          and a string literal (in either order) using any function of
          ``FUNCTION_TO_OPERATOR`` other than IN / NOT IN, or a
          ``has(<column>, <literal>)`` call;
        - the IN condition matcher, for an IN / NOT IN call between one of
          the columns and a ``tuple`` call of literals.

        Both capture the function name as ``operator`` and the column as
        ``col``; the literal is captured as ``literal`` and the tuple call
        as ``tuple``.
        """
        self.columns = columns

        any_column_name = Or([String(col) for col in columns])
        column_operand = Param("col", ColumnMatch(None, any_column_name))
        literal_operand = Param("literal", LiteralMatch(AnyMatch(str)))

        membership_functions = (ConditionFunctions.IN, ConditionFunctions.NOT_IN)
        comparison_operator = Param(
            "operator",
            Or(
                [
                    String(op)
                    for op in FUNCTION_TO_OPERATOR
                    if op not in membership_functions
                ]
            ),
        )
        membership_operator = Param(
            "operator",
            Or((String(ConditionFunctions.IN), String(ConditionFunctions.NOT_IN))),
        )

        literal_first = FunctionCallMatch(
            comparison_operator, (literal_operand, column_operand)
        )
        column_first = FunctionCallMatch(
            comparison_operator, (column_operand, literal_operand)
        )
        has_call = FunctionCallMatch(
            Param("operator", String("has")), (column_operand, literal_operand)
        )
        self.__condition_matcher = Or([literal_first, column_first, has_call])

        literals_tuple_operand = Param(
            "tuple",
            FunctionCallMatch(String("tuple"), all_parameters=LiteralMatch()),
        )
        self.__in_condition_matcher = FunctionCallMatch(
            membership_operator,
            (column_operand, literals_tuple_operand),
        )
示例#23
0
class DefaultNoneFunctionMapper(FunctionCallMapper):
    """
    Maps the list of function names to NULL.
    """

    # Names of the functions whose calls are replaced.
    function_names: Set[str]

    def __post_init__(self) -> None:
        # Matches a call to any of the configured functions; only the
        # function name is constrained.
        self.function_match = FunctionCallMatch(
            Or([StringMatch(func) for func in self.function_names]))

    def attempt_map(
        self,
        expression: FunctionCall,
        children_translator: SnubaClickhouseStrictTranslator,
    ) -> Optional[FunctionCall]:
        """
        Translates ``expression`` when it calls one of ``function_names``.

        :param expression: The function call to translate.
        :param children_translator: Not used by this mapper.
        :return: ``identity(Literal(None, None), expression.alias)`` when
            the function name matches, None otherwise.
        """
        # A successful match is signalled by a non-None result. The pattern
        # captures no parameters, so the check must not depend on the
        # truthiness of the (empty) match result.
        if self.function_match.match(expression) is not None:
            return identity(Literal(None, None), expression.alias)

        return None
示例#24
0
    def __init__(self, column_name: str, hash_map_name: str, killswitch: str) -> None:
        """
        Stores the configuration and prepares the patterns used by this
        processor.

        The optimizable pattern is ``equals`` between either
        ``mapping_pattern`` or ``ifNull(mapping_pattern, '')`` and a string
        literal captured as ``right_hand_side``.

        The tag exists patterns are the same shape with ``notEquals``
        instead of ``equals``, and ``has(<column_name>.key, '<string>')``.
        """
        self.__column_name = column_name
        self.__hash_map_name = hash_map_name
        self.__killswitch = killswitch

        equals_left_hand_side = Or(
            [
                mapping_pattern,
                FunctionCall(
                    function_name=String("ifNull"),
                    parameters=(mapping_pattern, Literal(String(""))),
                ),
            ]
        )
        # TODO: Add the support for IN conditions.
        self.__optimizable_pattern = FunctionCall(
            function_name=String("equals"),
            parameters=(
                equals_left_hand_side,
                Param("right_hand_side", Literal(Any(str))),
            ),
        )

        not_equals_left_hand_side = Or(
            [
                mapping_pattern,
                FunctionCall(
                    function_name=String("ifNull"),
                    parameters=(mapping_pattern, Literal(String(""))),
                ),
            ]
        )
        not_equals_pattern = FunctionCall(
            function_name=String("notEquals"),
            parameters=(
                not_equals_left_hand_side,
                Param("right_hand_side", Literal(Any(str))),
            ),
        )

        key_column_matcher = ColumnMatcher(
            Param(TABLE_MAPPING_PARAM, AnyOptionalString()),
            Param(VALUE_COL_MAPPING_PARAM, String(f"{column_name}.key")),
        )
        has_key_pattern = FunctionCall(
            function_name=String("has"),
            parameters=(
                key_column_matcher,
                Literal(Param(KEY_MAPPING_PARAM, Any(str))),
            ),
        )

        self.__tag_exists_patterns = [not_equals_pattern, has_key_pattern]
def _array_join_pattern(column_name: str) -> FunctionCall:
    """
    Builds the pattern matching ``arrayJoin(<column_name>)``.
    """
    joined_column = Column(column_name=String(column_name))
    return FunctionCall(String("arrayJoin"), (joined_column,))
def find_pattern(query: Query, pattern: FunctionCall) -> bool:
    """
    Tells whether ``pattern`` matches any expression visited while
    iterating over the expressions of the selected columns of ``query``.
    A query without selected columns yields False.
    """
    for selected in query.get_selected_columns() or []:
        for sub_expression in selected.expression:
            if pattern.match(sub_expression) is not None:
                return True
    return False
示例#27
0
    def process_query(self, query: Query,
                      request_settings: RequestSettings) -> None:
        """
        Rewrites the arrayJoin expressions applied to the key and value
        columns of the configured mapping column.

        - When both the key and the value column are array joined in the
          query, each arrayJoin is replaced with an element of a key-value
          pair expression (index 1 for keys, 2 for values), filtered by the
          keys the query filters on when there are any.
        - Otherwise, when the query filters on keys, the arrayJoin is
          replaced with the filtered keys expression.
        - Otherwise expressions are left untouched.

        ``request_settings`` is not used.
        """
        keys_column = key_column(self.__column_name)
        values_column = val_column(self.__column_name)

        arrayjoin_pattern = FunctionCall(
            String("arrayJoin"),
            (Column(column_name=Param(
                "col",
                Or([String(keys_column), String(values_column)]),
            ), ), ),
        )

        # Names of the mapping columns that are array joined in the query.
        arrayjoins_in_query = set()
        for candidate in query.get_all_expressions():
            found = arrayjoin_pattern.match(candidate)
            if found is not None:
                arrayjoins_in_query.add(found.string("col"))

        filtered_keys = [
            LiteralExpr(None, key)
            for key in get_filtered_mapping_keys(query, self.__column_name)
        ]

        # Ensures the alias we apply to the arrayJoin is not already taken.
        used_aliases = {exp.alias for exp in query.get_all_expressions()}
        pair_alias_root = f"snuba_all_{self.__column_name}"
        pair_alias = pair_alias_root
        suffix = 0
        while pair_alias in used_aliases:
            suffix += 1
            pair_alias = f"{pair_alias_root}_{suffix}"

        # Both arrayJoin(col.key) and arrayJoin(col.value) expressions are
        # present in the query.
        keys_and_values_joined = arrayjoins_in_query == {
            keys_column,
            values_column,
        }

        def replace_expression(expr: Expression) -> Expression:
            """
            Applies the appropriate optimization on a single arrayJoin expression.
            """
            match = arrayjoin_pattern.match(expr)
            if match is None:
                return expr

            if keys_and_values_joined:
                # Do the arrayJoin on key-value pairs instead of independent
                # arrayJoin for keys and values.
                if match.string("col") == keys_column:
                    array_index = LiteralExpr(None, 1)
                else:
                    array_index = LiteralExpr(None, 2)

                if filtered_keys:
                    return _filtered_mapping_pairs(
                        expr.alias,
                        self.__column_name,
                        pair_alias,
                        filtered_keys,
                        array_index,
                    )
                return _unfiltered_mapping_pairs(
                    expr.alias, self.__column_name, pair_alias, array_index
                )

            if filtered_keys:
                # Only one between arrayJoin(col.key) and arrayJoin(col.value)
                # is present, and it is arrayJoin(col.key) since we found
                # filtered keys.
                return _filtered_mapping_keys(
                    expr.alias, self.__column_name, filtered_keys
                )

            # No viable optimization
            return expr

        query.transform_expressions(replace_expression)
示例#28
0
 ),
 (
     "Do not Match a None string through Any",
     Column(Param("p_table_name", Any(type(None))), None),
     ColumnExpr("irrelevant", "not_none", "irrelevant"),
     None,
 ),
 (
     "Match any expression of Column type",
     Any(ColumnExpr),
     ColumnExpr("irrelevant", "irrelevant", "irrelevant"),
     MatchResult(),
 ),
 (
     "Match any expression of Column type within function",
     FunctionCall(None, (Param("p1", Any(ColumnExpr)), )),
     FunctionCallExpr(
         "irrelevant",
         "irrelevant",
         (ColumnExpr("relevant", "relevant", "relevant"), ),
     ),
     MatchResult({"p1": ColumnExpr("relevant", "relevant", "relevant")}),
 ),
 (
     "Wrong number of parameters, does not match",
     FunctionCall(None, (Param("p1", Any(ColumnExpr)), )),
     FunctionCallExpr(
         "irrelevant",
         "irrelevant",
         (
             ColumnExpr("relevant", "relevant", "relevant"),
示例#29
0
class MappingOptimizer(QueryProcessor):
    """
    Rewrites equality conditions on a mapping (tags-like) column so that
    they are resolved through the companion hash map column.
    The hash map column is an array holding the hash of every
    `key=value` string.

    A condition shaped like
    `tags.value[indexOf(tags.key, 'my_tag')] = 'my_val'`
    becomes
    `has(_tags_hash_map, cityHash64('my_tag=my_val'))`

    What is handled:
    - plain equality, as in the example above
    - the same mapping expression wrapped in ifNull with an empty
      string default: `ifNull(tags.value[indexOf(tags.key, 'my_tag')], '') = ...`
    - conditions found in either the where or the having clause

    What is not handled:
    - anything the hash map cannot answer, like !=, LIKE, IS NULL
    - queries where some other condition still needs the mapping column
      to be unpacked, for instance `tags[a] = b AND tags[b] LIKE 'asd'`.
      Rewriting only part of them would load one more column for almost
      no gain, which degrades performance, so the whole query is left
      untouched.
    - `ifNull(tags.value[indexOf(tags.key, 'my_tag')], '') = ''`
      which amounts to asking whether the tag is missing; the hash map
      cannot express that.
    - IN conditions. TODO
    """

    def __init__(self, column_name: str, hash_map_name: str, killswitch: str) -> None:
        self.__column_name = column_name
        self.__hash_map_name = hash_map_name
        self.__killswitch = killswitch

        # Left hand side: the mapping access itself, or the same access
        # wrapped into ifNull(..., '').
        if_null_wrapped = FunctionCall(
            function_name=String("ifNull"),
            parameters=(mapping_pattern, Literal(String(""))),
        )
        left_hand_side = Or([mapping_pattern, if_null_wrapped])

        # TODO: Add the support for IN conditions.
        self.__optimizable_pattern = FunctionCall(
            function_name=String("equals"),
            parameters=(
                left_hand_side,
                Param("right_hand_side", Literal(Any(str))),
            ),
        )

    def __classify_combined_conditions(self, condition: Expression) -> ConditionClass:
        """
        Classifies a condition, descending into first level AND / OR
        operands. NOT_OPTIMIZABLE wins over OPTIMIZABLE, which in turn
        wins over IRRELEVANT.
        """
        if not isinstance(condition, FunctionExpr):
            return ConditionClass.IRRELEVANT

        if condition.function_name not in (BooleanFunctions.AND, BooleanFunctions.OR):
            # Not a boolean combination: classify it as a single condition.
            return self.__classify_condition(condition)

        if condition.function_name == BooleanFunctions.AND:
            operands = get_first_level_and_conditions(condition)
        else:
            operands = get_first_level_or_conditions(condition)

        verdicts = {self.__classify_combined_conditions(operand) for operand in operands}
        # Checked by decreasing precedence.
        for outcome in (ConditionClass.NOT_OPTIMIZABLE, ConditionClass.OPTIMIZABLE):
            if outcome in verdicts:
                return outcome
        return ConditionClass.IRRELEVANT

    def __classify_condition(self, condition: Expression) -> ConditionClass:
        """
        Classifies one individual (non combined) condition.
        """
        match = self.__optimizable_pattern.match(condition)

        if match is None:
            # The condition is not in the optimizable form. If it still
            # references the key or value column of the mapping, the
            # query must not be optimized at all.
            mapping_columns = (
                f"{self.__column_name}.key",
                f"{self.__column_name}.value",
            )
            touches_mapping = any(
                isinstance(exp, Column) and exp.column_name in mapping_columns
                for exp in condition
            )
            if touches_mapping:
                return ConditionClass.NOT_OPTIMIZABLE
            return ConditionClass.IRRELEVANT

        if match.string(KEY_COL_MAPPING_PARAM) != f"{self.__column_name}.key":
            # Right shape, but the key column is not the one of this mapping.
            return ConditionClass.IRRELEVANT

        rhs = match.expression("right_hand_side")
        assert isinstance(rhs, LiteralExpr)
        if rhs.value == "":
            # ifNull(tags[asd], '') = '' is not optimizable.
            return ConditionClass.NOT_OPTIMIZABLE
        return ConditionClass.OPTIMIZABLE

    def __replace_with_hash(self, condition: Expression) -> Expression:
        """
        Returns the hash map lookup that replaces an optimizable
        condition. Any other expression is returned unchanged.
        """
        match = self.__optimizable_pattern.match(condition)
        if match is None:
            return condition
        if match.string(KEY_COL_MAPPING_PARAM) != f"{self.__column_name}.key":
            return condition

        rhs = match.expression("right_hand_side")
        assert isinstance(rhs, LiteralExpr)
        escaped_key = match.string(KEY_MAPPING_PARAM).translate(ESCAPE_TRANSLATION)

        hash_map_column = Column(
            alias=None,
            table_name=match.optional_string(TABLE_MAPPING_PARAM),
            column_name=self.__hash_map_name,
        )
        hashed_pair = FunctionExpr(
            alias=None,
            function_name="cityHash64",
            parameters=(LiteralExpr(None, f"{escaped_key}={rhs.value}"),),
        )
        return FunctionExpr(
            alias=condition.alias,
            function_name="has",
            parameters=(hash_map_column, hashed_pair),
        )

    def process_query(self, query: Query, request_settings: RequestSettings) -> None:
        """
        Replaces the optimizable conditions of the query in place.
        Nothing is changed when the killswitch config is falsy, when the
        where or having clause holds a NOT_OPTIMIZABLE condition, or when
        neither clause holds an OPTIMIZABLE one.
        """
        if not get_config(self.__killswitch, 1):
            return

        condition = query.get_condition()
        if condition is None:
            cond_class = ConditionClass.IRRELEVANT
        else:
            cond_class = self.__classify_combined_conditions(condition)
            if cond_class == ConditionClass.NOT_OPTIMIZABLE:
                return

        having_cond = query.get_having()
        if having_cond is None:
            having_cond_class = ConditionClass.IRRELEVANT
        else:
            having_cond_class = self.__classify_combined_conditions(having_cond)
            if having_cond_class == ConditionClass.NOT_OPTIMIZABLE:
                return

        if (
            cond_class != ConditionClass.OPTIMIZABLE
            and having_cond_class != ConditionClass.OPTIMIZABLE
        ):
            # Nothing to rewrite in either clause.
            return

        metrics.increment("optimizable_query")

        if condition is not None:
            query.set_ast_condition(condition.transform(self.__replace_with_hash))
        if having_cond is not None:
            query.set_ast_having(having_cond.transform(self.__replace_with_hash))
示例#30
0
            LiteralExpr(None, None),
        ),
    )


# Names under which `mapping_pattern` captures the parts of a matched
# expression (each one is wrapped in a `Param` below).
TABLE_MAPPING_PARAM = "table_name"
VALUE_COL_MAPPING_PARAM = "value_column"
KEY_COL_MAPPING_PARAM = "key_column"
KEY_MAPPING_PARAM = "key"
# Pattern for expressions shaped like
# `arrayElement(<value_column>, indexOf(<key_column>, <key>))`.
# NOTE(review): the leading `None` given to each matcher is presumably the
# alias slot left unconstrained -- confirm against the matcher signatures.
mapping_pattern = FunctionCall(
    None,
    String("arrayElement"),
    (
        # First argument of arrayElement: a column whose table name and
        # column name are both captured.
        Column(
            None,
            Param(TABLE_MAPPING_PARAM, AnyOptionalString()),
            Param(VALUE_COL_MAPPING_PARAM, Any(str)),
        ),
        # Second argument of arrayElement: indexOf(<key_column>, <key>).
        FunctionCall(
            None,
            String("indexOf"),
            (
                # Only the column name is captured here; the table name is
                # given as None (presumably "match anything" -- confirm).
                Column(None, None, Param(KEY_COL_MAPPING_PARAM, Any(str))),
                # The looked-up key must be a str literal; its value is captured.
                Literal(None, Param(KEY_MAPPING_PARAM, Any(str))),
            ),
        ),
    ),
)

# TODO: build more of these mappers.