Example #1
    def do_query(
        query: ClickhouseQuery,
        request_settings: RequestSettings,
    ) -> QueryResult:
        from_date_ast, to_date_ast = get_time_range(query, "timestamp")
        assert from_date_ast is not None and isinstance(
            from_date_ast, datetime)
        assert to_date_ast is not None and isinstance(to_date_ast, datetime)

        conditions = query.get_conditions() or []
        from_date_str = next(
            (condition[2] for condition in conditions
             if condition[0] == "timestamp" and condition[1] == ">="),
            None,
        )
        to_date_str = next(
            (condition[2] for condition in conditions
             if condition[0] == "timestamp" and condition[1] == "<"),
            None,
        )
        assert from_date_str == from_date_ast.isoformat()
        assert to_date_str == to_date_ast.isoformat()

        found_timestamps.append(
            (from_date_ast.isoformat(), to_date_ast.isoformat()))

        return QueryResult({"data": []}, {})
Example #2
def _replace_condition(query: Query, field: str, operator: str,
                       new_literal: Union[str, List[AnyType]]) -> None:
    query.set_conditions([
        cond if not _identify_condition(cond, field, operator) else
        [field, operator, new_literal]
        for cond in query.get_conditions() or []
    ])
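To illustrate what Example #2 does, here is a minimal, self-contained sketch that applies the same list-comprehension replacement to a plain legacy conditions list; the _identify_condition stand-in below is a hypothetical simplification that only matches flat [field, operator, literal] triples.

from typing import Any, List

# Hypothetical stand-in for _identify_condition: only matches flat triples.
def _identify_condition(cond: Any, field: str, operator: str) -> bool:
    return (isinstance(cond, (list, tuple)) and len(cond) == 3
            and cond[0] == field and cond[1] == operator)

conditions: List[Any] = [
    ["timestamp", ">=", "2020-01-01T00:00:00"],
    ["timestamp", "<", "2020-01-02T00:00:00"],
    ["project_id", "IN", [2]],
]

# Same replacement shape as _replace_condition, applied to a bare list.
replaced = [
    cond if not _identify_condition(cond, "timestamp", "<")
    else ["timestamp", "<", "2020-01-01T12:00:00"]
    for cond in conditions
]
assert replaced[1] == ["timestamp", "<", "2020-01-01T12:00:00"]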
Example #3
def test_without_turbo_without_projects_needing_final(
        query: ClickhouseQuery) -> None:
    PostReplacementConsistencyEnforcer("project_id", None).process_query(
        query, HTTPRequestSettings())

    assert query.get_conditions() == [("project_id", "IN", [2])]
    assert query.get_condition_from_ast() == build_in("project_id", [2])
    assert not query.get_final()
Example #4
    def _update_conditions(self, query: Query,
                           prewhere_conditions: Sequence[Condition]) -> None:
        conditions = query.get_conditions()
        # This should never be None at this point, but as far as mypy knows it can be None.
        assert conditions is not None

        query.set_conditions(
            [cond for cond in conditions if cond not in prewhere_conditions])
        query.set_prewhere(prewhere_conditions)
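As a tiny illustration of Example #4's split, assuming conditions are plain [column, operator, literal] lists compared by value:

conditions = [
    ["project_id", "=", 2],
    ["timestamp", ">=", "2020-01-01T00:00:00"],
    ["message", "LIKE", "%timeout%"],
]
prewhere_conditions = [["project_id", "=", 2]]

# Conditions promoted to PREWHERE are dropped from the main WHERE clause.
where = [cond for cond in conditions if cond not in prewhere_conditions]
assert where == conditions[1:]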
Example #5
def test_too_many_groups_to_exclude(query: ClickhouseQuery) -> None:
    state.set_config("max_group_ids_exclude", 2)
    set_project_exclude_groups(2, [100, 101, 102], ReplacerState.EVENTS)

    PostReplacementConsistencyEnforcer("project_id",
                                       ReplacerState.EVENTS).process_query(
                                           query, HTTPRequestSettings())

    assert query.get_conditions() == [("project_id", "IN", [2])]
    assert query.get_condition_from_ast() == build_in("project_id", [2])
    assert query.get_final()
Example #6
    def _get_prewhere_candidates(
        self, query: Query, prewhere_keys: Sequence[str]
    ) -> Sequence[Tuple[Iterable[str], Condition]]:
        # Add any condition to PREWHERE if:
        # - It is a single top-level condition (not OR-nested), and
        # - Any of its referenced columns are in prewhere_keys
        conditions = query.get_conditions()
        if not conditions:
            return []
        return [(util.columns_in_expr(cond[0]), cond) for cond in conditions
                if util.is_condition(cond) and cond[1] in ALLOWED_OPERATORS
                and any(col in prewhere_keys
                        for col in util.columns_in_expr(cond[0]))]
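A rough sketch of the candidate filter in Example #6, using hypothetical stand-ins for util.is_condition, util.columns_in_expr, and ALLOWED_OPERATORS that only handle flat [column, operator, literal] triples (the real helpers also walk function expressions):

from typing import Any, List

ALLOWED_OPERATORS = ["=", "!=", ">", ">=", "<", "<=", "IN", "LIKE"]  # assumed set

def is_condition(cond: Any) -> bool:
    return isinstance(cond, (list, tuple)) and len(cond) == 3 and isinstance(cond[1], str)

def columns_in_expr(expr: Any) -> List[str]:
    # Simplification: a bare string is a column reference, anything else has none.
    return [expr] if isinstance(expr, str) else []

conditions = [
    ["project_id", "IN", [2]],
    ["message", "LIKE", "%error%"],
    ["timestamp", ">=", "2020-01-01T00:00:00"],
]
prewhere_keys = ["project_id", "message"]

candidates = [(columns_in_expr(cond[0]), cond) for cond in conditions
              if is_condition(cond) and cond[1] in ALLOWED_OPERATORS
              and any(col in prewhere_keys for col in columns_in_expr(cond[0]))]

# Only the project_id and message conditions reference prewhere_keys.
assert [cond for _, cond in candidates] == conditions[:2]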
Example #7
def test_not_many_groups_to_exclude(query: ClickhouseQuery) -> None:
    state.set_config("max_group_ids_exclude", 5)
    set_project_exclude_groups(2, [100, 101, 102], ReplacerState.EVENTS)

    PostReplacementConsistencyEnforcer("project_id",
                                       ReplacerState.EVENTS).process_query(
                                           query, HTTPRequestSettings())

    expected = [
        ("project_id", "IN", [2]),
        (["assumeNotNull", ["group_id"]], "NOT IN", [100, 101, 102]),
    ]
    assert query.get_conditions() == expected
    assert query.get_condition_from_ast() == FunctionCall(
        None,
        BooleanFunctions.AND,
        (
            FunctionCall(
                None,
                "notIn",
                (
                    FunctionCall(None, "assumeNotNull",
                                 (Column(None, None, "group_id"), )),
                    FunctionCall(
                        None,
                        "tuple",
                        (
                            Literal(None, 100),
                            Literal(None, 101),
                            Literal(None, 102),
                        ),
                    ),
                ),
            ),
            build_in("project_id", [2]),
        ),
    )
    assert not query.get_final()
Example #8
    def process_query(self, query: Query,
                      request_settings: RequestSettings) -> None:
        conditions = query.get_conditions()
        if not conditions:
            return

        # Enable the processor only if we have enough data in the flattened
        # columns, which have been deployed since BEGINNING_OF_TIME. If the
        # query starts earlier than that, we do not apply the optimization.
        if self.__beginning_of_time:
            apply_optimization = False
            for condition in conditions:
                if (is_condition(condition) and isinstance(condition[0], str)
                        and condition[0] in self.__timestamp_cols
                        and condition[1] in (">=", ">")
                        and isinstance(condition[2], str)):
                    try:
                        start_ts = parse_datetime(condition[2])
                        if (start_ts -
                                self.__beginning_of_time).total_seconds() > 0:
                            apply_optimization = True
                    except Exception:
                        # We should not get here: it means the from timestamp
                        # is malformed. Returning here is just for safety.
                        logger.error(
                            "Cannot parse start date for NestedFieldOptimizer: %r",
                            condition,
                        )
                        return
            if not apply_optimization:
                return

        # Do not use flattened tags if tags are being unpacked anyway. In that
        # case using flattened tags only means loading an additional column,
        # making the query heavier and slower.
        if self.__has_tags(query.get_arrayjoin_from_ast()):
            return
        if query.get_groupby_from_ast():
            for expression in query.get_groupby_from_ast():
                if self.__has_tags(expression):
                    return
        if self.__has_tags(query.get_having_from_ast()):
            return

        if query.get_orderby_from_ast():
            for orderby in query.get_orderby_from_ast():
                if self.__has_tags(orderby.expression):
                    return

        new_conditions = []
        positive_like_expression: List[str] = []
        negative_like_expression: List[str] = []

        for c in conditions:
            keyvalue = self.__is_optimizable(c, self.__nested_col)
            if not keyvalue:
                new_conditions.append(c)
            else:
                expression = f"{escape_field(keyvalue.nested_col_key)}={escape_field(keyvalue.value)}"
                if keyvalue.operand == Operand.EQ:
                    positive_like_expression.append(expression)
                else:
                    negative_like_expression.append(expression)

        if positive_like_expression:
            # Positive conditions "=" are all merged together in one LIKE expression
            positive_like_expression = sorted(positive_like_expression)
            like_formatted = f"%|{'|%|'.join(positive_like_expression)}|%"
            new_conditions.append(
                [self.__flattened_col, "LIKE", like_formatted])

        for expression in negative_like_expression:
            # Negative conditions "!=" cannot be merged together. We can still transform
            # them into NOT LIKE statements, but each condition has to be one
            # statement.
            not_like_formatted = f"%|{expression}|%"
            new_conditions.append(
                [self.__flattened_col, "NOT LIKE", not_like_formatted])

        query.set_conditions(new_conditions)
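To make the rewriting in Example #8 concrete, here is a small sketch of just the pattern-building step, assuming the flattened column stores tags as |key=value| pairs; the column name tags_flattened and the identity escape_field below are illustrative stand-ins (the real escape_field escapes special characters):

def escape_field(value: str) -> str:
    # Hypothetical pass-through; the real helper escapes special characters.
    return value

positive_like_expression = sorted([
    f"{escape_field('environment')}={escape_field('prod')}",
    f"{escape_field('release')}={escape_field('1.0')}",
])
negative_like_expression = [f"{escape_field('handled')}={escape_field('yes')}"]

new_conditions = []
if positive_like_expression:
    # "=" conditions are merged into a single LIKE pattern.
    like_formatted = f"%|{'|%|'.join(positive_like_expression)}|%"
    new_conditions.append(["tags_flattened", "LIKE", like_formatted])
for expression in negative_like_expression:
    # "!=" conditions each become their own NOT LIKE pattern.
    new_conditions.append(["tags_flattened", "NOT LIKE", f"%|{expression}|%"])

assert new_conditions[0] == ["tags_flattened", "LIKE", "%|environment=prod|%|release=1.0|%"]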
Example #9
def get_project_ids_in_query(query: Query,
                             project_column: str) -> Optional[Set[int]]:
    """
    Finds the project ids this query is filtering according to the legacy query
    representation.

    It returns the set of referenced project_ids if relevant conditions are found,
    or None if no project_id condition is found in the query. An empty set means
    that multiple conflicting project_id conditions were found.

    This function looks into first-level AND conditions and second-level OR
    conditions, but it cannot support project ids used as function parameters.
    Specific limitations:
    - If a project_id is a parameter of a function that returns the project_id
      itself, it is not supported. It would be very hard to support every function
      without a whitelist/blacklist of allowed functions in Snuba queries.
    - Boolean functions are not supported, so we do not unpack and/or/not
      conditions expressed as functions. We will be able to do that with the AST.
    - It does not exclude projects referenced in NOT conditions.

    We are going to try to lift as many of these limitations as possible. So, please,
    do not rely on them for the correctness of your code.
    """
    def find_project_id_sets(
        conditions: Sequence[Condition],
    ) -> Sequence[Set[int]]:
        """
        Scans a potentially nested sequence of conditions.
        For each simple condition adds to the output the set of project ids referenced
        by the condition.
        For each nested condition, it assumes it is a union of simple conditions
        (the only valid case supported by the Query object) and adds the union
        of the referenced project ids to the output.
        """
        project_id_sets: List[Set[int]] = list()
        for c in conditions:
            if is_condition(c):
                # This is a simple condition. Can extract the project ids directly.
                # Supports these kinds of conditions
                # ["col", "=", 1]
                # ["col", "IN", [1,2,3]]
                # ["col", "IN", (1,2,3)]
                if c[0] == project_column:
                    if c[1] == "=" and isinstance(c[2], int):
                        project_id_sets.append({c[2]})
                    elif c[1] == "IN" and all(
                            isinstance(project, int) for project in c[2]):
                        project_id_sets.append(set(c[2]))

            elif all(is_condition(second_level) for second_level in c):
                # This is supposed to be a union of simple conditions. Need to union
                # the sets of project ids.
                sets_to_unite = find_project_id_sets(c)
                if sets_to_unite:
                    project_id_sets.append(
                        reduce(lambda x, y: x | y, sets_to_unite))
            else:
                raise ValueError(f"Invalid condition {conditions}")

        return project_id_sets

    all_project_id_sets = find_project_id_sets(query.get_conditions() or [])

    if not all_project_id_sets:
        return None
    return reduce(lambda x, y: x & y, all_project_id_sets)
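A worked illustration of the union/intersection semantics described in Example #9's docstring, on plain nested lists with a simplified is_condition stand-in rather than a full Query object:

from functools import reduce
from typing import Any

def is_condition(cond: Any) -> bool:
    # Simplified stand-in: a flat [column, operator, literal] triple.
    return (isinstance(cond, (list, tuple)) and len(cond) == 3
            and isinstance(cond[0], str) and isinstance(cond[1], str))

def referenced_ids(cond: Any) -> set:
    return set(cond[2]) if cond[1] == "IN" else {cond[2]}

# Top level is ANDed; the nested list is treated as an OR of simple conditions.
conditions = [
    ["project_id", "IN", [1, 2, 3]],
    [["project_id", "=", 1], ["project_id", "=", 2]],
]

project_id_sets = []
for c in conditions:
    if is_condition(c):
        project_id_sets.append(referenced_ids(c))
    else:
        # OR branch: union of the project ids referenced by each leg.
        project_id_sets.append(set().union(*(referenced_ids(leg) for leg in c)))

# ANDed conditions intersect: {1, 2, 3} & {1, 2} == {1, 2}
assert reduce(lambda x, y: x & y, project_id_sets) == {1, 2}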
Example #10
    def execute(
        self,
        query: Query,
        request_settings: RequestSettings,
        runner: SplitQueryRunner,
    ) -> Optional[QueryResult]:
        """
        If a query is:
            - ORDER BY timestamp DESC
            - has no grouping
            - has an offset/limit
            - has a large time range
        We know we have to reverse-sort the entire set of rows to return the small
        chunk at the end of the time range, so optimistically split the time range
        into smaller increments, and start with the last one, so that we can potentially
        avoid querying the entire range.
        """
        limit = query.get_limit()
        if limit is None or query.get_groupby():
            return None

        if query.get_offset() >= 1000:
            return None

        orderby = query.get_orderby()
        if not orderby or orderby[0] != f"-{self.__timestamp_col}":
            return None

        conditions = query.get_conditions() or []
        from_date_str = next(
            (condition[2] for condition in conditions
             if _identify_condition(condition, self.__timestamp_col, ">=")),
            None,
        )

        to_date_str = next(
            (condition[2] for condition in conditions
             if _identify_condition(condition, self.__timestamp_col, "<")),
            None,
        )
        from_date_ast, to_date_ast = get_time_range(query,
                                                    self.__timestamp_col)

        if not from_date_str or not to_date_str:
            return None

        date_align, split_step = state.get_configs(
            [("date_align_seconds", 1), ("split_step", 3600)]  # default 1 hour
        )
        to_date = util.parse_datetime(to_date_str, date_align)
        from_date = util.parse_datetime(from_date_str, date_align)

        if from_date != from_date_ast:
            logger.warning(
                "Mismatch in start date on time splitter.",
                extra={
                    "ast": str(from_date_ast),
                    "legacy": str(from_date)
                },
                exc_info=True,
            )
            metrics.increment("mismatch.ast_from_date")

        remaining_offset = query.get_offset()

        overall_result = None
        split_end = to_date
        split_start = max(split_end - timedelta(seconds=split_step), from_date)
        total_results = 0
        while split_start < split_end and total_results < limit:
            # We need to make a copy to use during the query execution because we replace
            # the start-end conditions on the query at each iteration of this loop.
            split_query = copy.deepcopy(query)

            _replace_condition(split_query, self.__timestamp_col, ">=",
                               split_start.isoformat())
            _replace_ast_condition(split_query, self.__timestamp_col, ">=",
                                   LiteralExpr(None, split_start))
            _replace_condition(split_query, self.__timestamp_col, "<",
                               split_end.isoformat())
            _replace_ast_condition(split_query, self.__timestamp_col, "<",
                                   LiteralExpr(None, split_end))

            # Because it's paged, we have to ask for (limit+offset) results
            # and set offset=0 so we can then trim them ourselves.
            split_query.set_offset(0)
            split_query.set_limit(limit - total_results + remaining_offset)

            # At every iteration we only append the "data" key from the results returned by
            # the runner. The "extra" key is only populated at the first iteration of the
            # loop and never changed.
            result = runner(split_query, request_settings)

            if overall_result is None:
                overall_result = result
            else:
                overall_result.result["data"].extend(result.result["data"])

            if remaining_offset > 0 and len(overall_result.result["data"]) > 0:
                to_trim = min(remaining_offset,
                              len(overall_result.result["data"]))
                overall_result.result["data"] = overall_result.result["data"][
                    to_trim:]
                remaining_offset -= to_trim

            total_results = len(overall_result.result["data"])

            if total_results < limit:
                if len(result.result["data"]) == 0:
                    # If we got nothing from the last query, expand the range by a static factor
                    split_step = split_step * STEP_GROWTH
                else:
                    # If we got some results but not all of them, estimate how big the time
                    # range should be for the next query based on how many results we got for
                    # our last query and its time range, and how many we have left to fetch.
                    remaining = limit - total_results
                    split_step = split_step * math.ceil(
                        remaining / float(len(result.result["data"])))

                # Set the start and end of the next query based on the new range.
                split_end = split_start
                try:
                    split_start = max(
                        split_end - timedelta(seconds=split_step), from_date)
                except OverflowError:
                    split_start = from_date

        return overall_result
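Finally, a standalone sketch of just the window-sizing arithmetic from Example #10, with hard-coded result counts in place of the real runner; STEP_GROWTH is assumed here since the snippet references it without defining it:

import math
from datetime import datetime, timedelta

STEP_GROWTH = 10  # assumed growth factor; the real constant lives in the module
limit = 100
split_step = 3600  # seconds, matching the default above

from_date = datetime(2020, 1, 1)
to_date = datetime(2020, 1, 3)

split_end = to_date
split_start = max(split_end - timedelta(seconds=split_step), from_date)
total_results = 0

# Pretend row counts returned by successive windows, purely illustrative.
fake_window_sizes = iter([0, 20, 80])

while split_start < split_end and total_results < limit:
    rows = next(fake_window_sizes, 0)
    total_results += rows
    if total_results < limit:
        if rows == 0:
            # Empty window: grow the next window by a static factor.
            split_step = split_step * STEP_GROWTH
        else:
            # Partial window: scale the next window by how many rows are still needed.
            remaining = limit - total_results
            split_step = split_step * math.ceil(remaining / float(rows))
        # Walk the window backwards towards from_date.
        split_end = split_start
        try:
            split_start = max(split_end - timedelta(seconds=split_step), from_date)
        except OverflowError:
            split_start = from_date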