Example no. 1
def execute_query_with_caching(
    clickhouse_query: Query,
    request_settings: RequestSettings,
    formatted_query: SqlQuery,
    reader: Reader[SqlQuery],
    timer: Timer,
    stats: MutableMapping[str, Any],
    query_settings: MutableMapping[str, Any],
) -> Result:
    # XXX: ``uncompressed_cache_max_cols`` is used to control both the result
    # cache and the uncompressed cache. These should be independent settings.
    use_cache, uc_max = state.get_configs([
        ("use_cache", settings.USE_RESULT_CACHE),
        ("uncompressed_cache_max_cols", 5),
    ])

    # Skip aliases when counting columns: compare the number of distinct
    # (table, column) pairs against the configured maximum.
    referenced_columns = {
        (c.table_name, c.column_name)
        for c in clickhouse_query.get_all_ast_referenced_columns()
    }
    if len(referenced_columns) > uc_max:
        use_cache = False

    execute = partial(
        execute_query_with_rate_limits,
        clickhouse_query,
        request_settings,
        formatted_query,
        reader,
        timer,
        stats,
        query_settings,
    )
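    # ``execute`` is now a zero-argument callable: invoking it runs the
    # rate-limited query with the arguments bound above.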

    with sentry_sdk.start_span(description="execute", op="db") as span:
        if use_cache:
            key = get_query_cache_key(formatted_query)
            result = cache.get(key)
            timer.mark("cache_get")
            stats["cache_hit"] = result is not None
            if result is not None:
                span.set_tag("cache", "hit")
                return result

            span.set_tag("cache", "miss")
            result = execute()
            cache.set(key, result)
            timer.mark("cache_set")
            return result
        else:
            return execute()
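
The cached branch above is a standard cache-aside lookup: derive a key from the formatted SQL, return a hit immediately, and on a miss run the query and populate the cache before returning. A minimal standalone sketch of that pattern, using a plain dict and hypothetical names instead of Snuba's cache, timer and stats objects:

from typing import Any, Callable, Dict, Optional

_result_cache: Dict[str, Any] = {}  # stand-in for the real cache backend


def cached_execute(cache_key: str, run_query: Callable[[], Any]) -> Any:
    # Cache hit: return the stored result without touching the database.
    cached: Optional[Any] = _result_cache.get(cache_key)
    if cached is not None:
        return cached
    # Cache miss: run the query, store the result, then return it.
    result = run_query()
    _result_cache[cache_key] = result
    return result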
Example no. 2
def execute_query(
    # TODO: Passing the whole clickhouse query here is needed as long
    # as the execute method depends on it. Otherwise we can make this
    # file rely either entirely on clickhouse query or entirely on
    # the formatter.
    clickhouse_query: Query,
    request_settings: RequestSettings,
    formatted_query: SqlQuery,
    reader: Reader[SqlQuery],
    timer: Timer,
    stats: MutableMapping[str, Any],
    query_settings: MutableMapping[str, Any],
) -> Result:
    """
    Execute a query and return a result.
    """
    # Experiment: if we are going to grab more than X columns' worth of data,
    # don't use the uncompressed cache in ClickHouse.
    uc_max = state.get_config("uncompressed_cache_max_cols", 5)
    # Skip aliases when counting columns: compare the number of distinct
    # (table, column) pairs against the configured maximum.
    referenced_columns = {
        (c.table_name, c.column_name)
        for c in clickhouse_query.get_all_ast_referenced_columns()
    }
    if len(referenced_columns) > uc_max:
        query_settings["use_uncompressed_cache"] = 0

    # Force the query to use the first shard replica, which should have
    # synchronously received any cluster writes before this query is run.
    consistent = request_settings.get_consistent()
    stats["consistent"] = consistent
    if consistent:
        query_settings["load_balancing"] = "in_order"
        query_settings["max_threads"] = 1

    result = reader.execute(
        formatted_query,
        query_settings,
        with_totals=clickhouse_query.has_totals(),
    )

    timer.mark("execute")
    stats.update({
        "result_rows": len(result["data"]),
        "result_cols": len(result["meta"])
    })

    return result
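
The only channel into ClickHouse here is the ``query_settings`` mapping: consistent reads are implemented by routing to the first replica (``load_balancing = "in_order"``) and capping the query at one thread, exactly as set above. A small sketch of that settings-building step in isolation, with a hypothetical helper name and plain dicts rather than the Snuba types:

from typing import Any, MutableMapping


def apply_consistent_settings(
    query_settings: MutableMapping[str, Any], consistent: bool
) -> MutableMapping[str, Any]:
    if consistent:
        # Route to the first in-order replica, which should have received any
        # synchronous cluster writes, and cap the query at a single thread.
        query_settings["load_balancing"] = "in_order"
        query_settings["max_threads"] = 1
    return query_settings


# apply_consistent_settings({"max_threads": 8}, consistent=True)
# -> {"max_threads": 1, "load_balancing": "in_order"}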
Example no. 3
def _get_all_columns(query: Query) -> Columnset:
    return {c.column_name for c in query.get_all_ast_referenced_columns()}
Example no. 4
    def execute(
        self,
        query: Query,
        request_settings: RequestSettings,
        runner: SplitQueryRunner,
    ) -> Optional[QueryResult]:
        """
        Split query in 2 steps if a large number of columns is being selected.
            - First query only selects event_id, project_id and timestamp.
            - Second query selects all fields for only those events.
            - Shrink the date range.
        """
        limit = query.get_limit()
        if (limit is None or limit == 0 or query.get_groupby()
                or query.get_aggregations()
                or not query.get_selected_columns()):
            return None

        if limit > settings.COLUMN_SPLIT_MAX_LIMIT:
            metrics.increment("column_splitter.query_above_limit")
            return None

        # Do not split if there is already a = or IN condition on an ID column
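        # (i.e. an EQ or IN function call whose first argument is the id column,
        # regardless of the right-hand side)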
        id_column_matcher = FunctionCall(
            Or([String(ConditionFunctions.EQ),
                String(ConditionFunctions.IN)]),
            (
                Column(None, String(self.__id_column)),
                AnyExpression(),
            ),
        )

        for expr in query.get_condition_from_ast() or []:
            match = id_column_matcher.match(expr)

            if match:
                return None

        # We need to count the number of table/column name pairs, not the number
        # of distinct Column objects in the query, so as to avoid counting
        # aliased columns multiple times.
        total_columns = {(col.table_name, col.column_name)
                         for col in query.get_all_ast_referenced_columns()}

        minimal_query = copy.deepcopy(query)
        minimal_query.set_selected_columns(
            [self.__id_column, self.__project_column, self.__timestamp_column])
        # TODO: provide the table alias name to this splitter if we ever use it
        # in joins.
        minimal_query.set_ast_selected_columns([
            SelectedExpression(self.__id_column,
                               ColumnExpr(None, None, self.__id_column)),
            SelectedExpression(self.__project_column,
                               ColumnExpr(None, None, self.__project_column)),
            SelectedExpression(
                self.__timestamp_column,
                ColumnExpr(None, None, self.__timestamp_column),
            ),
        ])

        for exp in minimal_query.get_all_expressions():
            if exp.alias in (
                    self.__id_column,
                    self.__project_column,
                    self.__timestamp_column,
            ) and not (isinstance(exp, ColumnExpr)
                       and exp.column_name == exp.alias):
                logger.warning(
                    "Potential alias shadowing due to column splitter",
                    extra={"expression": exp},
                    exc_info=True,
                )

        minimal_columns = {
            (col.table_name, col.column_name)
            for col in minimal_query.get_all_ast_referenced_columns()
        }
        if len(total_columns) <= len(minimal_columns):
            return None

        # Ensures the AST minimal query is actually runnable on its own.
        if not minimal_query.validate_aliases():
            return None

        legacy_references = set(minimal_query.get_all_referenced_columns())
        ast_column_names = {
            c.column_name
            for c in minimal_query.get_all_ast_referenced_columns()
        }
        # Ensures the legacy minimal query (which does not expand alias references)
        # does not contain alias references we removed when creating minimal_query.
        if legacy_references - ast_column_names:
            metrics.increment("columns.skip_invalid_legacy_query")
            return None

        result = runner(minimal_query, request_settings)
        del minimal_query

        if not result.result["data"]:
            return None

        # Make a copy so the original query is not mutated: if the runner returns
        # None, the execution strategy ignores the result of this splitter and
        # tries the next one with the original query.
        query = copy.deepcopy(query)

        event_ids = list(
            set([event[self.__id_column] for event in result.result["data"]]))
        if len(event_ids) > settings.COLUMN_SPLIT_MAX_RESULTS:
            # We may be running a query that is beyond the ClickHouse maximum
            # query size, so we cowardly abandon the split.
            metrics.increment(
                "column_splitter.intermediate_results_beyond_limit")
            return None

        query.add_conditions([(self.__id_column, "IN", event_ids)])
        query.add_condition_to_ast(
            in_condition(
                None,
                ColumnExpr(None, None, self.__id_column),
                [LiteralExpr(None, e_id) for e_id in event_ids],
            ))
        query.set_offset(0)
        # TODO: This is technically wrong. Event ids are unique per project, not globally.
        # So, if the minimal query only returned the same event_id from two projects, we
        # would be underestimating the limit here.
        query.set_limit(len(event_ids))

        project_ids = list(
            set([
                event[self.__project_column] for event in result.result["data"]
            ]))
        _replace_condition(
            query,
            self.__project_column,
            "IN",
            project_ids,
        )
        _replace_ast_condition(
            query,
            self.__project_column,
            "IN",
            literals_tuple(None,
                           [LiteralExpr(None, p_id) for p_id in project_ids]),
        )

        timestamps = [
            event[self.__timestamp_column] for event in result.result["data"]
        ]
        _replace_condition(
            query,
            self.__timestamp_column,
            ">=",
            util.parse_datetime(min(timestamps)).isoformat(),
        )
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            ">=",
            LiteralExpr(None, util.parse_datetime(min(timestamps))),
        )
        # We add 1 second since this gets translated to ('timestamp', '<', to_date)
        # and events are stored with a granularity of 1 second.
        _replace_condition(
            query,
            self.__timestamp_column,
            "<",
            (util.parse_datetime(max(timestamps)) +
             timedelta(seconds=1)).isoformat(),
        )
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            "<",
            LiteralExpr(
                None,
                (util.parse_datetime(max(timestamps)) + timedelta(seconds=1)),
            ),
        )

        return runner(query, request_settings)
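
Taken together, ``execute`` makes two passes: a cheap query that selects only the id, project and timestamp columns, then the original wide query narrowed to the ids, projects and the shrunken time window found by the first pass. A schematic sketch of that flow, using hypothetical helpers over plain dicts (not the real Snuba Query/runner API) and assuming timestamps come back as datetime objects:

from datetime import timedelta
from typing import Any, Callable, Dict, List, Optional

Row = Dict[str, Any]
QuerySpec = Dict[str, Any]


def split_by_columns(
    run: Callable[[QuerySpec], List[Row]],
    wide_query: QuerySpec,
    max_ids: int = 10000,
) -> Optional[List[Row]]:
    # Pass 1: fetch only the narrow identifying columns.
    narrow_query = dict(
        wide_query, selected=["event_id", "project_id", "timestamp"]
    )
    rows = run(narrow_query)
    if not rows:
        return None

    event_ids = sorted({row["event_id"] for row in rows})
    if len(event_ids) > max_ids:
        # Mirroring COLUMN_SPLIT_MAX_RESULTS: too many ids to inline safely.
        return None

    timestamps = [row["timestamp"] for row in rows]
    # Pass 2: the original column list, restricted to the ids found above and
    # to a closed-open time window padded by one second at the upper end.
    narrowed = dict(
        wide_query,
        event_id_in=event_ids,
        project_id_in=sorted({row["project_id"] for row in rows}),
        from_ts=min(timestamps),
        to_ts=max(timestamps) + timedelta(seconds=1),
        limit=len(event_ids),
    )
    return run(narrowed)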