Example #1
def execute_query_with_rate_limits(
    clickhouse_query: Union[Query, CompositeQuery[Table]],
    request_settings: RequestSettings,
    formatted_query: FormattedQuery,
    reader: Reader,
    timer: Timer,
    stats: MutableMapping[str, Any],
    query_settings: MutableMapping[str, Any],
) -> Result:
    # XXX: We should consider moving this so that it applies to the logical query,
    # not the physical query.
    with RateLimitAggregator(request_settings.get_rate_limit_params()
                             ) as rate_limit_stats_container:
        stats.update(rate_limit_stats_container.to_dict())
        timer.mark("rate_limit")

        project_rate_limit_stats = rate_limit_stats_container.get_stats(
            PROJECT_RATE_LIMIT_NAME)

        if ("max_threads" in query_settings
                and project_rate_limit_stats is not None
                and project_rate_limit_stats.concurrent > 1):
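            # Scale max_threads down by one for each other concurrent query from
            # this project (concurrent - 1), but never drop below a single thread.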
            maxt = query_settings["max_threads"]
            query_settings["max_threads"] = max(
                1, maxt - project_rate_limit_stats.concurrent + 1)

        return execute_query(
            clickhouse_query,
            request_settings,
            formatted_query,
            reader,
            timer,
            stats,
            query_settings,
        )
Example #2
def test_timer_send_metrics() -> None:
    backend = TestingMetricsBackend()

    time = TestingClock()
    set_tags = {"foo": "bar", "blue": "car"}
    t = Timer("timer", clock=time, tags=set_tags)
    time.sleep(10)
    t.mark("thing1")
    time.sleep(10)
    t.mark("thing2")
    t.send_metrics_to(
        backend,
        tags={"key": "value"},
        mark_tags={"mark-key": "mark-value", "blue": "dog"},
    )

    overridden_tags = {"foo": "bar", "blue": "dog"}
    assert backend.calls == [
        Timing("timer", (10.0 + 10.0) * 1000, {"key": "value", **set_tags}),
        Timing(
            "timer.thing1", 10.0 * 1000, {"mark-key": "mark-value", **overridden_tags}
        ),
        Timing(
            "timer.thing2", 10.0 * 1000, {"mark-key": "mark-value", **overridden_tags}
        ),
    ]
Example #3
def _format_storage_query_and_run(
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
    referrer: str,
    clickhouse_query: Union[Query, CompositeQuery[Table]],
    request_settings: RequestSettings,
    reader: Reader,
    robust: bool,
    concurrent_queries_gauge: Optional[Gauge] = None,
) -> QueryResult:
    """
    Formats the Storage Query and passes it to the DB-specific code for execution.
    """
    from_clause = clickhouse_query.get_from_clause()
    visitor = TablesCollector()
    visitor.visit(from_clause)
    table_names = ",".join(sorted(visitor.get_tables()))
    with sentry_sdk.start_span(description="create_query", op="db") as span:
        _apply_turbo_sampling_if_needed(clickhouse_query, request_settings)

        formatted_query = format_query(clickhouse_query)
        span.set_data("query", formatted_query.structured())
        span.set_data("query_size_bytes",
                      _string_size_in_bytes(formatted_query.get_sql()))
        sentry_sdk.set_tag("query_size_group",
                           get_query_size_group(formatted_query.get_sql()))
        metrics.increment("execute")

    timer.mark("prepare_query")

    stats = {
        "clickhouse_table": table_names,
        "final": visitor.any_final(),
        "referrer": referrer,
        "sample": visitor.get_sample_rate(),
    }

    with sentry_sdk.start_span(description=formatted_query.get_sql(),
                               op="db") as span:
        span.set_tag("table", table_names)

        def execute() -> QueryResult:
            return raw_query(
                clickhouse_query,
                request_settings,
                formatted_query,
                reader,
                timer,
                query_metadata,
                stats,
                span.trace_id,
                robust=robust,
            )

        if concurrent_queries_gauge is not None:
            with concurrent_queries_gauge:
                return execute()
        else:
            return execute()
Example #4
def execute_query_with_rate_limits(
    clickhouse_query: Union[Query, CompositeQuery[Table]],
    query_settings: QuerySettings,
    formatted_query: FormattedQuery,
    reader: Reader,
    timer: Timer,
    stats: MutableMapping[str, Any],
    clickhouse_query_settings: MutableMapping[str, Any],
    robust: bool,
) -> Result:
    # The global rate limiter is added at the end of the chain so that it is
    # evaluated last. This way we do not borrow capacity from the global quota
    # when one of the more specific limiters (like the project rate limiter)
    # rejects the query first.
    query_settings.add_rate_limit(get_global_rate_limit_params())
    # XXX: We should consider moving this so that it applies to the logical query,
    # not the physical query.
    with RateLimitAggregator(
        query_settings.get_rate_limit_params()
    ) as rate_limit_stats_container:
        stats.update(rate_limit_stats_container.to_dict())
        timer.mark("rate_limit")

        project_rate_limit_stats = rate_limit_stats_container.get_stats(
            PROJECT_RATE_LIMIT_NAME
        )

        thread_quota = query_settings.get_resource_quota()
        if (
            ("max_threads" in clickhouse_query_settings or thread_quota is not None)
            and project_rate_limit_stats is not None
            and project_rate_limit_stats.concurrent > 1
        ):
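            # Start from the resource quota's max_threads when one is set,
            # otherwise from the current ClickHouse setting, then give back one
            # thread per other concurrent query from this project (never below 1).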
            maxt = (
                clickhouse_query_settings["max_threads"]
                if thread_quota is None
                else thread_quota.max_threads
            )
            clickhouse_query_settings["max_threads"] = max(
                1, maxt - project_rate_limit_stats.concurrent + 1
            )

        _record_rate_limit_metrics(rate_limit_stats_container, reader, stats)

        return execute_query(
            clickhouse_query,
            query_settings,
            formatted_query,
            reader,
            timer,
            stats,
            clickhouse_query_settings,
            robust=robust,
        )
Example #5
def execute_query_with_caching(
    clickhouse_query: Union[Query, CompositeQuery[Table]],
    query_settings: QuerySettings,
    formatted_query: FormattedQuery,
    reader: Reader,
    timer: Timer,
    stats: MutableMapping[str, Any],
    clickhouse_query_settings: MutableMapping[str, Any],
    robust: bool,
) -> Result:
    # XXX: ``uncompressed_cache_max_cols`` controls both the result cache and
    # the uncompressed cache. These should be independent.
    use_cache, uc_max = state.get_configs(
        [("use_cache", settings.USE_RESULT_CACHE), ("uncompressed_cache_max_cols", 5)]
    )

    column_counter = ReferencedColumnsCounter()
    column_counter.visit(clickhouse_query.get_from_clause())
    assert isinstance(uc_max, int)
    if column_counter.count_columns() > uc_max:
        use_cache = False

    execute = partial(
        execute_query_with_rate_limits,
        clickhouse_query,
        query_settings,
        formatted_query,
        reader,
        timer,
        stats,
        clickhouse_query_settings,
        robust=robust,
    )

    with sentry_sdk.start_span(description="execute", op="db") as span:
        key = get_query_cache_key(formatted_query)
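        # Reuse the result cache key as the ClickHouse query_id so the cache
        # entry and the query visible in ClickHouse share one identifier.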
        clickhouse_query_settings["query_id"] = key
        if use_cache:
            cache_partition = _get_cache_partition(reader)
            result = cache_partition.get(key)
            timer.mark("cache_get")
            stats["cache_hit"] = result is not None
            if result is not None:
                span.set_tag("cache", "hit")
                return result

            span.set_tag("cache", "miss")
            result = execute()
            cache_partition.set(key, result)
            timer.mark("cache_set")
            return result
        else:
            return execute()
Example #6
def validate_request_content(body, schema: RequestSchema, timer: Timer,
                             dataset: Dataset, referrer: str) -> Request:
    with sentry_sdk.start_span(description="validate_request_content",
                               op="validate") as span:
        try:
            request = schema.validate(body, dataset, referrer)
            span.set_data("snuba_query", request.body)
        except jsonschema.ValidationError as error:
            raise BadRequest(str(error)) from error

        timer.mark("validate_schema")

    return request
Example #7
def build_request(
    body: MutableMapping[str, Any],
    parser: Parser,
    settings_class: Union[Type[HTTPRequestSettings], Type[SubscriptionRequestSettings]],
    schema: RequestSchema,
    dataset: Dataset,
    timer: Timer,
    referrer: str,
) -> Request:
    with sentry_sdk.start_span(description="build_request", op="validate") as span:
        try:
            request_parts = schema.validate(body)
            if settings_class == HTTPRequestSettings:
                settings = {
                    **request_parts.settings,
                    "consistent": _consistent_override(
                        request_parts.settings.get("consistent", False), referrer
                    ),
                }
                settings_obj: Union[
                    HTTPRequestSettings, SubscriptionRequestSettings
                ] = settings_class(**settings)
            elif settings_class == SubscriptionRequestSettings:
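                # Subscription queries always ask for consistent results,
                # subject to the referrer-based override.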
                settings_obj = settings_class(
                    consistent=_consistent_override(True, referrer)
                )

            query = parser(request_parts, settings_obj, dataset)

            request_id = uuid.uuid4().hex
            request = Request(
                request_id,
                # TODO: Replace this with the actual raw query body.
                # This can have an impact on subscriptions, so we need
                # to be careful with the change.
                ChainMap(request_parts.query, *request_parts.extensions.values()),
                query,
                settings_obj,
                referrer,
            )
        except (InvalidJsonRequestException, InvalidQueryException) as exception:
            record_invalid_request(timer, referrer)
            raise exception
        except Exception as exception:
            record_error_building_request(timer, referrer)
            raise exception

        span.set_data("snuba_query", request.body)

        timer.mark("validate_schema")
        return request
Example #8
def execute_query_with_caching(
    clickhouse_query: Query,
    request_settings: RequestSettings,
    formatted_query: SqlQuery,
    reader: Reader[SqlQuery],
    timer: Timer,
    stats: MutableMapping[str, Any],
    query_settings: MutableMapping[str, Any],
) -> Result:
    # XXX: ``uncompressed_cache_max_cols`` controls both the result cache and
    # the uncompressed cache. These should be independent.
    use_cache, uc_max = state.get_configs([("use_cache",
                                            settings.USE_RESULT_CACHE),
                                           ("uncompressed_cache_max_cols", 5)])

    if (len(
            set((
                # Skip aliases when counting columns
                (c.table_name, c.column_name)
                for c in clickhouse_query.get_all_ast_referenced_columns()))) >
            uc_max):
        use_cache = False

    execute = partial(
        execute_query_with_rate_limits,
        clickhouse_query,
        request_settings,
        formatted_query,
        reader,
        timer,
        stats,
        query_settings,
    )

    with sentry_sdk.start_span(description="execute", op="db") as span:
        if use_cache:
            key = get_query_cache_key(formatted_query)
            result = cache.get(key)
            timer.mark("cache_get")
            stats["cache_hit"] = result is not None
            if result is not None:
                span.set_tag("cache", "hit")
                return result

            span.set_tag("cache", "miss")
            result = execute()
            cache.set(key, result)
            timer.mark("cache_set")
            return result
        else:
            return execute()
Example #9
def test_timer() -> None:
    time = TestingClock()

    t = Timer("timer", clock=time)
    time.sleep(10.0)
    t.mark("thing1")
    time.sleep(10.0)
    t.mark("thing2")
    assert t.finish() == {
        "timestamp": 0.0,
        "duration_ms": (10.0 + 10.0) * 1000,
        "marks_ms": {"thing1": 10.0 * 1000, "thing2": 10.0 * 1000},
        "tags": {},
    }
    assert t.get_duration_group() == ">20s"

    # Test that we can add more time under the same marks and the time will
    # be cumulatively added under those keys.
    time.sleep(10.0)
    t.mark("thing1")
    time.sleep(10.0)
    t.mark("thing2")
    assert t.finish() == {
        "timestamp": 0.0,
        "duration_ms": (10.0 + 10.0) * 2 * 1000,
        "marks_ms": {"thing1": 10.0 * 2 * 1000, "thing2": 10.0 * 2 * 1000},
        "tags": {},
    }
    assert t.get_duration_group() == ">30s"
Example #10
def _format_storage_query_and_run(
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
    from_date: datetime,
    to_date: datetime,
    referrer: str,
    clickhouse_query: Query,
    request_settings: RequestSettings,
    reader: Reader[SqlQuery],
) -> QueryResult:
    """
    Formats the Storage Query and passes it to the DB-specific code for execution.
    """

    # TODO: This function (or, rather, a wrapper of this function) is where we
    # will transform the result according to the SelectedExpression objects in
    # the query, to ensure the fields in the QueryResult have the names the
    # user expects.

    source = clickhouse_query.get_data_source().format_from()
    with sentry_sdk.start_span(description="create_query", op="db") as span:
        formatted_query = AstSqlQuery(clickhouse_query, request_settings)
        span.set_data("query", formatted_query.sql_data())
        metrics.increment("execute")

    timer.mark("prepare_query")

    stats = {
        "clickhouse_table": source,
        "final": clickhouse_query.get_final(),
        "referrer": referrer,
        "num_days": (to_date - from_date).days,
        "sample": clickhouse_query.get_sample(),
    }

    with sentry_sdk.start_span(
        description=formatted_query.format_sql(), op="db"
    ) as span:
        span.set_tag("table", source)

        return raw_query(
            clickhouse_query,
            request_settings,
            formatted_query,
            reader,
            timer,
            query_metadata,
            stats,
            span.trace_id,
        )
Example #11
def execute_query(
    # TODO: Passing the whole clickhouse query here is needed as long
    # as the execute method depends on it. Otherwise we can make this
    # file rely either entirely on clickhouse query or entirely on
    # the formatter.
    clickhouse_query: Union[Query, CompositeQuery[Table]],
    request_settings: RequestSettings,
    formatted_query: FormattedQuery,
    reader: Reader,
    timer: Timer,
    stats: MutableMapping[str, Any],
    query_settings: MutableMapping[str, Any],
    robust: bool,
) -> Result:
    """
    Execute a query and return a result.
    """
    # Experiment: if we are going to grab more than X columns worth of data,
    # don't use uncompressed_cache in ClickHouse.
    uc_max = state.get_config("uncompressed_cache_max_cols", 5)
    assert isinstance(uc_max, int)
    column_counter = ReferencedColumnsCounter()
    column_counter.visit(clickhouse_query.get_from_clause())
    if column_counter.count_columns() > uc_max:
        query_settings["use_uncompressed_cache"] = 0

    # Force query to use the first shard replica, which
    # should have synchronously received any cluster writes
    # before this query is run.
    consistent = request_settings.get_consistent()
    stats["consistent"] = consistent
    if consistent:
        query_settings["load_balancing"] = "in_order"
        query_settings["max_threads"] = 1

    result = reader.execute(
        formatted_query,
        query_settings,
        with_totals=clickhouse_query.has_totals(),
        robust=robust,
    )

    timer.mark("execute")
    stats.update({
        "result_rows": len(result["data"]),
        "result_cols": len(result["meta"])
    })

    return result
Example #12
def execute_query(
    # TODO: Passing the whole clickhouse query here is needed as long
    # as the execute method depends on it. Otherwise we can make this
    # file rely either entirely on clickhouse query or entirely on
    # the formatter.
    clickhouse_query: Query,
    request_settings: RequestSettings,
    formatted_query: SqlQuery,
    reader: Reader[SqlQuery],
    timer: Timer,
    stats: MutableMapping[str, Any],
    query_settings: MutableMapping[str, Any],
) -> Result:
    """
    Execute a query and return a result.
    """
    # Experiment: if we are going to grab more than X columns worth of data,
    # don't use uncompressed_cache in ClickHouse.
    uc_max = state.get_config("uncompressed_cache_max_cols", 5)
    if (len(
            set((
                # Skip aliases when counting columns
                (c.table_name, c.column_name)
                for c in clickhouse_query.get_all_ast_referenced_columns()))) >
            uc_max):
        query_settings["use_uncompressed_cache"] = 0

    # Force query to use the first shard replica, which
    # should have synchronously received any cluster writes
    # before this query is run.
    consistent = request_settings.get_consistent()
    stats["consistent"] = consistent
    if consistent:
        query_settings["load_balancing"] = "in_order"
        query_settings["max_threads"] = 1

    result = reader.execute(
        formatted_query,
        query_settings,
        with_totals=clickhouse_query.has_totals(),
    )

    timer.mark("execute")
    stats.update({
        "result_rows": len(result["data"]),
        "result_cols": len(result["meta"])
    })

    return result
Example #13
def _format_storage_query_and_run(
    # TODO: remove the dependency on Dataset. It is only needed to format the
    # legacy ClickhouseQuery; with the AST this won't be necessary.
    dataset: Dataset,
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
    from_date: datetime,
    to_date: datetime,
    request: Request,
) -> RawQueryResult:
    """
    Formats the Storage Query and passes it to the DB-specific code for execution.
    TODO: Once the AST and the StorageQuery abstraction are in production,
    this function will probably collapse and disappear.
    """

    source = request.query.get_data_source().format_from()
    with sentry_sdk.start_span(description="create_query", op="db"):
        # TODO: Move the performance logic and the pre_where generation into
        # ClickhouseQuery since they are Clickhouse specific
        query = DictClickhouseQuery(dataset, request.query, request.settings)
    timer.mark("prepare_query")

    stats = {
        "clickhouse_table": source,
        "final": request.query.get_final(),
        "referrer": request.referrer,
        "num_days": (to_date - from_date).days,
        "sample": request.query.get_sample(),
    }

    with sentry_sdk.start_span(description=query.format_sql(),
                               op="db") as span:
        span.set_tag("table", source)
        try:
            span.set_tag(
                "ast_query",
                AstClickhouseQuery(request.query,
                                   request.settings).format_sql(),
            )
        except Exception:
            logger.warning("Failed to format ast query", exc_info=True)

        return raw_query(request, query, timer, query_metadata, stats,
                         span.trace_id)
Example #14
def build_request(body, schema: RequestSchema, timer: Timer, dataset: Dataset,
                  referrer: str) -> Request:
    with sentry_sdk.start_span(description="build_request",
                               op="validate") as span:
        try:
            request = schema.validate(body, dataset, referrer)
        except (InvalidJsonRequestException,
                InvalidQueryException) as exception:
            record_invalid_request(timer, referrer)
            raise exception
        except Exception as exception:
            record_error_building_request(timer, referrer)
            raise exception

        span.set_data("snuba_query", request.body)

        timer.mark("validate_schema")
        return request
Example #15
def test_timer_send_metrics() -> None:
    backend = TestingMetricsBackend()

    time = TestingClock()

    t = Timer("timer", clock=time)
    time.sleep(10)
    t.mark("thing1")
    time.sleep(10)
    t.mark("thing2")
    t.send_metrics_to(backend,
                      tags={"key": "value"},
                      mark_tags={"mark-key": "mark-value"})

    assert backend.calls == [
        Timing("timer", (10.0 + 10.0) * 1000, {"key": "value"}),
        Timing("timer.thing1", 10.0 * 1000, {"mark-key": "mark-value"}),
        Timing("timer.thing2", 10.0 * 1000, {"mark-key": "mark-value"}),
    ]
Example #16
def _format_storage_query_and_run(
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
    referrer: str,
    clickhouse_query: Union[Query, CompositeQuery[Table]],
    request_settings: RequestSettings,
    reader: Reader,
) -> QueryResult:
    """
    Formats the Storage Query and passes it to the DB-specific code for execution.
    """
    from_clause = clickhouse_query.get_from_clause()
    visitor = TablesCollector()
    visitor.visit(from_clause)
    table_names = ",".join(sorted(visitor.get_tables()))
    with sentry_sdk.start_span(description="create_query", op="db") as span:
        formatted_query = format_query(clickhouse_query, request_settings)
        span.set_data("query", formatted_query.structured())
        metrics.increment("execute")

    timer.mark("prepare_query")

    stats = {
        "clickhouse_table": table_names,
        "final": visitor.any_final(),
        "referrer": referrer,
        "sample": visitor.get_sample_rate(),
    }

    with sentry_sdk.start_span(description=formatted_query.get_sql(),
                               op="db") as span:
        span.set_tag("table", table_names)

        return raw_query(
            clickhouse_query,
            request_settings,
            formatted_query,
            reader,
            timer,
            query_metadata,
            stats,
            span.trace_id,
        )
Example #17
def _format_storage_query_and_run(
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
    from_date: datetime,
    to_date: datetime,
    referrer: str,
    clickhouse_query: Query,
    request_settings: RequestSettings,
    reader: Reader[SqlQuery],
) -> QueryResult:
    """
    Formats the Storage Query and passes it to the DB-specific code for execution.
    """
    source = clickhouse_query.get_from_clause().format_from()
    with sentry_sdk.start_span(description="create_query", op="db") as span:
        formatted_query = AstSqlQuery(clickhouse_query, request_settings)
        span.set_data("query", formatted_query.sql_data())
        metrics.increment("execute")

    timer.mark("prepare_query")

    stats = {
        "clickhouse_table": source,
        "final": clickhouse_query.get_final(),
        "referrer": referrer,
        "num_days": (to_date - from_date).days,
        "sample": clickhouse_query.get_sample(),
    }

    with sentry_sdk.start_span(description=formatted_query.format_sql(),
                               op="db") as span:
        span.set_tag("table", source)

        return raw_query(
            clickhouse_query,
            request_settings,
            formatted_query,
            reader,
            timer,
            query_metadata,
            stats,
            span.trace_id,
        )
Example #18
def _format_storage_query_and_run(
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
    referrer: str,
    clickhouse_query: Union[Query, CompositeQuery[Table]],
    query_settings: QuerySettings,
    reader: Reader,
    robust: bool,
    concurrent_queries_gauge: Optional[Gauge] = None,
) -> QueryResult:
    """
    Formats the Storage Query and passes it to the DB-specific code for execution.
    """
    from_clause = clickhouse_query.get_from_clause()
    visitor = TablesCollector()
    visitor.visit(from_clause)
    table_names = ",".join(sorted(visitor.get_tables()))
    with sentry_sdk.start_span(description="create_query", op="db") as span:
        _apply_turbo_sampling_if_needed(clickhouse_query, query_settings)

        formatted_query = format_query(clickhouse_query)
        query_size_bytes = len(formatted_query.get_sql().encode("utf-8"))
        span.set_data("query", formatted_query.structured())
        span.set_data("query_size_bytes", query_size_bytes)
        sentry_sdk.set_tag("query_size_group",
                           get_query_size_group(query_size_bytes))
        metrics.increment("execute")

    timer.mark("prepare_query")

    stats = {
        "clickhouse_table": table_names,
        "final": visitor.any_final(),
        "referrer": referrer,
        "sample": visitor.get_sample_rate(),
    }

    if query_size_bytes > MAX_QUERY_SIZE_BYTES:
        raise QueryException(extra=QueryExtraData(
            stats=stats,
            sql=formatted_query.get_sql(),
            experiments=clickhouse_query.get_experiments(),
        )) from QueryTooLongException(
            f"After processing, query is {query_size_bytes} bytes, "
            "which is too long for ClickHouse to process. "
            f"Max size is {MAX_QUERY_SIZE_BYTES} bytes.")

    with sentry_sdk.start_span(description=formatted_query.get_sql(),
                               op="db") as span:
        span.set_tag("table", table_names)

        def execute() -> QueryResult:
            return raw_query(
                clickhouse_query,
                query_settings,
                formatted_query,
                reader,
                timer,
                query_metadata,
                stats,
                span.trace_id,
                robust=robust,
            )

        if concurrent_queries_gauge is not None:
            with concurrent_queries_gauge:
                return execute()
        else:
            return execute()
Example #19
def raw_query(
    # TODO: Passing the whole clickhouse query here is needed as long
    # as the execute method depends on it. Otherwise we can make this
    # file rely either entirely on clickhouse query or entirely on
    # the formatter.
    clickhouse_query: Union[Query, CompositeQuery[Table]],
    request_settings: RequestSettings,
    formatted_query: FormattedQuery,
    reader: Reader,
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
    stats: MutableMapping[str, Any],
    trace_id: Optional[str] = None,
) -> QueryResult:
    """
    Submits a raw SQL query to the DB and does some post-processing on it to
    fix some of the formatting issues in the result JSON.
    This function is not supposed to depend on anything higher level than the clickhouse
    query. If this function ends up depending on the dataset, something is wrong.
    """
    all_confs = state.get_all_configs()
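    # Collect ClickHouse settings from the runtime config: every key under the
    # "query_settings/" namespace is forwarded with the prefix stripped.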
    query_settings: MutableMapping[str, Any] = {
        k.split("/", 1)[1]: v
        for k, v in all_confs.items() if k.startswith("query_settings/")
    }

    timer.mark("get_configs")

    sql = formatted_query.get_sql()

    update_with_status = partial(
        update_query_metadata_and_stats,
        clickhouse_query,
        sql,
        timer,
        stats,
        query_metadata,
        query_settings,
        trace_id,
    )

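    # Use the read-through cache strategy unless it has been disabled through
    # the runtime config.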
    execute_query_strategy = (
        execute_query_with_readthrough_caching if state.get_config(
            "use_readthrough_query_cache", 1) else execute_query_with_caching)

    try:
        result = execute_query_strategy(
            clickhouse_query,
            request_settings,
            formatted_query,
            reader,
            timer,
            stats,
            query_settings,
        )
    except Exception as cause:
        if isinstance(cause, RateLimitExceeded):
            stats = update_with_status(QueryStatus.RATE_LIMITED)
        else:
            with configure_scope() as scope:
                if isinstance(cause, ClickhouseError):
                    scope.fingerprint = ["{{default}}", str(cause.code)]
                logger.exception("Error running query: %s\n%s", sql, cause)
            stats = update_with_status(QueryStatus.ERROR)
        raise QueryException({"stats": stats, "sql": sql}) from cause
    else:
        stats = update_with_status(QueryStatus.SUCCESS)
        return QueryResult(result, {"stats": stats, "sql": sql})
Example #20
def raw_query(
    # TODO: Passing the whole clickhouse query here is needed as long
    # as the execute method depends on it. Otherwise we can make this
    # file rely either entirely on clickhouse query or entirely on
    # the formatter.
    clickhouse_query: Union[Query, CompositeQuery[Table]],
    query_settings: QuerySettings,
    formatted_query: FormattedQuery,
    reader: Reader,
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
    stats: MutableMapping[str, Any],
    trace_id: Optional[str] = None,
    robust: bool = False,
) -> QueryResult:
    """
    Submits a raw SQL query to the DB and does some post-processing on it to
    fix some of the formatting issues in the result JSON.
    This function is not supposed to depend on anything higher level than the clickhouse
    query. If this function ends up depending on the dataset, something is wrong.
    """
    all_confs = state.get_all_configs()
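    # Collect ClickHouse settings from the runtime config: every key under the
    # "query_settings/" namespace is forwarded with the prefix stripped.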
    clickhouse_query_settings: MutableMapping[str, Any] = {
        k.split("/", 1)[1]: v
        for k, v in all_confs.items()
        if k.startswith("query_settings/")
    }

    timer.mark("get_configs")

    sql = formatted_query.get_sql()

    update_with_status = partial(
        update_query_metadata_and_stats,
        clickhouse_query,
        sql,
        timer,
        stats,
        query_metadata,
        clickhouse_query_settings,
        trace_id,
    )

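    # Use the read-through cache strategy unless it has been disabled through
    # the runtime config.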
    execute_query_strategy = (
        execute_query_with_readthrough_caching
        if state.get_config("use_readthrough_query_cache", 1)
        else execute_query_with_caching
    )

    try:
        result = execute_query_strategy(
            clickhouse_query,
            query_settings,
            formatted_query,
            reader,
            timer,
            stats,
            clickhouse_query_settings,
            robust=robust,
        )
    except Exception as cause:
        if isinstance(cause, RateLimitExceeded):
            stats = update_with_status(QueryStatus.RATE_LIMITED)
        else:
            error_code = None
            with configure_scope() as scope:
                if isinstance(cause, ClickhouseError):
                    error_code = cause.code
                    scope.fingerprint = ["{{default}}", str(cause.code)]
                    if scope.span:
                        if cause.code == errors.ErrorCodes.TOO_SLOW:
                            sentry_sdk.set_tag("timeout", "predicted")
                        elif cause.code == errors.ErrorCodes.TIMEOUT_EXCEEDED:
                            sentry_sdk.set_tag("timeout", "query_timeout")
                        elif cause.code in (
                            errors.ErrorCodes.SOCKET_TIMEOUT,
                            errors.ErrorCodes.NETWORK_ERROR,
                        ):
                            sentry_sdk.set_tag("timeout", "network")
                elif isinstance(
                    cause,
                    (TimeoutError, ExecutionTimeoutError, TigerExecutionTimeoutError),
                ):
                    if scope.span:
                        sentry_sdk.set_tag("timeout", "cache_timeout")

                logger.exception("Error running query: %s\n%s", sql, cause)
            stats = update_with_status(QueryStatus.ERROR, error_code=error_code)
        raise QueryException(
            {
                "stats": stats,
                "sql": sql,
                "experiments": clickhouse_query.get_experiments(),
            }
        ) from cause
    else:
        stats = update_with_status(QueryStatus.SUCCESS, result["profile"])
        return QueryResult(
            result,
            {
                "stats": stats,
                "sql": sql,
                "experiments": clickhouse_query.get_experiments(),
            },
        )
Example #21
def raw_query(
    request: Request,
    query: ClickhouseQuery,
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
    stats: MutableMapping[str, Any],
    trace_id: Optional[str] = None,
) -> RawQueryResult:
    """
    Submits a raw SQL query to the DB and does some post-processing on it to
    fix some of the formatting issues in the result JSON.
    This function is not supposed to depend on anything higher level than the storage
    query (ClickhouseQuery as of now). If this function ends up depending on the
    dataset, something is wrong.

    TODO: As soon as we have a StorageQuery abstraction remove all the references
    to the original query from the request.
    """

    use_cache, use_deduper, uc_max = state.get_configs([
        ("use_cache", settings.USE_RESULT_CACHE),
        ("use_deduper", 1),
        ("uncompressed_cache_max_cols", 5),
    ])

    all_confs = state.get_all_configs()
    query_settings: MutableMapping[str, Any] = {
        k.split("/", 1)[1]: v
        for k, v in all_confs.items() if k.startswith("query_settings/")
    }

    # Experiment: if we are going to grab more than X columns worth of data,
    # don't use uncompressed_cache in clickhouse, or result cache in snuba.
    if len(request.query.get_all_referenced_columns()) > uc_max:
        query_settings["use_uncompressed_cache"] = 0
        use_cache = 0

    timer.mark("get_configs")

    sql = query.format_sql()
    query_id = md5(force_bytes(sql)).hexdigest()
    with state.deduper(query_id if use_deduper else None) as is_dupe:
        timer.mark("dedupe_wait")

        result = cache.get(query_id) if use_cache else None
        timer.mark("cache_get")

        stats.update({
            "is_duplicate": is_dupe,
            "query_id": query_id,
            "use_cache": bool(use_cache),
            "cache_hit": bool(result),
        })

        update_with_status = partial(
            update_query_metadata_and_stats,
            request,
            sql,
            timer,
            stats,
            query_metadata,
            query_settings,
            trace_id,
        )

        if not result:
            try:
                with RateLimitAggregator(
                        request.settings.get_rate_limit_params(
                        )) as rate_limit_stats_container:
                    stats.update(rate_limit_stats_container.to_dict())
                    timer.mark("rate_limit")

                    project_rate_limit_stats = rate_limit_stats_container.get_stats(
                        PROJECT_RATE_LIMIT_NAME)

                    if ("max_threads" in query_settings
                            and project_rate_limit_stats is not None
                            and project_rate_limit_stats.concurrent > 1):
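                        # Scale max_threads down by one for each other
                        # concurrent query from this project (concurrent - 1),
                        # but never drop below a single thread.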
                        maxt = query_settings["max_threads"]
                        query_settings["max_threads"] = max(
                            1, maxt - project_rate_limit_stats.concurrent + 1)

                    # Force query to use the first shard replica, which
                    # should have synchronously received any cluster writes
                    # before this query is run.
                    consistent = request.settings.get_consistent()
                    stats["consistent"] = consistent
                    if consistent:
                        query_settings["load_balancing"] = "in_order"
                        query_settings["max_threads"] = 1

                    try:
                        result = reader.execute(
                            query,
                            query_settings,
                            # All queries should already be deduplicated at this point
                            # But the query_id will let us know if they aren't
                            query_id=query_id if use_deduper else None,
                            with_totals=request.query.has_totals(),
                        )

                        timer.mark("execute")
                        stats.update({
                            "result_rows": len(result["data"]),
                            "result_cols": len(result["meta"]),
                        })

                        if use_cache:
                            cache.set(query_id, result)
                            timer.mark("cache_set")

                    except BaseException as ex:
                        error = str(ex)
                        logger.exception("Error running query: %s\n%s", sql,
                                         error)
                        stats = update_with_status("error")
                        meta = {}
                        if isinstance(ex, ClickhouseError):
                            err_type = "clickhouse"
                            meta["code"] = ex.code
                        else:
                            err_type = "unknown"
                        raise RawQueryException(
                            err_type=err_type,
                            message=error,
                            stats=stats,
                            sql=sql,
                            **meta,
                        )
            except RateLimitExceeded as ex:
                stats = update_with_status("rate-limited")
                raise RawQueryException(
                    err_type="rate-limited",
                    message="rate limit exceeded",
                    stats=stats,
                    sql=sql,
                    detail=str(ex),
                )

    stats = update_with_status("success")

    return RawQueryResult(result, {"stats": stats, "sql": sql})
Example #22
def raw_query(
    request: Request,
    query: DictClickhouseQuery,
    reader: Reader[ClickhouseQuery],
    timer: Timer,
    stats: Optional[MutableMapping[str, Any]] = None,
) -> ClickhouseQueryResult:
    """
    Submit a raw SQL query to clickhouse and do some post-processing on it to
    fix some of the formatting issues in the result JSON
    """

    stats = stats or {}
    use_cache, use_deduper, uc_max = state.get_configs(
        [("use_cache", 0), ("use_deduper", 1), ("uncompressed_cache_max_cols", 5)]
    )

    all_confs = state.get_all_configs()
    query_settings = {
        k.split("/", 1)[1]: v
        for k, v in all_confs.items()
        if k.startswith("query_settings/")
    }

    # Experiment: if we are going to grab more than X columns worth of data,
    # don't use uncompressed_cache in clickhouse, or result cache in snuba.
    if len(request.query.get_all_referenced_columns()) > uc_max:
        query_settings["use_uncompressed_cache"] = 0
        use_cache = 0

    timer.mark("get_configs")

    sql = query.format_sql()
    query_id = md5(force_bytes(sql)).hexdigest()
    with state.deduper(query_id if use_deduper else None) as is_dupe:
        timer.mark("dedupe_wait")

        result = cache.get(query_id) if use_cache else None
        timer.mark("cache_get")

        stats.update(
            {
                "is_duplicate": is_dupe,
                "query_id": query_id,
                "use_cache": bool(use_cache),
                "cache_hit": bool(result),
            }
        )

        if not result:
            try:
                with RateLimitAggregator(
                    request.settings.get_rate_limit_params()
                ) as rate_limit_stats_container:
                    stats.update(rate_limit_stats_container.to_dict())
                    timer.mark("rate_limit")

                    project_rate_limit_stats = rate_limit_stats_container.get_stats(
                        PROJECT_RATE_LIMIT_NAME
                    )

                    if (
                        "max_threads" in query_settings
                        and project_rate_limit_stats is not None
                        and project_rate_limit_stats.concurrent > 1
                    ):
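                        # Scale max_threads down by one for each other
                        # concurrent query from this project (concurrent - 1),
                        # but never drop below a single thread.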
                        maxt = query_settings["max_threads"]
                        query_settings["max_threads"] = max(
                            1, maxt - project_rate_limit_stats.concurrent + 1
                        )

                    # Force query to use the first shard replica, which
                    # should have synchronously received any cluster writes
                    # before this query is run.
                    consistent = request.settings.get_consistent()
                    stats["consistent"] = consistent
                    if consistent:
                        query_settings["load_balancing"] = "in_order"
                        query_settings["max_threads"] = 1

                    try:
                        result = reader.execute(
                            query,
                            query_settings,
                            # All queries should already be deduplicated at this point
                            # But the query_id will let us know if they aren't
                            query_id=query_id if use_deduper else None,
                            with_totals=request.query.has_totals(),
                        )

                        timer.mark("execute")
                        stats.update(
                            {
                                "result_rows": len(result["data"]),
                                "result_cols": len(result["meta"]),
                            }
                        )

                        if use_cache:
                            cache.set(query_id, result)
                            timer.mark("cache_set")

                    except BaseException as ex:
                        error = str(ex)
                        logger.exception("Error running query: %s\n%s", sql, error)
                        stats = log_query_and_update_stats(
                            request, sql, timer, stats, "error", query_settings
                        )
                        meta = {}
                        if isinstance(ex, ClickHouseError):
                            err_type = "clickhouse"
                            meta["code"] = ex.code
                        else:
                            err_type = "unknown"
                        raise RawQueryException(
                            err_type=err_type,
                            message=error,
                            stats=stats,
                            sql=sql,
                            **meta,
                        )
            except RateLimitExceeded as ex:
                stats = log_query_and_update_stats(
                    request, sql, timer, stats, "rate-limited", query_settings
                )
                raise RawQueryException(
                    err_type="rate-limited",
                    message="rate limit exceeded",
                    stats=stats,
                    sql=sql,
                    detail=str(ex),
                )

    stats = log_query_and_update_stats(
        request, sql, timer, stats, "success", query_settings
    )

    if settings.STATS_IN_RESPONSE or request.settings.get_debug():
        result["stats"] = stats
        result["sql"] = sql

    return result
Example #23
def parse_and_run_query(
    dataset: Dataset, request: Request, timer: Timer
) -> ClickhouseQueryResult:
    from_date, to_date = TimeSeriesExtensionProcessor.get_time_limit(
        request.extensions["timeseries"]
    )

    if (
        request.query.get_sample() is not None and request.query.get_sample() != 1.0
    ) and not request.settings.get_turbo():
        metrics.increment("sample_without_turbo", tags={"referrer": request.referrer})

    extensions = dataset.get_extensions()
    for name, extension in extensions.items():
        extension.get_processor().process_query(
            request.query, request.extensions[name], request.settings
        )

    request.query.add_conditions(dataset.default_conditions())

    if request.settings.get_turbo():
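        # Turbo mode favours speed over exactness, so skip the expensive
        # FINAL deduplication.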
        request.query.set_final(False)

    for processor in dataset.get_query_processors():
        processor.process_query(request.query, request.settings)

    relational_source = request.query.get_data_source()
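    # Apply any conditions the storage marks as mandatory for this data source.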
    request.query.add_conditions(relational_source.get_mandatory_conditions())

    source = relational_source.format_from()
    with sentry_sdk.start_span(description="create_query", op="db"):
        # TODO: consider moving the performance logic and the pre_where generation into
        # ClickhouseQuery since they are Clickhouse specific
        query = DictClickhouseQuery(dataset, request.query, request.settings)
    timer.mark("prepare_query")

    num_days = (to_date - from_date).days
    stats = {
        "clickhouse_table": source,
        "final": request.query.get_final(),
        "referrer": request.referrer,
        "num_days": num_days,
        "sample": request.query.get_sample(),
    }

    with sentry_sdk.configure_scope() as scope:
        if scope.span:
            scope.span.set_tag("dataset", type(dataset).__name__)
            scope.span.set_tag("referrer", http_request.referrer)
            scope.span.set_tag("timeframe_days", num_days)

    with sentry_sdk.start_span(description=query.format_sql(), op="db") as span:
        span.set_tag("dataset", type(dataset).__name__)
        span.set_tag("table", source)
        try:
            span.set_tag(
                "ast_query",
                AstClickhouseQuery(request.query, request.settings).format_sql(),
            )
        except Exception:
            logger.exception("Failed to format ast query")
        result = raw_query(
            request, query, NativeDriverReader(clickhouse_ro), timer, stats
        )

    with sentry_sdk.configure_scope() as scope:
        if scope.span:
            if "max_threads" in stats:
                scope.span.set_tag("max_threads", stats["max_threads"])

    return result
Example #24
def build_request(
    body: MutableMapping[str, Any],
    parser: Parser,
    settings_class: Union[Type[HTTPQuerySettings], Type[SubscriptionQuerySettings]],
    schema: RequestSchema,
    dataset: Dataset,
    timer: Timer,
    referrer: str,
    custom_processing: Optional[CustomProcessors] = None,
) -> Request:
    with sentry_sdk.start_span(description="build_request", op="validate") as span:
        try:
            request_parts = schema.validate(body)
            if settings_class == HTTPQuerySettings:
                query_settings: MutableMapping[str, bool | str] = {
                    **request_parts.query_settings,
                    "consistent": _consistent_override(
                        request_parts.query_settings.get("consistent", False), referrer
                    ),
                }
                query_settings["referrer"] = referrer
                # TODO: referrer probably doesn't need to be passed in; it should come from the body
                settings_obj: Union[
                    HTTPQuerySettings, SubscriptionQuerySettings
                ] = settings_class(
                    **query_settings,
                )
            elif settings_class == SubscriptionQuerySettings:
                settings_obj = settings_class(
                    consistent=_consistent_override(True, referrer),
                )
            query, snql_anonymized = parser(
                request_parts, settings_obj, dataset, custom_processing
            )

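            # Tag the Sentry scope with the project and org ids when the query
            # targets exactly one of each.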
            project_ids = get_object_ids_in_query_ast(query, "project_id")
            if project_ids is not None and len(project_ids) == 1:
                sentry_sdk.set_tag("snuba_project_id", project_ids.pop())

            org_ids = get_object_ids_in_query_ast(query, "org_id")
            if org_ids is not None and len(org_ids) == 1:
                sentry_sdk.set_tag("snuba_org_id", org_ids.pop())
            attribution_info = dict(request_parts.attribution_info)
            # TODO: clean this up
            attribution_info["app_id"] = get_app_id(
                request_parts.attribution_info["app_id"]
            )
            attribution_info["referrer"] = referrer

            request_id = uuid.uuid4().hex
            request = Request(
                id=request_id,
                # TODO: Replace this with the actual raw query body.
                # This can have an impact on subscriptions, so we need
                # to be careful with the change.
                original_body=body,
                query=query,
                attribution_info=AttributionInfo(**attribution_info),
                query_settings=settings_obj,
                snql_anonymized=snql_anonymized,
            )
        except (InvalidJsonRequestException, InvalidQueryException) as exception:
            record_invalid_request(timer, referrer)
            raise exception
        except Exception as exception:
            record_error_building_request(timer, referrer)
            raise exception

        span.set_data(
            "snuba_query_parsed",
            repr(query).split("\n"),
        )
        span.set_data(
            "snuba_query_raw",
            textwrap.wrap(repr(request.original_body), 100, break_long_words=False),
        )
        sentry_sdk.add_breadcrumb(
            category="query_info",
            level="info",
            message="snuba_query_raw",
            data={
                "query": textwrap.wrap(
                    repr(request.original_body), 100, break_long_words=False
                )
            },
        )

        timer.mark("validate_schema")
        return request