Example #1
def _run_query_pipeline(
    dataset: Dataset,
    request: Request,
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
) -> RawQueryResult:
    """
    Runs the query processing and execution pipeline for a Snuba Query: it takes a Dataset
    and a Request and returns the results of the query.

    This process includes:
    - Applying dataset-specific syntax extensions (QueryExtension)
    - Applying dataset query processors to the abstract Snuba query.
    - Using the dataset-provided StorageQueryPlanBuilder to build a StorageQueryPlan. This step
      transforms the Snuba Query into the Storage Query (which is contextual to the storage(s)).
      From this point on, nothing should depend on the dataset.
    - Executing the storage-specific query processors.
    - Providing the newly built Query and a QueryRunner to the QueryExecutionStrategy to actually
      run the DB Query.
    """

    # TODO: this will not work with datasets that are not time series. Remove it.
    from_date, to_date = TimeSeriesExtensionProcessor.get_time_limit(
        request.extensions["timeseries"]
    )

    if (
        request.query.get_sample() is not None and request.query.get_sample() != 1.0
    ) and not request.settings.get_turbo():
        metrics.increment("sample_without_turbo", tags={"referrer": request.referrer})

    extensions = dataset.get_extensions()
    for name, extension in extensions.items():
        extension.get_processor().process_query(
            request.query, request.extensions[name], request.settings
        )

    # TODO: Fit this in a query processor. All query transformations should be driven by
    # datasets/storages and never hardcoded.
    if request.settings.get_turbo():
        request.query.set_final(False)

    for processor in dataset.get_query_processors():
        processor.process_query(request.query, request.settings)

    storage_query_plan = dataset.get_query_plan_builder().build_plan(request)

    # TODO: This should be a storage-specific query processor.
    relational_source = request.query.get_data_source()
    request.query.add_conditions(relational_source.get_mandatory_conditions())

    for processor in storage_query_plan.query_processors:
        processor.process_query(request.query, request.settings)

    query_runner = partial(
        _format_storage_query_and_run,
        dataset,
        timer,
        query_metadata,
        from_date,
        to_date,
    )

    return storage_query_plan.execution_strategy.execute(request, query_runner)
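
The hardcoded turbo check above is exactly the kind of transformation the TODO asks to move behind the QueryProcessor hook the pipeline already drives. A minimal sketch of such a processor, assuming only the process_query(query, settings) interface visible in the example; the class name is hypothetical:

class TurboFinalProcessor:
    # Hypothetical processor: moves the inline turbo/final logic behind the
    # process_query hook that dataset.get_query_processors() already drives.
    def process_query(self, query, request_settings) -> None:
        # Turbo queries must not run with FINAL, mirroring the inline check above.
        if request_settings.get_turbo():
            query.set_final(False)

Registered on the dataset, such a processor would be picked up by the dataset.get_query_processors() loop with no change to _run_query_pipeline itself.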
Example #2
def parse_and_run_query(
    dataset: Dataset, request: Request, timer: Timer
) -> ClickhouseQueryResult:
    from_date, to_date = TimeSeriesExtensionProcessor.get_time_limit(
        request.extensions["timeseries"]
    )

    if (
        request.query.get_sample() is not None and request.query.get_sample() != 1.0
    ) and not request.settings.get_turbo():
        metrics.increment("sample_without_turbo", tags={"referrer": request.referrer})

    extensions = dataset.get_extensions()
    for name, extension in extensions.items():
        extension.get_processor().process_query(
            request.query, request.extensions[name], request.settings
        )

    request.query.add_conditions(dataset.default_conditions())

    if request.settings.get_turbo():
        request.query.set_final(False)

    for processor in dataset.get_query_processors():
        processor.process_query(request.query, request.settings)

    relational_source = request.query.get_data_source()
    request.query.add_conditions(relational_source.get_mandatory_conditions())

    source = relational_source.format_from()
    with sentry_sdk.start_span(description="create_query", op="db"):
        # TODO: consider moving the performance logic and the pre_where generation into
        # ClickhouseQuery since they are Clickhouse specific
        query = DictClickhouseQuery(dataset, request.query, request.settings)
    timer.mark("prepare_query")

    num_days = (to_date - from_date).days
    stats = {
        "clickhouse_table": source,
        "final": request.query.get_final(),
        "referrer": request.referrer,
        "num_days": num_days,
        "sample": request.query.get_sample(),
    }

    with sentry_sdk.configure_scope() as scope:
        if scope.span:
            scope.span.set_tag("dataset", type(dataset).__name__)
            scope.span.set_tag("referrer", http_request.referrer)
            scope.span.set_tag("timeframe_days", num_days)

    with sentry_sdk.start_span(description=query.format_sql(), op="db") as span:
        span.set_tag("dataset", type(dataset).__name__)
        span.set_tag("table", source)
        try:
            span.set_tag(
                "ast_query",
                AstClickhouseQuery(request.query, request.settings).format_sql(),
            )
        except Exception:
            logger.exception("Failed to format ast query")
        result = raw_query(
            request, query, NativeDriverReader(clickhouse_ro), timer, stats
        )

    with sentry_sdk.configure_scope() as scope:
        if scope.span and "max_threads" in stats:
            scope.span.set_tag("max_threads", stats["max_threads"])

    return result
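
The tracing in this example is plain sentry_sdk API: each stage runs inside a span, tags are attached to the span, and extra tags go on the enclosing transaction through the scope. A stripped-down sketch of that pattern, with illustrative tag values:

import sentry_sdk

# Wrap a stage in a span and tag it, as parse_and_run_query does around
# query creation and execution.
with sentry_sdk.start_span(description="create_query", op="db") as span:
    span.set_tag("dataset", "events")  # illustrative value

# Attach tags to the enclosing transaction, if one is active.
with sentry_sdk.configure_scope() as scope:
    if scope.span:
        scope.span.set_tag("timeframe_days", 30)  # illustrative value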
Example #3
def _run_query_pipeline(
    dataset: Dataset,
    request: Request,
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
) -> QueryResult:
    """
    Runs the query processing and execution pipeline for a Snuba Query: it takes a Dataset
    and a Request and returns the results of the query.

    This process includes:
    - Applying dataset-specific syntax extensions (QueryExtension)
    - Applying dataset query processors to the abstract Snuba query.
    - Using the dataset-provided ClickhouseQueryPlanBuilder to build a ClickhouseQueryPlan. This step
      transforms the Snuba Query into the Storage Query (which is contextual to the storage(s)).
      From this point on, nothing should depend on the dataset.
    - Executing the plan-specific query processors.
    - Providing the newly built Query, the processors to be run for each DB query, and a
      QueryRunner to the QueryExecutionStrategy to actually run the DB Query.
    """

    # TODO: this will not work with datasets that are not time series. Remove it.
    from_date, to_date = TimeSeriesExtensionProcessor.get_time_limit(
        request.extensions["timeseries"]
    )

    if (
        request.query.get_sample() is not None and request.query.get_sample() != 1.0
    ) and not request.settings.get_turbo():
        metrics.increment("sample_without_turbo", tags={"referrer": request.referrer})

    extensions = dataset.get_extensions()
    for name, extension in extensions.items():
        with sentry_sdk.start_span(
            description=type(extension.get_processor()).__name__, op="extension"
        ):
            extension.get_processor().process_query(
                request.query, request.extensions[name], request.settings
            )

    # TODO: Fit this in a query processor. All query transformations should be driven by
    # datasets/storages and never hardcoded.
    if request.settings.get_turbo():
        request.query.set_final(False)

    for processor in dataset.get_query_processors():
        with sentry_sdk.start_span(
            description=type(processor).__name__, op="processor"
        ):
            processor.process_query(request.query, request.settings)

    query_plan = dataset.get_query_plan_builder().build_plan(request)
    # From this point on, the logical query should not be used anymore; the
    # Clickhouse Query is the one that runs through the rest of the query pipeline.

    # TODO: Break the Query Plan execution out of this method. With the division
    # between plan-specific processors and DB-query-specific processors, and with
    # the soon-to-come ClickhouseCluster, there is more coupling between the
    # components of the query plan.

    for clickhouse_processor in query_plan.plan_processors:
        with sentry_sdk.start_span(
            description=type(clickhouse_processor).__name__, op="processor"
        ):
            clickhouse_processor.process_query(query_plan.query, request.settings)

    query_runner = partial(
        _format_storage_query_and_run,
        timer,
        query_metadata,
        from_date,
        to_date,
        request.referrer,
    )

    return query_plan.execution_strategy.execute(
        query_plan.query, request.settings, query_runner
    )
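
The hand-off at the end is a functools.partial: the pipeline freezes the arguments it owns (timer, metadata, date range, referrer) and the execution strategy supplies the per-query ones when it invokes the runner. A sketch of that pattern; the runner signature is hypothetical, since _format_storage_query_and_run is not shown:

from functools import partial

def run_db_query(timer, query_metadata, query, settings):
    # Hypothetical stand-in for _format_storage_query_and_run.
    ...

timer, query_metadata = object(), object()  # stand-ins for the real values
query_runner = partial(run_db_query, timer, query_metadata)

# The execution strategy later completes the call with the per-query
# arguments: query_runner(query_plan.query, request.settings)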