Example #1
def test_project_extension_query_adds_rate_limits():
    extension = ProjectExtension(processor=ProjectExtensionProcessor(
        project_column="project_id"))
    raw_data = {'project': [2, 3]}
    valid_data = validate_jsonschema(raw_data, extension.get_schema())
    query = Query(
        {"conditions": []},
        TableSource("my_table", ColumnSet([])),
    )
    request_settings = RequestSettings(turbo=False,
                                       consistent=False,
                                       debug=False)

    num_rate_limits_before_processing = len(
        request_settings.get_rate_limit_params())
    extension.get_processor().process_query(query, valid_data,
                                            request_settings)

    rate_limits = request_settings.get_rate_limit_params()
    # make sure a rate limit was added by the processing
    assert len(rate_limits) == num_rate_limits_before_processing + 1

    most_recent_rate_limit = rate_limits[-1]
    assert most_recent_rate_limit.bucket == '2'
    assert most_recent_rate_limit.per_second_limit == 1000
    assert most_recent_rate_limit.concurrent_limit == 1000
Example #2
    def process_query(self, query: Query,
                      request_settings: RequestSettings) -> None:
        # If the settings don't already have a project rate limit, add one
        existing = request_settings.get_rate_limit_params()
        for ex in existing:
            if ex.rate_limit_name == PROJECT_RATE_LIMIT_NAME:
                return

        project_ids = get_project_ids_in_query_ast(query, self.project_column)
        if not project_ids:
            return

        # TODO: Use all the projects, not just one
        project_id = project_ids.pop()

        prl, pcl = get_configs([("project_per_second_limit", 1000),
                                ("project_concurrent_limit", 1000)])

        # Specific projects can have their rate limits overridden
        (per_second, concurr) = get_configs([
            ("project_per_second_limit_{}".format(project_id), prl),
            ("project_concurrent_limit_{}".format(project_id), pcl),
        ])

        rate_limit = RateLimitParameters(
            rate_limit_name=PROJECT_RATE_LIMIT_NAME,
            bucket=str(project_id),
            per_second_limit=per_second,
            concurrent_limit=concurr,
        )

        request_settings.add_rate_limit(rate_limit)
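
The two get_configs() calls above implement a default-plus-override lookup: the global project limits act as the defaults for the per-project keys, so a project only gets special limits when a project-specific config entry exists (Example #17 below tests this behavior). Here is a minimal sketch of the same pattern using a plain dict in place of Snuba's runtime config store; the dict and its values are illustrative only.

# Illustrative stand-in for the runtime config store; only project 2 has an override.
config = {"project_per_second_limit_2": 5}

project_id = 2
prl = config.get("project_per_second_limit", 1000)   # global default
pcl = config.get("project_concurrent_limit", 1000)   # global default

# Per-project keys fall back to the global values when they are not set.
per_second = config.get("project_per_second_limit_{}".format(project_id), prl)
concurr = config.get("project_concurrent_limit_{}".format(project_id), pcl)

print(per_second, concurr)  # 5 1000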
Example #3
    def process_query(
            self,
            query: Query,
            extension_data: ExtensionData,
            request_settings: RequestSettings,
    ) -> None:
        project_ids = util.to_list(extension_data['project'])

        if project_ids:
            query.add_conditions([('project_id', 'IN', project_ids)])

        request_settings.add_rate_limit(self._get_rate_limit_params(project_ids))

        self.do_post_processing(project_ids, query, request_settings)
Example #4
    def test_without_turbo_without_projects_needing_final(self):
        request_settings = RequestSettings(turbo=False, consistent=False, debug=False)

        self.extension.get_processor().process_query(self.query, self.valid_data, request_settings)

        assert self.query.get_conditions() == [('project_id', 'IN', [2])]
        assert not self.query.get_final()
Example #5
File: test_split.py Project: Appva/snuba
def test_no_split(dataset_name: str):
    events = get_dataset(dataset_name)
    query = Query(
        {
            "selected_columns": ["event_id"],
            "conditions": [""],
            "orderby": "event_id",
            "sample": 10,
            "limit": 100,
            "offset": 50,
        },
        events.get_dataset_schemas().get_read_schema().get_data_source()
    )

    @split_query
    def do_query(dataset: Dataset, request: Request, timer: Timer):
        assert request.query == query

    request = Request(
        query,
        RequestSettings(False, False, False),
        {},
    )

    do_query(events, request, None)
Example #6
    def do_post_processing(
        self,
        project_ids: Sequence[int],
        query: Query,
        request_settings: RequestSettings,
    ) -> None:
        if not request_settings.get_turbo():
            final, exclude_group_ids = get_projects_query_flags(
                project_ids, self.__replacer_state_name)
            if not final and exclude_group_ids:
                # If the number of groups to exclude exceeds our limit, the query
                # should just use final instead of the exclusion set.
                max_group_ids_exclude = get_config(
                    "max_group_ids_exclude",
                    settings.REPLACER_MAX_GROUP_IDS_TO_EXCLUDE)
                if len(exclude_group_ids) > max_group_ids_exclude:
                    query.set_final(True)
                else:
                    query.add_conditions([(["assumeNotNull", ["group_id"]],
                                           "NOT IN", exclude_group_ids)])
                    query.add_condition_to_ast(
                        not_in_condition(
                            None,
                            FunctionCall(None, "assumeNotNull",
                                         (Column(None, "group_id", None), )),
                            [Literal(None, p) for p in exclude_group_ids],
                        ))
            else:
                query.set_final(final)
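
The branch above trades a potentially huge NOT IN list for ClickHouse's FINAL modifier once the number of replaced groups crosses max_group_ids_exclude (Example #14 below exercises exactly this case). A minimal sketch of the decision with assumed values:

# Assumed values: three groups were excluded by replacements, limit set to 2.
exclude_group_ids = [100, 101, 102]
max_group_ids_exclude = 2

if len(exclude_group_ids) > max_group_ids_exclude:
    decision = "set FINAL on the query"
else:
    decision = "add (assumeNotNull(group_id) NOT IN ...) condition"

print(decision)  # set FINAL on the query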
Example #7
File: schema.py Project: denisgolius/snuba
    def validate(self, value) -> Request:
        value = validate_jsonschema(value, self.__composite_schema)

        query_body = {
            key: value.pop(key)
            for key in self.__query_schema['properties'].keys() if key in value
        }
        settings = {
            key: value.pop(key)
            for key in self.__settings_schema['properties'].keys()
            if key in value
        }

        extensions = {}
        for extension_name, extension_schema in self.__extension_schemas.items():
            extensions[extension_name] = {
                key: value.pop(key)
                for key in extension_schema['properties'].keys()
                if key in value
            }

        return Request(
            Query(query_body),
            RequestSettings(settings['turbo'], settings['consistent'],
                            settings['debug']), extensions)
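
validate() above partitions one flat request body into the query, settings, and per-extension payloads by popping each key into the dict whose schema declares it. A minimal sketch of that partitioning, with plain lists standing in for the jsonschema 'properties' keys (the property names are illustrative):

value = {"conditions": [], "turbo": False, "consistent": False,
         "debug": False, "project": [2]}
query_properties = ["conditions", "granularity"]
settings_properties = ["turbo", "consistent", "debug"]
extension_properties = {"project": ["project"]}

query_body = {key: value.pop(key) for key in query_properties if key in value}
settings = {key: value.pop(key) for key in settings_properties if key in value}
extensions = {
    name: {key: value.pop(key) for key in props if key in value}
    for name, props in extension_properties.items()
}

print(query_body)  # {'conditions': []}
print(settings)    # {'turbo': False, 'consistent': False, 'debug': False}
print(extensions)  # {'project': {'project': [2]}}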
Example #8
def test_query_extension_processing(
    raw_data: dict,
    expected_conditions: Sequence[Condition],
    expected_granularity: int,
):
    state.set_config('max_days', 1)
    extension = TimeSeriesExtension(
        default_granularity=60,
        default_window=datetime.timedelta(days=5),
        timestamp_column='timestamp',
    )
    valid_data = validate_jsonschema(raw_data, extension.get_schema())
    query = Query(
        {"conditions": []},
        TableSource("my_table", ColumnSet([])),
    )

    request_settings = RequestSettings(turbo=False,
                                       consistent=False,
                                       debug=False)

    extension.get_processor().process_query(query, valid_data,
                                            request_settings)
    assert query.get_conditions() == expected_conditions
    assert query.get_granularity() == expected_granularity
Example #9
def test_join_optimizer_two_tables(
    selected_cols: Sequence[Any],
    conditions: Sequence[Condition],
    groupby: Groupby,
    expected: str,
) -> None:
    query = Query(
        {
            "selected_columns": selected_cols,
            "conditions": conditions,
            "arrayjoin": None,
            "having": [],
            "groupby": groupby,
            "aggregations": [],
            "orderby": None,
            "limitby": None,
            "sample": 10,
            "limit": 100,
            "offset": 50,
            "totals": True,
            "granularity": 60,
        },
        simple_join_structure,
    )
    request_settings = RequestSettings(turbo=False,
                                       consistent=False,
                                       debug=False)

    optimizer = SimpleJoinOptimizer()
    optimizer.process_query(query, request_settings)

    assert query.get_data_source().format_from() == expected
Example #10
    def process_query(self, query: Query,
                      request_settings: RequestSettings) -> None:
        table_name = query.get_from_clause().table_name
        (per_second, concurr) = get_configs([
            (f"table_per_second_limit_{table_name}", 1000),
            (f"table_concurrent_limit_{table_name}", 1000),
        ])

        rate_limit = RateLimitParameters(
            rate_limit_name=TABLE_RATE_LIMIT_NAME,
            bucket=table_name,
            per_second_limit=per_second,
            concurrent_limit=concurr,
        )

        request_settings.add_rate_limit(rate_limit)
Example #11
def execute_query_with_rate_limits(
    clickhouse_query: Union[Query, CompositeQuery[Table]],
    request_settings: RequestSettings,
    formatted_query: FormattedQuery,
    reader: Reader,
    timer: Timer,
    stats: MutableMapping[str, Any],
    query_settings: MutableMapping[str, Any],
) -> Result:
    # XXX: We should consider moving this so that it applies to the logical query,
    # not the physical query.
    with RateLimitAggregator(request_settings.get_rate_limit_params()
                             ) as rate_limit_stats_container:
        stats.update(rate_limit_stats_container.to_dict())
        timer.mark("rate_limit")

        project_rate_limit_stats = rate_limit_stats_container.get_stats(
            PROJECT_RATE_LIMIT_NAME)

        if ("max_threads" in query_settings
                and project_rate_limit_stats is not None
                and project_rate_limit_stats.concurrent > 1):
            maxt = query_settings["max_threads"]
            query_settings["max_threads"] = max(
                1, maxt - project_rate_limit_stats.concurrent + 1)

        return execute_query(
            clickhouse_query,
            request_settings,
            formatted_query,
            reader,
            timer,
            stats,
            query_settings,
        )
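
The max_threads adjustment above hands back threads when several queries for the same project are already running, so a single project cannot monopolize the worker pool. A short worked example with assumed numbers (not taken from the source):

# Assumed: ClickHouse settings allow 8 threads; 3 queries for this project are
# already in flight, so this query keeps max(1, 8 - 3 + 1) = 6 threads.
query_settings = {"max_threads": 8}
concurrent = 3

if "max_threads" in query_settings and concurrent > 1:
    maxt = query_settings["max_threads"]
    query_settings["max_threads"] = max(1, maxt - concurrent + 1)

print(query_settings["max_threads"])  # 6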
Example #12
    def process_query(
        self,
        query: Query,
        extension_data: ExtensionData,
        request_settings: RequestSettings,
    ) -> None:
        project_ids = util.to_list(extension_data["project"])

        if project_ids:
            query.add_condition_to_ast(
                in_condition(
                    Column(None, None, self.__project_column),
                    [Literal(None, p) for p in project_ids],
                ))

        request_settings.add_rate_limit(
            self._get_rate_limit_params(project_ids))
Example #13
    def select_storage(self, query: Query,
                       request_settings: RequestSettings) -> StorageAndMappers:
        use_readonly_storage = (state.get_config(
            "enable_events_readonly_table", False)
                                and not request_settings.get_consistent())

        storage = (self.__events_ro_table
                   if use_readonly_storage else self.__events_table)
        return StorageAndMappers(storage, event_translator)
Example #14
    def test_when_there_are_too_many_groups_to_exclude(self):
        request_settings = RequestSettings(turbo=False, consistent=False, debug=False)
        state.set_config('max_group_ids_exclude', 2)
        replacer.set_project_exclude_groups(2, [100, 101, 102])

        self.extension.get_processor().process_query(self.query, self.valid_data, request_settings)

        assert self.query.get_conditions() == [('project_id', 'IN', [2])]
        assert self.query.get_final()
Example #15
def test_project_extension_query_processing(raw_data: dict, expected_conditions: Sequence[Condition]):
    extension = ProjectExtension(
        processor=ProjectExtensionProcessor()
    )
    valid_data = validate_jsonschema(raw_data, extension.get_schema())
    query = Query({
        "conditions": []
    })
    request_settings = RequestSettings(turbo=False, consistent=False, debug=False)

    extension.get_processor().process_query(query, valid_data, request_settings)

    assert query.get_conditions() == expected_conditions
Example #16
def test_organization_extension_query_processing_happy_path():
    extension = OrganizationExtension()
    raw_data = {"organization": 2}

    valid_data = validate_jsonschema(raw_data, extension.get_schema())
    query = Query({"conditions": []})
    request_settings = RequestSettings(turbo=False,
                                       consistent=False,
                                       debug=False)

    extension.get_processor().process_query(query, valid_data,
                                            request_settings)

    assert query.get_conditions() == [("org_id", "=", 2)]
Example #17
def test_project_extension_project_rate_limits_are_overridden():
    extension = ProjectExtension(
        processor=ProjectExtensionProcessor()
    )
    raw_data = {
        'project': [2, 3]
    }
    valid_data = validate_jsonschema(raw_data, extension.get_schema())
    query = Query({
        'conditions': []
    })
    request_settings = RequestSettings(turbo=False, consistent=False, debug=False)
    state.set_config('project_per_second_limit_2', 5)
    state.set_config('project_concurrent_limit_2', 10)

    extension.get_processor().process_query(query, valid_data, request_settings)

    rate_limits = request_settings.get_rate_limit_params()
    most_recent_rate_limit = rate_limits[-1]

    assert most_recent_rate_limit.bucket == '2'
    assert most_recent_rate_limit.per_second_limit == 5
    assert most_recent_rate_limit.concurrent_limit == 10
Example #18
def execute_query(
    # TODO: Passing the whole clickhouse query here is needed as long
    # as the execute method depends on it. Otherwise we can make this
    # file rely either entirely on clickhouse query or entirely on
    # the formatter.
    clickhouse_query: Query,
    request_settings: RequestSettings,
    formatted_query: SqlQuery,
    reader: Reader[SqlQuery],
    timer: Timer,
    stats: MutableMapping[str, Any],
    query_settings: MutableMapping[str, Any],
) -> Result:
    """
    Execute a query and return a result.
    """
    # Experiment: if we are going to grab more than X columns worth of data,
    # don't use uncompressed_cache in ClickHouse.
    uc_max = state.get_config("uncompressed_cache_max_cols", 5)
    if (len(
            set((
                # Skip aliases when counting columns
                (c.table_name, c.column_name)
                for c in clickhouse_query.get_all_ast_referenced_columns()))) >
            uc_max):
        query_settings["use_uncompressed_cache"] = 0

    # Force query to use the first shard replica, which
    # should have synchronously received any cluster writes
    # before this query is run.
    consistent = request_settings.get_consistent()
    stats["consistent"] = consistent
    if consistent:
        query_settings["load_balancing"] = "in_order"
        query_settings["max_threads"] = 1

    result = reader.execute(
        formatted_query,
        query_settings,
        with_totals=clickhouse_query.has_totals(),
    )

    timer.mark("execute")
    stats.update({
        "result_rows": len(result["data"]),
        "result_cols": len(result["meta"])
    })

    return result
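
The uncompressed-cache check above counts distinct (table_name, column_name) pairs so that several aliases of one column count only once, and disables the cache when that count exceeds uncompressed_cache_max_cols. A minimal stand-in with toy tuples instead of Snuba's column objects:

# Toy (table_name, column_name, alias) triples; the alias is ignored on purpose.
referenced = [
    ("events", "project_id", None),
    ("events", "project_id", "p"),   # alias of the same column
    ("events", "timestamp", None),
]
uc_max = 5  # default for "uncompressed_cache_max_cols" in the code above

distinct_columns = {(table, column) for table, column, _alias in referenced}
query_settings = {}
if len(distinct_columns) > uc_max:
    query_settings["use_uncompressed_cache"] = 0

print(len(distinct_columns), query_settings)  # 2 {}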
Example #19
def execute_query(
    # TODO: Passing the whole clickhouse query here is needed as long
    # as the execute method depends on it. Otherwise we can make this
    # file rely either entirely on clickhouse query or entirely on
    # the formatter.
    clickhouse_query: Union[Query, CompositeQuery[Table]],
    request_settings: RequestSettings,
    formatted_query: FormattedQuery,
    reader: Reader,
    timer: Timer,
    stats: MutableMapping[str, Any],
    query_settings: MutableMapping[str, Any],
    robust: bool,
) -> Result:
    """
    Execute a query and return a result.
    """
    # Experiment: if we are going to grab more than X columns worth of data,
    # don't use uncompressed_cache in ClickHouse.
    uc_max = state.get_config("uncompressed_cache_max_cols", 5)
    assert isinstance(uc_max, int)
    column_counter = ReferencedColumnsCounter()
    column_counter.visit(clickhouse_query.get_from_clause())
    if column_counter.count_columns() > uc_max:
        query_settings["use_uncompressed_cache"] = 0

    # Force query to use the first shard replica, which
    # should have synchronously received any cluster writes
    # before this query is run.
    consistent = request_settings.get_consistent()
    stats["consistent"] = consistent
    if consistent:
        query_settings["load_balancing"] = "in_order"
        query_settings["max_threads"] = 1

    result = reader.execute(
        formatted_query,
        query_settings,
        with_totals=clickhouse_query.has_totals(),
        robust=robust,
    )

    timer.mark("execute")
    stats.update({
        "result_rows": len(result["data"]),
        "result_cols": len(result["meta"])
    })

    return result
Example #20
def _apply_turbo_sampling_if_needed(
    clickhouse_query: Union[Query, CompositeQuery[Table]],
    request_settings: RequestSettings,
) -> None:
    """
    TODO: Remove this method entirely and move the sampling logic
    into a query processor.
    """
    if isinstance(clickhouse_query, Query):
        if (request_settings.get_turbo()
                and not clickhouse_query.get_from_clause().sampling_rate):
            clickhouse_query.set_from_clause(
                replace(
                    clickhouse_query.get_from_clause(),
                    sampling_rate=snuba_settings.TURBO_SAMPLE_RATE,
                ))
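
Both this helper and format_query in Example #25 use dataclasses.replace() to swap a sampling rate into an otherwise unchanged, immutable from-clause. A minimal sketch of that pattern with a toy frozen dataclass (not Snuba's Table class, and the 0.1 rate is only an assumed TURBO_SAMPLE_RATE):

from dataclasses import dataclass, replace
from typing import Optional

@dataclass(frozen=True)
class ToyTable:
    table_name: str
    sampling_rate: Optional[float] = None

source = ToyTable("sentry_events")
turbo = True

if turbo and not source.sampling_rate:
    # replace() builds a new instance; the original stays untouched.
    source = replace(source, sampling_rate=0.1)

print(source)  # ToyTable(table_name='sentry_events', sampling_rate=0.1)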
Example #21
    def process_query(self, query: Query,
                      request_settings: RequestSettings) -> None:
        if request_settings.get_turbo():
            return

        project_ids = get_project_ids_in_query_ast(query,
                                                   self.__project_column)

        set_final = False
        condition_to_add = None
        if project_ids:
            final, exclude_group_ids = get_projects_query_flags(
                list(project_ids),
                self.__replacer_state_name,
            )
            if final:
                metrics.increment("final", tags={"cause": "final_flag"})
            if not final and exclude_group_ids:
                # If the number of groups to exclude exceeds our limit, the query
                # should just use final instead of the exclusion set.
                max_group_ids_exclude = get_config(
                    "max_group_ids_exclude",
                    settings.REPLACER_MAX_GROUP_IDS_TO_EXCLUDE)
                if len(exclude_group_ids) > max_group_ids_exclude:
                    metrics.increment("final", tags={"cause": "max_groups"})
                    set_final = True
                else:
                    condition_to_add = (
                        ["assumeNotNull", ["group_id"]],
                        "NOT IN",
                        exclude_group_ids,
                    )
                    query.add_condition_to_ast(
                        not_in_condition(
                            None,
                            FunctionCall(None, "assumeNotNull",
                                         (Column(None, None, "group_id"), )),
                            [Literal(None, p) for p in exclude_group_ids],
                        ))
            else:
                set_final = final

        query.set_final(set_final)
        if condition_to_add:
            query.add_conditions([condition_to_add])
Example #22
    def do_post_processing(
            self,
            project_ids: Sequence[int],
            query: Query,
            request_settings: RequestSettings,
    ) -> None:
        if not request_settings.get_turbo():
            final, exclude_group_ids = get_projects_query_flags(project_ids)
            if not final and exclude_group_ids:
                # If the number of groups to exclude exceeds our limit, the query
                # should just use final instead of the exclusion set.
                max_group_ids_exclude = get_config('max_group_ids_exclude', settings.REPLACER_MAX_GROUP_IDS_TO_EXCLUDE)
                if len(exclude_group_ids) > max_group_ids_exclude:
                    query.set_final(True)
                else:
                    query.add_conditions([(['assumeNotNull', ['group_id']], 'NOT IN', exclude_group_ids)])
            else:
                query.set_final(final)
Example #23
File: test_split.py Project: Appva/snuba
def test_col_split(
    dataset_name: str,
    first_query_data: Mapping[str, Any],
    second_query_data: Mapping[str, Any],
):

    @split_query
    def do_query(dataset: Dataset, request: Request, timer: Timer):
        selected_cols = request.query.get_selected_columns()
        if selected_cols == list(first_query_data[0].keys()):
            return QueryResult({"data": first_query_data}, 200)
        elif selected_cols == list(second_query_data[0].keys()):
            return QueryResult({"data": second_query_data}, 200)
        else:
            raise ValueError(f"Unexpected selected columns: {selected_cols}")

    events = get_dataset(dataset_name)
    query = Query(
        {
            "selected_columns": list(second_query_data[0].keys()),
            "conditions": [""],
            "orderby": "events.event_id",
            "sample": 10,
            "limit": 100,
            "offset": 50,
        },
        events.get_dataset_schemas().get_read_schema().get_data_source(),
    )

    request = Request(
        query,
        RequestSettings(False, False, False),
        {
            "project": {"project": 1},
            "timeseries": {
                "from_date": "2019-09-19T10:00:00",
                "to_date": "2019-09-19T12:00:00",
                "granularity": 3600,
            }
        },
    )

    do_query(events, request, None)
Example #24
    def select_storage(self, query: Query,
                       request_settings: RequestSettings) -> StorageAndMappers:
        table = detect_table(
            query,
            self.__abstract_events_columns,
            self.__abstract_transactions_columns,
            True,
        )

        if table == TRANSACTIONS:
            return StorageAndMappers(self.__transactions_table,
                                     self.__transaction_translator)
        else:
            use_readonly_storage = (state.get_config(
                "enable_events_readonly_table", False)
                                    and not request_settings.get_consistent())
            return (StorageAndMappers(self.__events_ro_table,
                                      self.__event_translator)
                    if use_readonly_storage else StorageAndMappers(
                        self.__events_table, self.__event_translator))
Example #25
def format_query(query: FormattableQuery,
                 settings: RequestSettings) -> FormattedQuery:
    """
    Formats a Clickhouse Query from the AST representation into an
    intermediate structure that can either be serialized into a string
    (for clickhouse) or extracted as a sequence (for logging and tracing).

    This is the entry point for any type of query, whether simple or
    composite.

    TODO: Remove this method entirely and move the sampling logic
    into a query processor.
    """

    if isinstance(query, Query):
        if settings.get_turbo() and not query.get_from_clause().sampling_rate:
            query.set_from_clause(
                replace(
                    query.get_from_clause(),
                    sampling_rate=snuba_settings.TURBO_SAMPLE_RATE,
                ))
    return FormattedQuery(_format_query_content(query))
Example #26
    def process_query(self, query: Query,
                      request_settings: RequestSettings) -> None:
        readonly_enabled = state.get_config("enable_events_readonly_table",
                                            False)
        if not readonly_enabled:
            return

        if request_settings.get_consistent():
            return

        data_source = query.get_data_source()

        if data_source.format_from() != self.__table_to_replace:
            return

        new_source = TableSource(
            table_name=self.__read_only_table,
            columns=data_source.get_columns(),
            mandatory_conditions=data_source.get_mandatory_conditions(),
            prewhere_candidates=data_source.get_prewhere_candidates(),
        )
        query.set_data_source(new_source)
Example #27
File: events.py Project: isabella232/snuba
def callback_func(
    storage: str,
    query: Query,
    request_settings: RequestSettings,
    referrer: str,
    results: List[Result[QueryResult]],
) -> None:
    cache_hit = False
    is_duplicate = False

    # Capture whether any of the queries involved was a cache hit or a duplicate,
    # as cache hits may be a cause of inconsistency between results.
    # Doesn't attempt to distinguish between all of the specific scenarios (one or both
    # queries, or splits of those queries could have hit the cache).
    if any([result.result.extra["stats"].get("cache_hit", 0) for result in results]):
        cache_hit = True
    elif any(
        [result.result.extra["stats"].get("is_duplicate", 0) for result in results]
    ):
        is_duplicate = True

    consistent = request_settings.get_consistent()

    if not results:
        metrics.increment(
            "query_result",
            tags={"storage": storage, "match": "empty", "referrer": referrer},
        )
        return

    primary_result = results.pop(0)
    primary_result_data = primary_result.result.result["data"]

    for result in results:
        result_data = result.result.result["data"]

        metrics.timing(
            "diff_ms",
            round((result.execution_time - primary_result.execution_time) * 1000),
            tags={
                "referrer": referrer,
                "cache_hit": str(cache_hit),
                "is_duplicate": str(is_duplicate),
                "consistent": str(consistent),
            },
        )

        # Do not bother diffing the actual results of sampled queries
        if request_settings.get_turbo() or query.get_sample() not in [None, 1.0]:
            return

        if result_data == primary_result_data:
            metrics.increment(
                "query_result",
                tags={
                    "storage": storage,
                    "match": "true",
                    "referrer": referrer,
                    "cache_hit": str(cache_hit),
                    "is_duplicate": str(is_duplicate),
                    "consistent": str(consistent),
                },
            )
        else:
            # Do not log cache hits to Sentry as it creates too much noise
            if cache_hit:
                continue

            reason = assign_reason_category(result_data, primary_result_data, referrer)

            metrics.increment(
                "query_result",
                tags={
                    "storage": storage,
                    "match": "false",
                    "referrer": referrer,
                    "reason": reason,
                    "cache_hit": str(cache_hit),
                    "is_duplicate": str(is_duplicate),
                    "consistent": str(consistent),
                },
            )

            if len(result_data) != len(primary_result_data):
                sentry_sdk.capture_message(
                    f"Non matching {storage} result - different length",
                    level="warning",
                    tags={
                        "referrer": referrer,
                        "storage": storage,
                        "reason": reason,
                        "cache_hit": str(cache_hit),
                        "is_duplicate": str(is_duplicate),
                        "consistent": str(consistent),
                    },
                    extras={
                        "query": format_query(query),
                        "primary_result": len(primary_result_data),
                        "other_result": len(result_data),
                    },
                )

                break

            # Avoid sending too much data to Sentry - just one row for now
            for idx in range(len(result_data)):
                if result_data[idx] != primary_result_data[idx]:
                    sentry_sdk.capture_message(
                        "Non matching result - different result",
                        level="warning",
                        tags={
                            "referrer": referrer,
                            "storage": storage,
                            "reason": reason,
                            "cache_hit": str(cache_hit),
                            "is_duplicate": str(is_duplicate),
                            "consistent": str(consistent),
                        },
                        extras={
                            "query": format_query(query),
                            "primary_result": primary_result_data[idx],
                            "other_result": result_data[idx],
                        },
                    )

                    break
Example #28
File: query.py Project: jiankunking/snuba
    def __init__(
        self,
        dataset: Dataset,
        query: Query,
        settings: RequestSettings,
    ) -> None:
        parsing_context = ParsingContext()

        aggregate_exprs = [
            column_expr(dataset, col, query, parsing_context, alias, agg)
            for (agg, col, alias) in query.get_aggregations()
        ]
        groupby = util.to_list(query.get_groupby())
        group_exprs = [
            column_expr(dataset, gb, query, parsing_context) for gb in groupby
        ]
        column_names = query.get_selected_columns() or []
        selected_cols = [
            column_expr(dataset, util.tuplify(colname), query, parsing_context)
            for colname in column_names
        ]
        select_clause = u"SELECT {}".format(
            ", ".join(group_exprs + aggregate_exprs + selected_cols))

        from_clause = u"FROM {}".format(query.get_data_source().format_from())

        if query.get_final():
            from_clause = u"{} FINAL".format(from_clause)

        if not query.get_data_source().supports_sample():
            sample_rate = None
        else:
            if query.get_sample():
                sample_rate = query.get_sample()
            elif settings.get_turbo():
                sample_rate = snuba_settings.TURBO_SAMPLE_RATE
            else:
                sample_rate = None

        if sample_rate:
            from_clause = u"{} SAMPLE {}".format(from_clause, sample_rate)

        join_clause = ""
        if query.get_arrayjoin():
            join_clause = u"ARRAY JOIN {}".format(query.get_arrayjoin())

        where_clause = ""
        if query.get_conditions():
            where_clause = u"WHERE {}".format(
                conditions_expr(dataset, query.get_conditions(), query,
                                parsing_context))

        prewhere_clause = ""
        if query.get_prewhere():
            prewhere_clause = u"PREWHERE {}".format(
                conditions_expr(dataset, query.get_prewhere(), query,
                                parsing_context))

        group_clause = ""
        if groupby:
            group_clause = "GROUP BY ({})".format(", ".join(
                column_expr(dataset, gb, query, parsing_context)
                for gb in groupby))
            if query.has_totals():
                group_clause = "{} WITH TOTALS".format(group_clause)

        having_clause = ""
        having_conditions = query.get_having()
        if having_conditions:
            assert groupby, "found HAVING clause with no GROUP BY"
            having_clause = u"HAVING {}".format(
                conditions_expr(dataset, having_conditions, query,
                                parsing_context))

        order_clause = ""
        if query.get_orderby():
            orderby = [
                column_expr(dataset, util.tuplify(ob), query, parsing_context)
                for ob in util.to_list(query.get_orderby())
            ]
            orderby = [
                u"{} {}".format(ob.lstrip("-"),
                                "DESC" if ob.startswith("-") else "ASC")
                for ob in orderby
            ]
            order_clause = u"ORDER BY {}".format(", ".join(orderby))

        limitby_clause = ""
        if query.get_limitby() is not None:
            limitby_clause = "LIMIT {} BY {}".format(*query.get_limitby())

        limit_clause = ""
        if query.get_limit() is not None:
            limit_clause = "LIMIT {}, {}".format(query.get_offset(),
                                                 query.get_limit())

        self.__formatted_query = " ".join([
            c for c in [
                select_clause,
                from_clause,
                join_clause,
                prewhere_clause,
                where_clause,
                group_clause,
                having_clause,
                order_clause,
                limitby_clause,
                limit_clause,
            ] if c
        ])
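
The constructor above ends by joining only the non-empty clause strings in a fixed order, which is what keeps optional clauses (ARRAY JOIN, PREWHERE, HAVING, and so on) out of the final SQL. A self-contained sketch of that last step with hand-written clause strings (illustrative, not produced by Snuba):

# Empty strings mark clauses that this particular query does not use.
clauses = [
    "SELECT project_id, count()",          # select_clause
    "FROM sentry_events SAMPLE 0.1",       # from_clause
    "",                                    # join_clause (no ARRAY JOIN)
    "",                                    # prewhere_clause
    "WHERE project_id IN (2, 3)",          # where_clause
    "GROUP BY (project_id) WITH TOTALS",   # group_clause
    "",                                    # having_clause
    "ORDER BY project_id ASC",             # order_clause
    "",                                    # limitby_clause
    "LIMIT 50, 100",                       # limit_clause
]
formatted_query = " ".join(c for c in clauses if c)
print(formatted_query)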
Example #29
File: query.py Project: Appva/snuba
    def __init__(
        self,
        dataset: Dataset,
        query: Query,
        settings: RequestSettings,
        prewhere_conditions: Sequence[str],
    ) -> None:
        parsing_context = ParsingContext()

        aggregate_exprs = [
            column_expr(dataset, col, query, parsing_context, alias, agg)
            for (agg, col, alias) in query.get_aggregations()
        ]
        groupby = util.to_list(query.get_groupby())
        group_exprs = [
            column_expr(dataset, gb, query, parsing_context) for gb in groupby
        ]
        column_names = query.get_selected_columns() or []
        selected_cols = [
            column_expr(dataset, util.tuplify(colname), query, parsing_context)
            for colname in column_names
        ]
        select_clause = u'SELECT {}'.format(
            ', '.join(group_exprs + aggregate_exprs + selected_cols))

        from_clause = u'FROM {}'.format(query.get_data_source().format_from())

        if query.get_final():
            from_clause = u'{} FINAL'.format(from_clause)

        if query.get_sample():
            sample_rate = query.get_sample()
        elif settings.get_turbo():
            sample_rate = snuba_settings.TURBO_SAMPLE_RATE
        else:
            sample_rate = None

        if sample_rate:
            from_clause = u'{} SAMPLE {}'.format(from_clause, sample_rate)

        join_clause = ''
        if query.get_arrayjoin():
            join_clause = u'ARRAY JOIN {}'.format(query.get_arrayjoin())

        where_clause = ''
        if query.get_conditions():
            where_clause = u'WHERE {}'.format(
                conditions_expr(dataset, query.get_conditions(), query,
                                parsing_context))

        prewhere_clause = ''
        if prewhere_conditions:
            prewhere_clause = u'PREWHERE {}'.format(
                conditions_expr(dataset, prewhere_conditions, query,
                                parsing_context))

        group_clause = ''
        if groupby:
            group_clause = 'GROUP BY ({})'.format(', '.join(
                column_expr(dataset, gb, query, parsing_context)
                for gb in groupby))
            if query.has_totals():
                group_clause = '{} WITH TOTALS'.format(group_clause)

        having_clause = ''
        having_conditions = query.get_having()
        if having_conditions:
            assert groupby, 'found HAVING clause with no GROUP BY'
            having_clause = u'HAVING {}'.format(
                conditions_expr(dataset, having_conditions, query,
                                parsing_context))

        order_clause = ''
        if query.get_orderby():
            orderby = [
                column_expr(dataset, util.tuplify(ob), query, parsing_context)
                for ob in util.to_list(query.get_orderby())
            ]
            orderby = [
                u'{} {}'.format(ob.lstrip('-'),
                                'DESC' if ob.startswith('-') else 'ASC')
                for ob in orderby
            ]
            order_clause = u'ORDER BY {}'.format(', '.join(orderby))

        limitby_clause = ''
        if query.get_limitby() is not None:
            limitby_clause = 'LIMIT {} BY {}'.format(*query.get_limitby())

        limit_clause = ''
        if query.get_limit() is not None:
            limit_clause = 'LIMIT {}, {}'.format(query.get_offset(),
                                                 query.get_limit())

        self.__formatted_query = ' '.join([
            c for c in [
                select_clause, from_clause, join_clause, prewhere_clause,
                where_clause, group_clause, having_clause, order_clause,
                limitby_clause, limit_clause
            ] if c
        ])