Example #1
    def get_project_ids_in_condition(
            condition: Expression) -> Optional[Set[int]]:
        """
        Extract project ids from an expression. Returns None if no project
        condition is found. Returns an empty set if conflicting project_id
        conditions are found.
        """
        match = FunctionCall(
            None,
            String(ConditionFunctions.EQ),
            (
                Column(column_name=String(project_column)),
                Literal(value=Param("project_id", Any(int))),
            ),
        ).match(condition)
        if match is not None:
            return {match.integer("project_id")}

        match = is_in_condition_pattern(
            Column(column_name=String(project_column))).match(condition)
        if match is not None:
            projects = match.expression("tuple")
            assert isinstance(projects, FunctionCallExpr)
            return {
                l.value
                for l in projects.parameters
                if isinstance(l, LiteralExpr) and isinstance(l.value, int)
            }

        match = FunctionCall(
            None,
            Param(
                "operator",
                Or([String(BooleanFunctions.AND),
                    String(BooleanFunctions.OR)]),
            ),
            (Param("lhs", AnyExpression()), Param("rhs", AnyExpression())),
        ).match(condition)
        if match is not None:
            lhs_projects = get_project_ids_in_condition(
                match.expression("lhs"))
            rhs_projects = get_project_ids_in_condition(
                match.expression("rhs"))
            if lhs_projects is None:
                return rhs_projects
            elif rhs_projects is None:
                return lhs_projects
            else:
                if match.string("operator") == BooleanFunctions.AND:
                    return lhs_projects & rhs_projects
                else:
                    return lhs_projects | rhs_projects

        return None
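A minimal usage sketch (not part of the original snippet), assuming project_column == "project_id" and the expression constructors (FunctionCallExpr, ColumnExpr, LiteralExpr) seen in the other examples on this page:

# Hypothetical condition: equals(project_id, 42)
condition = FunctionCallExpr(
    None,
    ConditionFunctions.EQ,
    (ColumnExpr(None, None, "project_id"), LiteralExpr(None, 42)),
)
assert get_project_ids_in_condition(condition) == {42}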
Example #2
    def process_query(self, query: Query,
                      request_settings: RequestSettings) -> None:
        matcher = FunctionCall(
            String("arrayElement"),
            (
                Column(
                    None,
                    String("contexts.value"),
                ),
                FunctionCall(
                    String("indexOf"),
                    (
                        Column(None, String("contexts.key")),
                        Literal(
                            Or([
                                String("device.simulator"),
                                String("device.online"),
                                String("device.charging"),
                            ]),
                        ),
                    ),
                ),
            ),
        )

        def process_column(exp: Expression) -> Expression:
            match = matcher.match(exp)

            if match:
                inner = replace(exp, alias=None)
                return FunctionCallExpr(
                    exp.alias,
                    "if",
                    (
                        binary_condition(
                            ConditionFunctions.IN,
                            inner,
                            literals_tuple(
                                None,
                                [
                                    LiteralExpr(None, "1"),
                                    LiteralExpr(None, "True")
                                ],
                            ),
                        ),
                        LiteralExpr(None, "True"),
                        LiteralExpr(None, "False"),
                    ),
                )

            return exp

        query.transform_expressions(process_column)
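A hedged before/after illustration (hypothetical: process_column is the inner helper defined above, shown here as if it were callable directly, using the expression constructors from the other examples):

# Hypothetical input:
#   arrayElement(contexts.value, indexOf(contexts.key, 'device.simulator'))
before = FunctionCallExpr(
    "ctx",
    "arrayElement",
    (
        ColumnExpr(None, None, "contexts.value"),
        FunctionCallExpr(
            None,
            "indexOf",
            (ColumnExpr(None, None, "contexts.key"),
             LiteralExpr(None, "device.simulator")),
        ),
    ),
)
after = process_column(before)
# The alias "ctx" is preserved and the result renders roughly as:
#   if(in(arrayElement(...), tuple('1', 'True')), 'True', 'False') AS ctx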
Example #3
def _get_date_range(query: Query) -> Optional[int]:
    """
    Best guess to find the time range for the query.
    We pick the first column that is compared with a datetime Literal.
    """
    pattern = FunctionCall(
        Or([String(ConditionFunctions.GT),
            String(ConditionFunctions.GTE)]),
        (Column(None, Param("col_name", Any(str))), Literal(Any(datetime))),
    )

    condition = query.get_condition_from_ast()
    if condition is None:
        return None
    for exp in condition:
        result = pattern.match(exp)
        if result is not None:
            from_date, to_date = get_time_range(query,
                                                result.string("col_name"))
            if from_date is None or to_date is None:
                return None
            else:
                return (to_date - from_date).days

    return None
Example #4
def get_time_range_expressions(
    conditions: Sequence[Expression],
    timestamp_field: str,
    table_name: Optional[str] = None,
) -> Tuple[
    Optional[Tuple[datetime, FunctionCallExpr]],
    Optional[Tuple[datetime, FunctionCallExpr]],
]:
    max_lower_bound: Optional[Tuple[datetime, FunctionCallExpr]] = None
    min_upper_bound: Optional[Tuple[datetime, FunctionCallExpr]] = None
    table_match = String(table_name) if table_name else None
    for c in conditions:
        match = FunctionCall(
            Param(
                "operator",
                Or([
                    String(OPERATOR_TO_FUNCTION[">="]),
                    String(OPERATOR_TO_FUNCTION["<"]),
                ]),
            ),
            (
                Column(table_match, String(timestamp_field)),
                Literal(Param("timestamp", Any(datetime))),
            ),
        ).match(c)

        if match is not None:
            timestamp = cast(datetime, match.scalar("timestamp"))
            assert isinstance(c, FunctionCallExpr)
            if match.string("operator") == OPERATOR_TO_FUNCTION[">="]:
                if not max_lower_bound or timestamp > max_lower_bound[0]:
                    max_lower_bound = (timestamp, c)
            else:
                if not min_upper_bound or timestamp < min_upper_bound[0]:
                    min_upper_bound = (timestamp, c)

    return (max_lower_bound, min_upper_bound)
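A hedged usage sketch (not from the original source; it assumes OPERATOR_TO_FUNCTION maps ">=" and "<" to the comparison function names matched above, and reuses the AST constructors from the other examples):

conditions = [
    # timestamp >= 2020-01-01
    FunctionCallExpr(
        None,
        OPERATOR_TO_FUNCTION[">="],
        (ColumnExpr(None, None, "timestamp"),
         LiteralExpr(None, datetime(2020, 1, 1))),
    ),
    # timestamp < 2020-01-02
    FunctionCallExpr(
        None,
        OPERATOR_TO_FUNCTION["<"],
        (ColumnExpr(None, None, "timestamp"),
         LiteralExpr(None, datetime(2020, 1, 2))),
    ),
]
lower, upper = get_time_range_expressions(conditions, "timestamp")
# lower == (datetime(2020, 1, 1), <the >= condition expression>)
# upper == (datetime(2020, 1, 2), <the < condition expression>)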
Example #5
def test_accessors() -> None:
    func = FunctionCall(
        String("f_name"),
        (
            FunctionCall(String("f"), (Column(None, String("my_col")), )),
            Param(
                "second_function",
                FunctionCall(Param("second_function_name", Any(str)), None),
            ),
        ),
    )

    result = func.match(
        FunctionCallExpr(
            "irrelevant",
            "f_name",
            (
                FunctionCallExpr(None, "f",
                                 (ColumnExpr(None, None, "my_col"), )),
                FunctionCallExpr(None, "second_name", tuple()),
            ),
        ))

    assert result is not None
    assert result.expression("second_function") == FunctionCallExpr(
        None, "second_name", tuple())
    assert result.scalar("second_function_name") == "second_name"
Example #6
def get_time_range_estimate(
    query: ProcessableQuery[Table],
) -> Tuple[Optional[datetime], Optional[datetime]]:
    """
    Best guess to find the time range for the query.
    We pick the first column that is compared with a datetime Literal.
    """
    pattern = FunctionCall(
        Or([String(ConditionFunctions.GT),
            String(ConditionFunctions.GTE)]),
        (Column(None, Param("col_name", Any(str))), Literal(Any(datetime))),
    )

    from_date, to_date = None, None
    condition = query.get_condition()
    if condition is None:
        return None, None
    for exp in condition:
        result = pattern.match(exp)
        if result is not None:
            from_date, to_date = get_time_range(query,
                                                result.string("col_name"))
            break

    return from_date, to_date
Example #7
    def replace_condition(expression: Expression) -> Expression:
        match = FunctionCall(
            String(OPERATOR_TO_FUNCTION[operator]),
            (Param("column", Column(None, String(field))), AnyExpression()),
        ).match(expression)

        if match is None:
            return expression
        return replace(
            expression, parameters=(match.expression("column"), new_operand))
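An illustrative call (hypothetical: operator, field, and new_operand are closure variables of the enclosing helper; assume here they are "=", "project_id", and LiteralExpr(None, 2)):

# Hypothetical condition: equals(project_id, 1)
cond = FunctionCallExpr(
    None,
    OPERATOR_TO_FUNCTION["="],
    (ColumnExpr(None, None, "project_id"), LiteralExpr(None, 1)),
)
# replace() is dataclasses.replace: only the parameters change, so the
# function name and alias of the original condition are preserved.
new_cond = replace_condition(cond)
# new_cond == FunctionCallExpr(None, OPERATOR_TO_FUNCTION["="],
#     (ColumnExpr(None, None, "project_id"), LiteralExpr(None, 2)))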
Example #8
        def replace_exp(exp: Expression) -> Expression:
            matcher = FunctionCall(
                String("notEquals"),
                (Column(None, String("type")), Literal(String("transaction"))),
            )

            if matcher.match(exp):
                return LiteralExpr(None, 1)

            return exp
Example #9
 def __init__(
     self,
     column_name: str,
     key_names: Sequence[str],
     val_names: Sequence[str],
 ):
     super().__init__(column_name, key_names, val_names)
     self.__array_join_pattern = FunctionCall(
         String("arrayJoin"),
         (Column(column_name=Param(
             "col",
             Or([String(column) for column in self.all_columns]),
         ), ), ),
     )
Example #10
 def is_skippable_condition(conditions: Expression) -> bool:
     """
     A condition composed of a bunch of has(column, ...) conditions OR'ed together
     can be ignored when looking for filter keys because these are the conditions
     used for the bloom filter index on the array column.
     """
     for column_name in column_names:
         has_pattern = FunctionCall(
             String("has"),
             (Column(column_name=String(column_name)), Literal(Any(str))),
         )
         if all(
             has_pattern.match(c) for c in get_first_level_or_conditions(conditions)
         ):
             return True
     return False
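A sketch of a condition this predicate would skip (hypothetical: it assumes column_names contains "tags.key" and builds the OR with BooleanFunctions.OR as in Example #1):

# has(tags.key, 'k1') OR has(tags.key, 'k2')
skippable = FunctionCallExpr(
    None,
    BooleanFunctions.OR,
    (
        FunctionCallExpr(None, "has", (ColumnExpr(None, None, "tags.key"),
                                       LiteralExpr(None, "k1"))),
        FunctionCallExpr(None, "has", (ColumnExpr(None, None, "tags.key"),
                                       LiteralExpr(None, "k2"))),
    ),
)
assert is_skippable_condition(skippable)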
Example #11
    def process_query(self, query: Query, request_settings: RequestSettings) -> None:
        # We care only of promoted contexts, so we do not need to match
        # the original nested expression.
        matcher = FunctionCall(
            String("toString"),
            (
                Column(
                    None,
                    Or(
                        [
                            String("device_simulator"),
                            String("device_online"),
                            String("device_charging"),
                        ]
                    ),
                ),
            ),
        )

        def replace_exp(exp: Expression) -> Expression:
            if matcher.match(exp) is not None:
                inner = replace(exp, alias=None)
                return FunctionCallExpr(
                    exp.alias,
                    "multiIf",
                    (
                        binary_condition(
                            None, ConditionFunctions.EQ, inner, Literal(None, "")
                        ),
                        Literal(None, ""),
                        binary_condition(
                            None,
                            ConditionFunctions.IN,
                            inner,
                            literals_tuple(
                                None, [Literal(None, "1"), Literal(None, "True")]
                            ),
                        ),
                        Literal(None, "True"),
                        Literal(None, "False"),
                    ),
                )
            return exp

        query.transform_expressions(replace_exp)
Example #12
def get_time_range(
        query: Query,
        timestamp_field: str) -> Tuple[Optional[datetime], Optional[datetime]]:
    """
    Finds the minimal time range for this query: the >= timestamp
    condition with the highest datetime literal and the < timestamp
    condition with the lowest, returned as a tuple of datetimes.
    It only looks at first-level AND conditions since, if the timestamp
    is nested inside an OR, we cannot say anything about how it compares
    to the other timestamp conditions.
    """

    condition_clause = query.get_condition_from_ast()
    if not condition_clause:
        return (None, None)

    max_lower_bound = None
    min_upper_bound = None
    for c in get_first_level_and_conditions(condition_clause):
        match = FunctionCall(
            None,
            Param(
                "operator",
                Or([
                    String(OPERATOR_TO_FUNCTION[">="]),
                    String(OPERATOR_TO_FUNCTION["<"]),
                ]),
            ),
            (
                Column(None, None, String(timestamp_field)),
                Literal(None, Param("timestamp", Any(datetime))),
            ),
        ).match(c)

        if match is not None:
            timestamp = cast(datetime, match.scalar("timestamp"))
            if match.string("operator") == OPERATOR_TO_FUNCTION[">="]:
                if not max_lower_bound or timestamp > max_lower_bound:
                    max_lower_bound = timestamp
            else:
                if not min_upper_bound or timestamp < min_upper_bound:
                    min_upper_bound = timestamp

    return (max_lower_bound, min_upper_bound)
Example #13
 def __init__(self, array_columns: Sequence[str]):
     self.__array_has_pattern = FunctionCall(
         String("equals"),
         (
             Param(
                 "has",
                 FunctionCall(
                     String("has"),
                     (
                         Column(
                             column_name=Or(
                                 [String(column) for column in array_columns]
                             )
                         ),
                         Literal(Any(str)),
                     ),
                 ),
             ),
             Literal(Integer(1)),
         ),
     )
Example #14
def test_subscription_worker(subscription_data: SubscriptionData) -> None:
    broker: Broker[SubscriptionTaskResult] = Broker(MemoryMessageStorage(),
                                                    TestingClock())

    result_topic = Topic("subscription-results")

    broker.create_topic(result_topic, partitions=1)

    frequency = timedelta(minutes=1)
    evaluations = 3

    subscription = Subscription(
        SubscriptionIdentifier(PartitionId(0), uuid1()),
        subscription_data,
    )

    store = DummySubscriptionDataStore()
    store.create(subscription.identifier.uuid, subscription.data)

    metrics = DummyMetricsBackend(strict=True)

    dataset = get_dataset("events")
    worker = SubscriptionWorker(
        dataset,
        ThreadPoolExecutor(),
        {
            0: SubscriptionScheduler(store, PartitionId(0), timedelta(),
                                     metrics)
        },
        broker.get_producer(),
        result_topic,
        metrics,
    )

    now = datetime(2000, 1, 1)

    tick = Tick(
        offsets=Interval(0, 1),
        timestamps=Interval(now - (frequency * evaluations), now),
    )

    result_futures = worker.process_message(
        Message(Partition(Topic("events"), 0), 0, tick, now))

    assert result_futures is not None and len(result_futures) == evaluations

    # Publish the results.
    worker.flush_batch([result_futures])

    # Check to make sure the results were published.
    # NOTE: This does not cover the ``SubscriptionTaskResultCodec``!
    consumer = broker.get_consumer("group")
    consumer.subscribe([result_topic])

    for i in range(evaluations):
        timestamp = now - frequency * (evaluations - i)

        message = consumer.poll()
        assert message is not None
        assert message.partition.topic == result_topic

        task, future = result_futures[i]
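        # Chained assignment: future.result() is stored whole in future_result
        # and simultaneously unpacked into (request, result).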
        future_result = request, result = future.result()
        assert message.payload.task.timestamp == timestamp
        assert message.payload == SubscriptionTaskResult(task, future_result)

        # NOTE: The time series extension is folded back into the request
        # body, ideally this would reference the timeseries options in
        # isolation.
        from_pattern = FunctionCall(
            String(ConditionFunctions.GTE),
            (
                Column(None, String("timestamp")),
                Literal(Datetime(timestamp - subscription.data.time_window)),
            ),
        )
        to_pattern = FunctionCall(
            String(ConditionFunctions.LT),
            (Column(None, String("timestamp")), Literal(Datetime(timestamp))),
        )

        condition = request.query.get_condition()
        assert condition is not None

        conditions = get_first_level_and_conditions(condition)

        assert any([from_pattern.match(e) for e in conditions])
        assert any([to_pattern.match(e) for e in conditions])

        assert result == {
            "meta": [{
                "name": "count",
                "type": "UInt64"
            }],
            "data": [{
                "count": 0
            }],
        }
Example #15
def _array_join_pattern(column_name: str) -> FunctionCall:
    return FunctionCall(
        String("arrayJoin"),
        (Column(column_name=String(column_name)),),
    )
Example #16
    def process_query(self, query: Query,
                      request_settings: RequestSettings) -> None:
        arrayjoin_pattern = FunctionCall(
            String("arrayJoin"),
            (Column(column_name=Param(
                "col",
                Or([
                    String(key_column(self.__column_name)),
                    String(val_column(self.__column_name)),
                ]),
            ), ), ),
        )

        arrayjoins_in_query = set()
        for e in query.get_all_expressions():
            match = arrayjoin_pattern.match(e)
            if match is not None:
                arrayjoins_in_query.add(match.string("col"))

        filtered_keys = [
            LiteralExpr(None, key)
            for key in get_filtered_mapping_keys(query, self.__column_name)
        ]

        # Ensures the alias we apply to the arrayJoin is not already taken.
        used_aliases = {exp.alias for exp in query.get_all_expressions()}
        pair_alias_root = f"snuba_all_{self.__column_name}"
        pair_alias = pair_alias_root
        index = 0
        while pair_alias in used_aliases:
            index += 1
            pair_alias = f"{pair_alias_root}_{index}"

        def replace_expression(expr: Expression) -> Expression:
            """
            Applies the appropriate optimization on a single arrayJoin expression.
            """
            match = arrayjoin_pattern.match(expr)
            if match is None:
                return expr

            if arrayjoins_in_query == {
                    key_column(self.__column_name),
                    val_column(self.__column_name),
            }:
                # Both arrayJoin(col.key) and arrayJoin(col.value) expressions
                # are present in the query. Do the arrayJoin on key-value pairs
                # instead of independent arrayJoins for keys and values.
                array_index = (
                    LiteralExpr(None, 1)
                    if match.string("col") == key_column(self.__column_name)
                    else LiteralExpr(None, 2)
                )

                if not filtered_keys:
                    return _unfiltered_mapping_pairs(expr.alias,
                                                     self.__column_name,
                                                     pair_alias, array_index)
                else:
                    return _filtered_mapping_pairs(
                        expr.alias,
                        self.__column_name,
                        pair_alias,
                        filtered_keys,
                        array_index,
                    )

            elif filtered_keys:
                # Only one between arrayJoin(col.key) and arrayJoin(col.value)
                # is present, and it is arrayJoin(col.key) since we found
                # filtered keys.
                return _filtered_mapping_keys(expr.alias, self.__column_name,
                                              filtered_keys)
            else:
                # No viable optimization
                return expr

        query.transform_expressions(replace_expression)
Example #17
            LiteralExpr(None, None),
        ),
    )


TABLE_MAPPING_PARAM = "table_name"
VALUE_COL_MAPPING_PARAM = "value_column"
KEY_COL_MAPPING_PARAM = "key_column"
KEY_MAPPING_PARAM = "key"
mapping_pattern = FunctionCall(
    None,
    String("arrayElement"),
    (
        Column(
            None,
            Param(TABLE_MAPPING_PARAM, AnyOptionalString()),
            Param(VALUE_COL_MAPPING_PARAM, Any(str)),
        ),
        FunctionCall(
            None,
            String("indexOf"),
            (
                Column(None, None, Param(KEY_COL_MAPPING_PARAM, Any(str))),
                Literal(None, Param(KEY_MAPPING_PARAM, Any(str))),
            ),
        ),
    ),
)

# TODO: build more of these mappers.
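A hedged sketch (not from the original source) of reading the captured params back after a successful match, using the expression constructors from the other examples:

# arrayElement(contexts.value, indexOf(contexts.key, 'device.online'))
expr = FunctionCallExpr(
    None,
    "arrayElement",
    (
        ColumnExpr(None, None, "contexts.value"),
        FunctionCallExpr(
            None,
            "indexOf",
            (ColumnExpr(None, None, "contexts.key"),
             LiteralExpr(None, "device.online")),
        ),
    ),
)
result = mapping_pattern.match(expr)
assert result is not None
assert result.string(VALUE_COL_MAPPING_PARAM) == "contexts.value"
assert result.string(KEY_MAPPING_PARAM) == "device.online"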
Example #18
test_cases = [
    (
        "Literal match",
        Literal(None),
        LiteralExpr("random_alias", 1),
        MatchResult(),
    ),
    (
        "Literal match with none type",
        Literal(Any(type(None))),
        LiteralExpr("alias", 1),
        None,
    ),
    (
        "Single node match",
        Column(OptionalString("table"), String("test_col")),
        ColumnExpr("alias_we_don't_care_of", "table", "test_col"),
        MatchResult(),
    ),
    (
        "Single node no match",
        Column(None, String("test_col")),
        ColumnExpr(None, None, "not_a_test_col"),
        None,
    ),
    (
        "Matches a None table name",
        Column(Param("table_name", AnyOptionalString()), None),
        ColumnExpr(None, None, "not_a_test_col"),
        MatchResult({"table_name": None}),
    ),
Example #19
    def execute(
        self,
        query: Query,
        query_settings: QuerySettings,
        runner: SplitQueryRunner,
    ) -> Optional[QueryResult]:
        """
        Split query in 2 steps if a large number of columns is being selected.
            - First query only selects event_id, project_id and timestamp.
            - Second query selects all fields for only those events.
            - Shrink the date range.
        """
        limit = query.get_limit()
        if (limit is None or limit == 0 or query.get_groupby()
                or not query.get_selected_columns()):
            return None

        if limit > settings.COLUMN_SPLIT_MAX_LIMIT:
            metrics.increment("column_splitter.query_above_limit")
            return None

        # Do not split if there is already a = or IN condition on an ID column
        id_column_matcher = FunctionCall(
            Or([String(ConditionFunctions.EQ),
                String(ConditionFunctions.IN)]),
            (
                Column(None, String(self.__id_column)),
                AnyExpression(),
            ),
        )

        for expr in query.get_condition() or []:
            match = id_column_matcher.match(expr)

            if match:
                return None

        # We need to count the number of table/column name pairs
        # not the number of distinct Column objects in the query
        # so as to avoid counting aliased columns multiple times.
        selected_columns = {
            (col.table_name, col.column_name)
            for col in query.get_columns_referenced_in_select()
        }

        if len(selected_columns) < settings.COLUMN_SPLIT_MIN_COLS:
            metrics.increment("column_splitter.main_query_min_threshold")
            return None

        minimal_query = copy.deepcopy(query)

        # TODO: provide the table alias name to this splitter if we ever use it
        # in joins.
        minimal_query.set_ast_selected_columns([
            SelectedExpression(
                self.__id_column,
                ColumnExpr(self.__id_column, None, self.__id_column),
            ),
            SelectedExpression(
                self.__project_column,
                ColumnExpr(self.__project_column, None, self.__project_column),
            ),
            SelectedExpression(
                self.__timestamp_column,
                ColumnExpr(self.__timestamp_column, None,
                           self.__timestamp_column),
            ),
        ])

        for exp in minimal_query.get_all_expressions():
            if exp.alias in (
                    self.__id_column,
                    self.__project_column,
                    self.__timestamp_column,
            ) and not (isinstance(exp, ColumnExpr)
                       and exp.column_name == exp.alias):
                logger.warning(
                    "Potential alias shadowing due to column splitter",
                    extra={"expression": exp},
                    exc_info=True,
                )

        # Ensures the AST minimal query is actually runnable on its own.
        if not minimal_query.validate_aliases():
            return None

        # There is a Clickhouse bug where if functions in the ORDER BY clause are not in the SELECT,
        # they fail on distributed tables. For that specific case, skip the query splitter.
        for orderby in minimal_query.get_orderby():
            if isinstance(orderby.expression,
                          (FunctionCallExpr, CurriedFunctionCallExpr)):
                metrics.increment("column_splitter.orderby_has_a_function")
                return None

        result = runner(minimal_query, query_settings)
        del minimal_query

        if not result.result["data"]:
            metrics.increment("column_splitter.no_data_from_minimal_query")
            return None

        # Making a copy just in case runner returned None (which would drive the execution
        # strategy to ignore the result of this splitter and try the next one).
        query = copy.deepcopy(query)

        event_ids = list(
            set([event[self.__id_column] for event in result.result["data"]]))
        if len(event_ids) > settings.COLUMN_SPLIT_MAX_RESULTS:
            # We may be running a query that is beyond the Clickhouse maximum query size,
            # so we cowardly abandon.
            metrics.increment(
                "column_splitter.intermediate_results_beyond_limit")
            return None

        query.add_condition_to_ast(
            in_condition(
                ColumnExpr(None, None, self.__id_column),
                [LiteralExpr(None, e_id) for e_id in event_ids],
            ))
        query.set_offset(0)
        query.set_limit(len(result.result["data"]))

        project_ids = list(
            set([
                event[self.__project_column] for event in result.result["data"]
            ]))
        _replace_ast_condition(
            query,
            self.__project_column,
            "IN",
            literals_tuple(None,
                           [LiteralExpr(None, p_id) for p_id in project_ids]),
        )

        timestamps = [
            event[self.__timestamp_column] for event in result.result["data"]
        ]
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            ">=",
            LiteralExpr(None, util.parse_datetime(min(timestamps))),
        )
        # We add 1 second since this gets translated to ('timestamp', '<', to_date)
        # and events are stored with a granularity of 1 second.
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            "<",
            LiteralExpr(
                None,
                (util.parse_datetime(max(timestamps)) + timedelta(seconds=1)),
            ),
        )

        return runner(query, query_settings)
Example #20
    def execute(
        self,
        query: Query,
        request_settings: RequestSettings,
        runner: SplitQueryRunner,
    ) -> Optional[QueryResult]:
        """
        Split query in 2 steps if a large number of columns is being selected.
            - First query only selects event_id, project_id and timestamp.
            - Second query selects all fields for only those events.
            - Shrink the date range.
        """
        limit = query.get_limit()
        if (limit is None or limit == 0 or query.get_groupby()
                or query.get_aggregations()
                or not query.get_selected_columns()):
            return None

        if limit > settings.COLUMN_SPLIT_MAX_LIMIT:
            metrics.increment("column_splitter.query_above_limit")
            return None

        # Do not split if there is already a = or IN condition on an ID column
        id_column_matcher = FunctionCall(
            Or([String(ConditionFunctions.EQ),
                String(ConditionFunctions.IN)]),
            (
                Column(None, String(self.__id_column)),
                AnyExpression(),
            ),
        )

        for expr in query.get_condition_from_ast() or []:
            match = id_column_matcher.match(expr)

            if match:
                return None

        # We need to count the number of table/column name pairs
        # not the number of distinct Column objects in the query
        # so as to avoid counting aliased columns multiple times.
        total_columns = {(col.table_name, col.column_name)
                         for col in query.get_all_ast_referenced_columns()}

        minimal_query = copy.deepcopy(query)
        minimal_query.set_selected_columns(
            [self.__id_column, self.__project_column, self.__timestamp_column])
        # TODO: provide the table alias name to this splitter if we ever use it
        # in joins.
        minimal_query.set_ast_selected_columns([
            SelectedExpression(self.__id_column,
                               ColumnExpr(None, None, self.__id_column)),
            SelectedExpression(self.__project_column,
                               ColumnExpr(None, None, self.__project_column)),
            SelectedExpression(
                self.__timestamp_column,
                ColumnExpr(None, None, self.__timestamp_column),
            ),
        ])

        for exp in minimal_query.get_all_expressions():
            if exp.alias in (
                    self.__id_column,
                    self.__project_column,
                    self.__timestamp_column,
            ) and not (isinstance(exp, ColumnExpr)
                       and exp.column_name == exp.alias):
                logger.warning(
                    "Potential alias shadowing due to column splitter",
                    extra={"expression": exp},
                    exc_info=True,
                )

        minimal_columns = {
            (col.table_name, col.column_name)
            for col in minimal_query.get_all_ast_referenced_columns()
        }
        if len(total_columns) <= len(minimal_columns):
            return None

        # Ensures the AST minimal query is actually runnable on its own.
        if not minimal_query.validate_aliases():
            return None

        legacy_references = set(minimal_query.get_all_referenced_columns())
        ast_column_names = {
            c.column_name
            for c in minimal_query.get_all_ast_referenced_columns()
        }
        # Ensures the legacy minimal query (which does not expand alias references)
        # does not contain alias references we removed when creating minimal_query.
        if legacy_references - ast_column_names:
            metrics.increment("columns.skip_invalid_legacy_query")
            return None

        result = runner(minimal_query, request_settings)
        del minimal_query

        if not result.result["data"]:
            return None

        # Making a copy just in case runner returned None (which would drive the execution
        # strategy to ignore the result of this splitter and try the next one).
        query = copy.deepcopy(query)

        event_ids = list(
            set([event[self.__id_column] for event in result.result["data"]]))
        if len(event_ids) > settings.COLUMN_SPLIT_MAX_RESULTS:
            # We may be running a query that is beyond the Clickhouse maximum query size,
            # so we cowardly abandon.
            metrics.increment(
                "column_splitter.intermediate_results_beyond_limit")
            return None

        query.add_conditions([(self.__id_column, "IN", event_ids)])
        query.add_condition_to_ast(
            in_condition(
                None,
                ColumnExpr(None, None, self.__id_column),
                [LiteralExpr(None, e_id) for e_id in event_ids],
            ))
        query.set_offset(0)
        # TODO: This is technically wrong. Event ids are unique per project, not globally.
        # So, if the minimal query only returned the same event_id from two projects, we
        # would be underestimating the limit here.
        query.set_limit(len(event_ids))

        project_ids = list(
            set([
                event[self.__project_column] for event in result.result["data"]
            ]))
        _replace_condition(
            query,
            self.__project_column,
            "IN",
            project_ids,
        )
        _replace_ast_condition(
            query,
            self.__project_column,
            "IN",
            literals_tuple(None,
                           [LiteralExpr(None, p_id) for p_id in project_ids]),
        )

        timestamps = [
            event[self.__timestamp_column] for event in result.result["data"]
        ]
        _replace_condition(
            query,
            self.__timestamp_column,
            ">=",
            util.parse_datetime(min(timestamps)).isoformat(),
        )
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            ">=",
            LiteralExpr(None, util.parse_datetime(min(timestamps))),
        )
        # We add 1 second since this gets translated to ('timestamp', '<', to_date)
        # and events are stored with a granularity of 1 second.
        _replace_condition(
            query,
            self.__timestamp_column,
            "<",
            (util.parse_datetime(max(timestamps)) +
             timedelta(seconds=1)).isoformat(),
        )
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            "<",
            LiteralExpr(
                None,
                (util.parse_datetime(max(timestamps)) + timedelta(seconds=1)),
            ),
        )

        return runner(query, request_settings)
Example #21
def array_join_pattern(column_name: str) -> FunctionCall:
    return FunctionCall(
        None,
        String("arrayJoin"),
        (Column(column_name=String(key_column(column_name))),),
    )