Example #1
def _filtered_mapping_pairs(
    alias: Optional[str],
    column_name: str,
    pair_alias: str,
    filtered_tags: Sequence[LiteralExpr],
    array_index: LiteralExpr,
) -> Expression:
    # (arrayJoin(arrayFilter(
    #       pair -> tupleElement(pair, 1) IN (tags),
    #       arrayMap((x,y) -> (x,y), tags.key, tags.value)
    #  )) as all_tags).1
    return tupleElement(
        alias,
        arrayJoin(
            pair_alias,
            filter_key_values(
                zip_columns(
                    ColumnExpr(None, None, key_column(column_name)),
                    ColumnExpr(None, None, val_column(column_name)),
                ),
                filtered_tags,
            ),
        ),
        array_index,
    )
Example #2
def build_mapping_expr(
    alias: Optional[str],
    table_name: Optional[str],
    col_name: str,
    mapping_key: Expression,
) -> FunctionCallExpr:
    return arrayElement(
        alias,
        ColumnExpr(None, table_name, f"{col_name}.value"),
        FunctionCallExpr(
            None,
            "indexOf",
            (ColumnExpr(None, table_name, f"{col_name}.key"), mapping_key),
        ),
    )
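For orientation, the expression built here is the standard tag lookup. A minimal sketch of what it renders to, assuming a hypothetical tag key 'env' and alias 'tags[env]' (neither comes from the listing):

# Illustrative only:
# build_mapping_expr("tags[env]", None, "tags", LiteralExpr(None, "env"))
# is expected to produce roughly:
#   arrayElement(tags.value, indexOf(tags.key, 'env')) AS `tags[env]`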
Example #3
def merge_mapper(name: str) -> ColumnToFunction:
    return ColumnToFunction(
        None,
        name,
        f"{name}Merge",
        (ColumnExpr(None, None, name), ),
    )
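A quick illustration of the helper above, with an assumed column name (not taken from the listing):

# merge_mapper("min") builds
#   ColumnToFunction(None, "min", "minMerge", (ColumnExpr(None, None, "min"),))
# i.e. references to the `min` column are rewritten as minMerge(min).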
Example #4
def test_accessors() -> None:
    func = FunctionCall(
        String("f_name"),
        (
            FunctionCall(String("f"), (Column(None, String("my_col")), )),
            Param(
                "second_function",
                FunctionCall(Param("second_function_name", Any(str)), None),
            ),
        ),
    )

    result = func.match(
        FunctionCallExpr(
            "irrelevant",
            "f_name",
            (
                FunctionCallExpr(None, "f",
                                 (ColumnExpr(None, None, "my_col"), )),
                FunctionCallExpr(None, "second_name", tuple()),
            ),
        ))

    assert result is not None
    assert result.expression("second_function") == FunctionCallExpr(
        None, "second_name", tuple())
    assert result.scalar("second_function_name") == "second_name"
Example #5
def _unfiltered_mapping_pairs(alias: Optional[str], column_name: str,
                              pair_alias: str,
                              tuple_index: LiteralExpr) -> Expression:
    # (arrayJoin(
    #   arrayMap((x,y) -> (x,y), tags.key, tags.value)
    #  ) as all_tags).1
    return tupleElement(
        alias,
        arrayJoin(
            pair_alias,
            zip_columns(
                ColumnExpr(None, None, key_column(column_name)),
                ColumnExpr(None, None, val_column(column_name)),
            ),
        ),
        tuple_index,
    )
Example #6
def filtered_mapping_keys(alias: Optional[str], column_name: str,
                          filtered: Sequence[str]) -> Expression:
    return arrayJoin(
        alias,
        filter_column(
            ColumnExpr(None, None, column_name),
            [LiteralExpr(None, f) for f in filtered],
        ),
    )
Example #7
def _build_parameters(
    expression: Union[FunctionCall, CurriedFunctionCall],
    children_translator: SnubaClickhouseStrictTranslator,
    aggregated_col_name: str,
) -> Tuple[Expression, ...]:
    assert isinstance(expression.parameters[0], ColumnExpr)
    return (
        ColumnExpr(None, expression.parameters[0].table_name,
                   aggregated_col_name),
        *[p.accept(children_translator) for p in expression.parameters[1:]],
    )
Example #8
def _filtered_mapping_keys(alias: Optional[str], column_name: str,
                           filtered_tags: Sequence[LiteralExpr]) -> Expression:
    # arrayJoin(arrayFilter(
    #   tag -> tag IN (tags),
    #   tags.key
    # ))
    return arrayJoin(
        alias,
        filter_keys(ColumnExpr(None, None, key_column(column_name)),
                    filtered_tags),
    )
Example #9
def unfiltered_mapping_tuples(
    alias: Optional[str],
    tuple_alias: str,
    tuple_index: LiteralExpr,
    column_names: Sequence[str],
) -> Expression:
    return tupleElement(
        alias,
        arrayJoin(
            tuple_alias,
            zip_columns(
                *[ColumnExpr(None, None, column) for column in column_names]),
        ),
        tuple_index,
    )
Example #10
 def __init__(self) -> None:
     super().__init__(
         writable_storage_key=StorageKey.METRICS_COUNTERS_BUCKETS,
         readable_storage_key=StorageKey.METRICS_COUNTERS,
         value_schema=[
             Column("value", AggregateFunction("sum", [Float(64)]))
         ],
         mappers=TranslationMappers(columns=[
             ColumnToFunction(
                 None,
                 "value",
                 "sumMerge",
                 (ColumnExpr(None, None, "value"), ),
             ),
         ], ),
     )
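With this mapper in place, a query referencing the `value` column of the readable storage is rewritten to merge the aggregate state; roughly (sketch):

# value  ->  sumMerge(value)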
Example #11
 def __init__(self) -> None:
     super().__init__(
         writable_storage_key=StorageKey.METRICS_BUCKETS,
         readable_storage_key=StorageKey.METRICS_SETS,
         value_schema=[
             Column("value", AggregateFunction("uniqCombined64",
                                               [UInt(64)])),
         ],
         mappers=TranslationMappers(columns=[
             ColumnToFunction(
                 None,
                 "value",
                 "uniqCombined64Merge",
                 (ColumnExpr(None, None, "value"), ),
             ),
         ], ),
     )
Example #12
def build_nullable_mapping_expr(
    alias: Optional[str],
    table_name: Optional[str],
    col_name: str,
    mapping_key: Expression,
) -> FunctionCallExpr:
    # TODO: Add a pattern for this expression if we need it.
    return FunctionCallExpr(
        alias,
        "if",
        (
            FunctionCallExpr(
                None,
                "has",
                (ColumnExpr(None, table_name, f"{col_name}.key"), mapping_key),
            ),
            build_mapping_expr(None, table_name, col_name, mapping_key),
            LiteralExpr(None, None),
        ),
    )
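Reusing the hypothetical names from Example #2, the nullable variant wraps the lookup so that a missing key yields NULL instead of an empty value; roughly (sketch only):

#   if(has(tags.key, 'env'),
#      arrayElement(tags.value, indexOf(tags.key, 'env')),
#      NULL) AS `tags[env]`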
Example #13
def filtered_mapping_tuples(
    alias: Optional[str],
    tuple_alias: str,
    tuple_index: LiteralExpr,
    column_names: Sequence[str],
    single_filtered: Dict[LiteralExpr, Sequence[str]],
    multiple_filtered: Dict[Tuple[LiteralExpr, ...], Sequence[Tuple[str,
                                                                    ...]]],
) -> Expression:
    return tupleElement(
        alias,
        arrayJoin(
            tuple_alias,
            filter_expression(
                zip_columns(*[
                    ColumnExpr(None, None, column) for column in column_names
                ]),
                single_filtered,
                multiple_filtered,
            ),
        ),
        tuple_index,
    )
Example #14
 def __init__(self) -> None:
     super().__init__(
         writable_storage_key=StorageKey.METRICS_DISTRIBUTIONS_BUCKETS,
         readable_storage_key=StorageKey.METRICS_DISTRIBUTIONS,
         value_schema=[
             Column(
                 "percentiles",
                 AggregateFunction("quantiles(0.5, 0.75, 0.9, 0.95, 0.99)",
                                   [Float(64)]),
             ),
             Column("min", AggregateFunction("min", [Float(64)])),
             Column("max", AggregateFunction("max", [Float(64)])),
             Column("avg", AggregateFunction("avg", [Float(64)])),
             Column("sum", AggregateFunction("sum", [Float(64)])),
             Column("count", AggregateFunction("count", [Float(64)])),
         ],
         mappers=TranslationMappers(columns=[
             ColumnToCurriedFunction(
                 None,
                 "percentiles",
                 FunctionCall(
                     None,
                     "quantilesMerge",
                     tuple(
                         Literal(None, quant)
                         for quant in [0.5, 0.75, 0.9, 0.95, 0.99]),
                 ),
                 (ColumnExpr(None, None, "percentiles"), ),
             ),
             merge_mapper("min"),
             merge_mapper("max"),
             merge_mapper("avg"),
             merge_mapper("sum"),
             merge_mapper("count"),
         ], ),
     )
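The curried mapper above turns a reference to the `percentiles` column into a parametric merge call; the remaining columns go through merge_mapper. Roughly (sketch):

# percentiles              ->  quantilesMerge(0.5, 0.75, 0.9, 0.95, 0.99)(percentiles)
# min, max, avg, sum, count ->  minMerge(min), maxMerge(max), avgMerge(avg), ...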
Example #15
def generate_bloom_filter_condition(
    column_name: str,
    single_filtered: Dict[str, Sequence[str]],
    multiple_filtered: Dict[Tuple[str, ...], Sequence[Tuple[str, ...]]],
) -> Optional[Expression]:
    """
    Generate the filters on the array columns to use the bloom filter index on
    the spans.op and spans.group columns in order to filter the transactions
    prior to the array join.

    The bloom filter index requires the use of the has function, therefore
    the final condition is built up from a series of has conditions.
    """

    per_key_vals: Dict[str, Set[str]] = defaultdict(set)

    for key, single_filter in single_filtered.items():
        for val in single_filter:
            per_key_vals[key].add(val)

    for keys, multiple_filter in multiple_filtered.items():
        for val_tuple in multiple_filter:
            for key, val in zip(keys, val_tuple):
                per_key_vals[key].add(val)

    conditions = [
        combine_or_conditions([
            FunctionCallExpr(
                None,
                "has",
                (ColumnExpr(None, None, key), LiteralExpr(None, val)),
            ) for val in sorted(vals)
        ]) for key, vals in per_key_vals.items()
    ]

    return combine_and_conditions(conditions) if conditions else None
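A worked example may help; the keys and values below are made up, only the has(...) shape comes from the function above:

# Illustrative only:
# single_filtered   = {"spans.op": ["db"]}
# multiple_filtered = {("spans.op", "spans.group"): [("http", "group1")]}
# per_key_vals ends up as {"spans.op": {"db", "http"}, "spans.group": {"group1"}}
# and the generated condition is equivalent to:
#   (has(spans.op, 'db') OR has(spans.op, 'http')) AND has(spans.group, 'group1')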
Example #16
    def execute(
        self,
        query: Query,
        request_settings: RequestSettings,
        runner: SplitQueryRunner,
    ) -> Optional[QueryResult]:
        """
        Split the query in 2 steps if a large number of columns is being selected.
            - First query only selects event_id, project_id and timestamp.
            - Second query selects all fields for only those events.
            - Shrink the date range.
        """
        limit = query.get_limit()
        if (limit is None or limit == 0 or query.get_groupby()
                or query.get_aggregations()
                or not query.get_selected_columns()):
            return None

        if limit > settings.COLUMN_SPLIT_MAX_LIMIT:
            metrics.increment("column_splitter.query_above_limit")
            return None

        # Do not split if there is already a = or IN condition on an ID column
        id_column_matcher = FunctionCall(
            Or([String(ConditionFunctions.EQ),
                String(ConditionFunctions.IN)]),
            (
                Column(None, String(self.__id_column)),
                AnyExpression(),
            ),
        )

        for expr in query.get_condition_from_ast() or []:
            match = id_column_matcher.match(expr)

            if match:
                return None

        # We need to count the number of table/column name pairs,
        # not the number of distinct Column objects in the query,
        # so as to avoid counting aliased columns multiple times.
        total_columns = {(col.table_name, col.column_name)
                         for col in query.get_all_ast_referenced_columns()}

        minimal_query = copy.deepcopy(query)
        minimal_query.set_selected_columns(
            [self.__id_column, self.__project_column, self.__timestamp_column])
        # TODO: provide the table alias name to this splitter if we ever use it
        # in joins.
        minimal_query.set_ast_selected_columns([
            SelectedExpression(self.__id_column,
                               ColumnExpr(None, None, self.__id_column)),
            SelectedExpression(self.__project_column,
                               ColumnExpr(None, None, self.__project_column)),
            SelectedExpression(
                self.__timestamp_column,
                ColumnExpr(None, None, self.__timestamp_column),
            ),
        ])

        for exp in minimal_query.get_all_expressions():
            if exp.alias in (
                    self.__id_column,
                    self.__project_column,
                    self.__timestamp_column,
            ) and not (isinstance(exp, ColumnExpr)
                       and exp.column_name == exp.alias):
                logger.warning(
                    "Potential alias shadowing due to column splitter",
                    extra={"expression": exp},
                    exc_info=True,
                )

        minimal_columns = {
            (col.table_name, col.column_name)
            for col in minimal_query.get_all_ast_referenced_columns()
        }
        if len(total_columns) <= len(minimal_columns):
            return None

        # Ensures the AST minimal query is actually runnable on its own.
        if not minimal_query.validate_aliases():
            return None

        legacy_references = set(minimal_query.get_all_referenced_columns())
        ast_column_names = {
            c.column_name
            for c in minimal_query.get_all_ast_referenced_columns()
        }
        # Ensures the legacy minimal query (which does not expand alias references)
        # does not contain alias references we removed when creating minimal_query.
        if legacy_references - ast_column_names:
            metrics.increment("columns.skip_invalid_legacy_query")
            return None

        result = runner(minimal_query, request_settings)
        del minimal_query

        if not result.result["data"]:
            return None

        # Making a copy just in case runner returned None (which would drive the execution
        # strategy to ignore the result of this splitter and try the next one).
        query = copy.deepcopy(query)

        event_ids = list(
            set([event[self.__id_column] for event in result.result["data"]]))
        if len(event_ids) > settings.COLUMN_SPLIT_MAX_RESULTS:
            # We may be running a query that is beyond the clickhouse maximum query size,
            # so we cowardly abandon.
            metrics.increment(
                "column_splitter.intermediate_results_beyond_limit")
            return None

        query.add_conditions([(self.__id_column, "IN", event_ids)])
        query.add_condition_to_ast(
            in_condition(
                None,
                ColumnExpr(None, None, self.__id_column),
                [LiteralExpr(None, e_id) for e_id in event_ids],
            ))
        query.set_offset(0)
        # TODO: This is technically wrong. Event ids are unique per project, not globally.
        # So, if the minimal query only returned the same event_id from two projects, we
        # would be underestimating the limit here.
        query.set_limit(len(event_ids))

        project_ids = list(
            set([
                event[self.__project_column] for event in result.result["data"]
            ]))
        _replace_condition(
            query,
            self.__project_column,
            "IN",
            project_ids,
        )
        _replace_ast_condition(
            query,
            self.__project_column,
            "IN",
            literals_tuple(None,
                           [LiteralExpr(None, p_id) for p_id in project_ids]),
        )

        timestamps = [
            event[self.__timestamp_column] for event in result.result["data"]
        ]
        _replace_condition(
            query,
            self.__timestamp_column,
            ">=",
            util.parse_datetime(min(timestamps)).isoformat(),
        )
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            ">=",
            LiteralExpr(None, util.parse_datetime(min(timestamps))),
        )
        # We add 1 second since this gets translated to ('timestamp', '<', to_date)
        # and events are stored with a granularity of 1 second.
        _replace_condition(
            query,
            self.__timestamp_column,
            "<",
            (util.parse_datetime(max(timestamps)) +
             timedelta(seconds=1)).isoformat(),
        )
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            "<",
            LiteralExpr(
                None,
                (util.parse_datetime(max(timestamps)) + timedelta(seconds=1)),
            ),
        )

        return runner(query, request_settings)
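To summarise the flow of this splitter in one place (a descriptive sketch, not code from the project):

# 1. Run minimal_query, selecting only event_id, project_id and timestamp.
# 2. If it returned data, narrow the original query with
#      event_id   IN (ids from step 1)
#      project_id IN (project ids from step 1)
#      timestamp >= min(ts)  AND  timestamp < max(ts) + 1s
#    on both the legacy and the AST representation, then run it through the same runner.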
Example #17
    def execute(
        self,
        query: Query,
        query_settings: QuerySettings,
        runner: SplitQueryRunner,
    ) -> Optional[QueryResult]:
        """
        Split the query in 2 steps if a large number of columns is being selected.
            - First query only selects event_id, project_id and timestamp.
            - Second query selects all fields for only those events.
            - Shrink the date range.
        """
        limit = query.get_limit()
        if (limit is None or limit == 0 or query.get_groupby()
                or not query.get_selected_columns()):
            return None

        if limit > settings.COLUMN_SPLIT_MAX_LIMIT:
            metrics.increment("column_splitter.query_above_limit")
            return None

        # Do not split if there is already a = or IN condition on an ID column
        id_column_matcher = FunctionCall(
            Or([String(ConditionFunctions.EQ),
                String(ConditionFunctions.IN)]),
            (
                Column(None, String(self.__id_column)),
                AnyExpression(),
            ),
        )

        for expr in query.get_condition() or []:
            match = id_column_matcher.match(expr)

            if match:
                return None

        # We need to count the number of table/column name pairs,
        # not the number of distinct Column objects in the query,
        # so as to avoid counting aliased columns multiple times.
        selected_columns = {
            (col.table_name, col.column_name)
            for col in query.get_columns_referenced_in_select()
        }

        if len(selected_columns) < settings.COLUMN_SPLIT_MIN_COLS:
            metrics.increment("column_splitter.main_query_min_threshold")
            return None

        minimal_query = copy.deepcopy(query)

        # TODO: provide the table alias name to this splitter if we ever use it
        # in joins.
        minimal_query.set_ast_selected_columns([
            SelectedExpression(
                self.__id_column,
                ColumnExpr(self.__id_column, None, self.__id_column),
            ),
            SelectedExpression(
                self.__project_column,
                ColumnExpr(self.__project_column, None, self.__project_column),
            ),
            SelectedExpression(
                self.__timestamp_column,
                ColumnExpr(self.__timestamp_column, None,
                           self.__timestamp_column),
            ),
        ])

        for exp in minimal_query.get_all_expressions():
            if exp.alias in (
                    self.__id_column,
                    self.__project_column,
                    self.__timestamp_column,
            ) and not (isinstance(exp, ColumnExpr)
                       and exp.column_name == exp.alias):
                logger.warning(
                    "Potential alias shadowing due to column splitter",
                    extra={"expression": exp},
                    exc_info=True,
                )

        # Ensures the AST minimal query is actually runnable on its own.
        if not minimal_query.validate_aliases():
            return None

        # There is a Clickhouse bug where if functions in the ORDER BY clause are not in the SELECT,
        # they fail on distributed tables. For that specific case, skip the query splitter.
        for orderby in minimal_query.get_orderby():
            if isinstance(orderby.expression,
                          (FunctionCallExpr, CurriedFunctionCallExpr)):
                metrics.increment("column_splitter.orderby_has_a_function")
                return None

        result = runner(minimal_query, query_settings)
        del minimal_query

        if not result.result["data"]:
            metrics.increment("column_splitter.no_data_from_minimal_query")
            return None

        # Making a copy just in case runner returned None (which would drive the execution
        # strategy to ignore the result of this splitter and try the next one).
        query = copy.deepcopy(query)

        event_ids = list(
            set([event[self.__id_column] for event in result.result["data"]]))
        if len(event_ids) > settings.COLUMN_SPLIT_MAX_RESULTS:
            # We may be running a query that is beyond the clickhouse maximum query size,
            # so we cowardly abandon.
            metrics.increment(
                "column_splitter.intermediate_results_beyond_limit")
            return None

        query.add_condition_to_ast(
            in_condition(
                ColumnExpr(None, None, self.__id_column),
                [LiteralExpr(None, e_id) for e_id in event_ids],
            ))
        query.set_offset(0)
        query.set_limit(len(result.result["data"]))

        project_ids = list(
            set([
                event[self.__project_column] for event in result.result["data"]
            ]))
        _replace_ast_condition(
            query,
            self.__project_column,
            "IN",
            literals_tuple(None,
                           [LiteralExpr(None, p_id) for p_id in project_ids]),
        )

        timestamps = [
            event[self.__timestamp_column] for event in result.result["data"]
        ]
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            ">=",
            LiteralExpr(None, util.parse_datetime(min(timestamps))),
        )
        # We add 1 second since this gets translated to ('timestamp', '<', to_date)
        # and events are stored with a granularity of 1 second.
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            "<",
            LiteralExpr(
                None,
                (util.parse_datetime(max(timestamps)) + timedelta(seconds=1)),
            ),
        )

        return runner(query, query_settings)
Example #18
    Pattern,
    String,
)

test_cases = [
    ("Literal match", Literal(None), LiteralExpr("random_alias", 1), MatchResult(),),
    (
        "Literal match with none type",
        Literal(Any(type(None))),
        LiteralExpr("alias", 1),
        None,
    ),
    (
        "Single node match",
        Column(OptionalString("table"), String("test_col")),
        ColumnExpr("alias_we_don't_care_of", "table", "test_col"),
        MatchResult(),
    ),
    (
        "Single node no match",
        Column(None, String("test_col")),
        ColumnExpr(None, None, "not_a_test_col"),
        None,
    ),
    (
        "Matches a None table name",
        Column(Param("table_name", AnyOptionalString()), None),
        ColumnExpr(None, None, "not_a_test_col"),
        MatchResult({"table_name": None}),
    ),
    (
Example #19
 def _produce_output(self, expression: ColumnExpr) -> ColumnExpr:
     return ColumnExpr(
         alias=expression.alias,
         table_name=self.to_table_name,
         column_name=self.to_col_name,
     )
Example #20
    Expression,
    Literal as LiteralExpr,
)
from snuba.query.validation import InvalidFunctionCall
from snuba.query.validation.signature import (
    Any,
    Column,
    Literal,
    ParamType,
    SignatureValidator,
)

test_cases = [
    pytest.param(
        (
            ColumnExpr(alias=None, table_name=None, column_name="event_id"),
            LiteralExpr(None, "param"),
        ),
        [Any(), Any()],
        False,
        False,
        id="Valid Expression",
    ),
    pytest.param(
        (
            ColumnExpr(alias=None, table_name=None, column_name="event_id"),
            LiteralExpr(None, "param"),
        ),
        [Column({String}), Any()],
        False,
        False,
Example #21
 (
     "Literal match",
     Literal(None),
     LiteralExpr("random_alias", 1),
     MatchResult(),
 ),
 (
     "Literal match with none type",
     Literal(Any(type(None))),
     LiteralExpr("alias", 1),
     None,
 ),
 (
     "Single node match",
     Column(OptionalString("table"), String("test_col")),
     ColumnExpr("alias_we_don't_care_of", "table", "test_col"),
     MatchResult(),
 ),
 (
     "Single node no match",
     Column(None, String("test_col")),
     ColumnExpr(None, None, "not_a_test_col"),
     None,
 ),
 (
     "Matches a None table name",
     Column(Param("table_name", AnyOptionalString()), None),
     ColumnExpr(None, None, "not_a_test_col"),
     MatchResult({"table_name": None}),
 ),
 (