Пример #1
0
    def __tag_expr(
        self,
        parsed_col: ParsedNestedColumn,
        table_alias: str = "",
    ) -> str:
        """
        Build the expression that yields the value of one named tag.

        A tag that has been "promoted" to a top level column resolves
        directly to that column; any other tag is looked up in the nested
        tags/contexts key/value arrays.
        """
        assert parsed_col.tag_name
        tag_name = parsed_col.tag_name
        column = parsed_col.col_name

        # Promoted tags: translate through the tag->column map and return
        # the (stringified) top level column.
        if column in self.__promoted_columns:
            mapped_tag = self.__get_tag_column_map()[column].get(tag_name, tag_name)
            if mapped_tag in self.__promoted_columns[column]:
                return qualified_column(self.__string_col(mapped_tag), table_alias)

        # Everything else: index into the nested key/value arrays.
        nested = qualified_column(column, table_alias)
        return "{col}.value[indexOf({col}.key, {tag})]".format(
            col=nested,
            tag=escape_literal(tag_name),
        )
Пример #2
0
    def __tag_expr(
        self,
        column_name: str,
        table_alias: str = "",
    ) -> str:
        """
        Return an expression for the value of a single named tag.

        For tags/contexts, we expand the expression depending on whether the tag is
        "promoted" to a top level column, or whether we have to look in the tags map.

        column_name must match NESTED_COL_EXPR_RE (e.g. a "tags[...]" style
        reference); table_alias, when non-empty, qualifies the columns.
        """
        # NOTE(review): match() returns None for a non-matching column_name,
        # which would raise AttributeError here. Callers appear to validate
        # the name against NESTED_COL_EXPR_RE first — confirm before relying
        # on this.
        col, tag = NESTED_COL_EXPR_RE.match(column_name).group(1, 2)
        # For promoted tags, return the column name.
        if col in self.__promoted_columns:
            actual_tag = self.__get_tag_column_map()[col].get(tag, tag)
            if actual_tag in self.__promoted_columns[col]:
                return qualified_column(self.__string_col(actual_tag),
                                        table_alias)

        # For the rest, return an expression that looks it up in the nested tags.
        # (u'' prefixes dropped: they are a Python 2 relic; keyword arguments
        # replace the needless **{...} dict unpacking.)
        return "{col}.value[indexOf({col}.key, {tag})]".format(
            col=qualified_column(col, table_alias),
            tag=escape_literal(tag),
        )
Пример #3
0
    def __tags_expr(
        self,
        column_name: str,
        query: Query,
        parsing_context: ParsingContext,
        table_alias: str = "",
    ) -> str:
        """
        Return an expression that array-joins on tags to produce an output with one
        row per tag.

        column_name is either "tags_key" or "tags_value"; query is inspected to
        see which of the two the query actually references; parsing_context
        caches the shared "all_tags" alias; table_alias optionally qualifies
        the columns.
        """
        # u'' prefixes and set([...]) below replaced with their Python 3
        # idiomatic forms; behavior is unchanged.
        assert column_name in ("tags_key", "tags_value")
        col, k_or_v = column_name.split("_", 1)
        nested_tags_only = state.get_config("nested_tags_only", 1)

        qualified_col = qualified_column(col, table_alias)
        # Generate parallel lists of keys and values to arrayJoin on
        if nested_tags_only:
            key_list = "{}.key".format(qualified_col)
            val_list = "{}.value".format(qualified_col)
        else:
            # Prepend the promoted tags (stored as top level columns) so they
            # appear in the output alongside the nested ones.
            promoted = self.__promoted_columns[col]
            col_map = self.__column_tag_map[col]
            key_list = "arrayConcat([{}], {}.key)".format(
                ", ".join("'{}'".format(col_map.get(p, p)) for p in promoted),
                qualified_col,
            )
            val_list = "arrayConcat([{}], {}.value)".format(
                ", ".join(self.__string_col(p) for p in promoted),
                qualified_col,
            )

        qualified_key = qualified_column("tags_key", table_alias)
        qualified_value = qualified_column("tags_value", table_alias)
        cols_used = query.get_all_referenced_columns() & {qualified_key, qualified_value}
        if len(cols_used) == 2:
            # If we use both tags_key and tags_value in this query, arrayjoin
            # on (key, value) tag tuples.
            expr = "arrayJoin(arrayMap((x,y) -> [x,y], {}, {}))".format(
                key_list,
                val_list,
            )

            # put the all_tags expression in the alias cache so we can use the alias
            # to refer to it next time (eg. 'all_tags[1] AS tags_key'). instead of
            # expanding the whole tags expression again.
            expr = alias_expr(expr, "all_tags", parsing_context)
            return "({})[{}]".format(expr, 1 if k_or_v == "key" else 2)
        else:
            # If we are only ever going to use one of tags_key or tags_value, don't
            # bother creating the k/v tuples to arrayJoin on, or the all_tags alias
            # to re-use as we won't need it.
            return "arrayJoin({})".format(key_list if k_or_v == "key" else val_list)
Пример #4
0
 def column_expr(self, column_name, query: Query, parsing_context: ParsingContext, table_alias: str=""):
     """
     Resolve special event column names: tag/context expressions go through
     the tags processor; 'issue'/'group_id' become a NULL-when-zero group id;
     'message' coalesces with search_message; anything else is delegated to
     the parent implementation.
     """
     # The processor returns a falsy value when column_name is not a
     # tag/context expression; in that case fall through to the other cases.
     tag_expr = self.__tags_processor.process_column_expression(column_name, query, parsing_context, table_alias)
     if tag_expr:
         return tag_expr
     if column_name in ('issue', 'group_id'):
         return f"nullIf({qualified_column('group_id', table_alias)}, 0)"
     if column_name == 'message':
         # Because of the rename from message->search_message without backfill,
         # records will have one or the other of these fields.
         # TODO this can be removed once all data has search_message filled in.
         search_message = qualified_column('search_message', table_alias)
         message = qualified_column('message', table_alias)
         return f"coalesce({search_message}, {message})"
     return super().column_expr(column_name, query, parsing_context, table_alias)
Пример #5
0
    def column_expr(
        self,
        column_name: str,
        query: Query,
        parsing_context: ParsingContext,
        table_alias: str = "",
    ) -> Union[None, Any]:
        """
        Resolve special column names: tag/context expressions (with a
        ''/'True'/'False' normalization for a few boolean device contexts),
        a NULL-when-zero group_id, the message/search_message coalesce, and
        otherwise the parent implementation.
        """
        tag_expr = self.__tags_processor.process_column_expression(
            column_name, query, parsing_context, table_alias)
        if tag_expr:
            # A falsy result means column_name was not a tag/context expression.

            # This conversion must not be ported to the errors dataset. We should
            # not support promoting tags/contexts with boolean values. There is
            # no way to convert them back consistently to the value provided by
            # the client when the event is ingested, in all ways to access
            # tags/contexts. Once the errors dataset is in use, we will not have
            # boolean promoted tags/contexts so this constraint will be easy to enforce.
            boolean_contexts = {
                "contexts[device.simulator]",
                "contexts[device.online]",
                "contexts[device.charging]",
            }
            if column_name not in boolean_contexts:
                return tag_expr
            boolean_context_template = (
                "multiIf(equals(%(processed_column)s, ''), '', "
                "in(%(processed_column)s, tuple('1', 'True')), 'True', 'False')"
            )
            return boolean_context_template % {"processed_column": tag_expr}

        if column_name == "group_id":
            return f"nullIf({qualified_column('group_id', table_alias)}, 0)"

        if column_name == "message":
            # Because of the rename from message->search_message without backfill,
            # records will have one or the other of these fields.
            # TODO this can be removed once all data has search_message filled in.
            search_message = qualified_column("search_message", table_alias)
            message = qualified_column("message", table_alias)
            return f"coalesce({search_message}, {message})"

        return super().column_expr(column_name, query, parsing_context, table_alias)
Пример #6
0
 def __time_expr(self, column_name: str, granularity: int, table_alias: str="") -> str:
     """
     Translate a time-group alias into a bucketing expression over its
     backing column (via self.__time_group_columns) at the requested
     granularity in seconds.
     """
     # Common granularities use the dedicated ClickHouse helpers; anything
     # else falls back to integer-division bucketing.
     templates = {
         86400: 'toDate({column})',
         3600: 'toStartOfHour({column})',
         60: 'toStartOfMinute({column})',
     }
     template = templates.get(
         granularity,
         'toDateTime(intDiv(toUInt32({column}), {granularity}) * {granularity})',
     )
     source = qualified_column(self.__time_group_columns[column_name], table_alias)
     return template.format(column=source, granularity=granularity)
Пример #7
0
 def column_expr(self,
                 column_name,
                 query: Query,
                 parsing_context: ParsingContext,
                 table_alias: str = ""):
     """
     Render column_name as an escaped, optionally table-qualified column
     expression. Subclasses override this to handle special aliases.
     """
     qualified = qualified_column(column_name, table_alias)
     return escape_col(qualified)
Пример #8
0
 def time_expr(self, column_name: str, granularity: int,
               table_alias: str) -> str:
     """
     Bucket the given timestamp column by granularity (in seconds), using
     the dedicated ClickHouse functions for minute/hour/day buckets and
     integer-division bucketing otherwise.
     """
     qualified = qualified_column(column_name, table_alias)
     fallback = "toDateTime(intDiv(toUInt32({column}), {granularity}) * {granularity})"
     per_granularity = {
         60: "toStartOfMinute({column})",
         3600: "toStartOfHour({column})",
         86400: "toDate({column})",
     }
     chosen = per_granularity.get(granularity, fallback)
     return chosen.format(column=qualified, granularity=granularity)
Пример #9
0
 def attempt_map(
     self, expression: Column, children_translator: SnubaClickhouseStrictTranslator,
 ) -> Optional[Literal]:
     """
     Map a column listed in self.columns onto a NULL literal, keeping the
     original alias (or the qualified column name when there is none).
     Returns None when this rule does not apply.
     """
     if expression.column_name not in self.columns:
         return None
     alias = expression.alias or qualified_column(
         expression.column_name, expression.table_name or ""
     )
     return Literal(alias=alias, value=None)
Пример #10
0
 def attempt_map(
     self,
     expression: Column,
     children_translator: SnubaClickhouseStrictTranslator,
 ) -> Optional[FunctionCall]:
     """
     Map a column listed in self.columns onto identity(NULL), keeping the
     original alias (or the qualified column name when there is none).
     Returns None when this rule does not apply.
     """
     if expression.column_name not in self.columns:
         return None
     alias = expression.alias or qualified_column(
         expression.column_name, expression.table_name or ""
     )
     return identity(Literal(None, None), alias)
Пример #11
0
    def __init__(self) -> None:
        """
        Wire up the joined dataset: a LEFT join from the groupedmessage
        table to the events table on (project_id, group id).
        """
        # Only the read schemas are needed: this joined dataset has no
        # write schema (see DatasetSchemas below).
        self.__grouped_message = get_dataset("groupedmessage")
        groupedmessage_source = (self.__grouped_message.get_dataset_schemas().
                                 get_read_schema().get_data_source())

        self.__events = get_dataset("events")
        events_source = (self.__events.get_dataset_schemas().get_read_schema().
                         get_data_source())

        join_structure = JoinClause(
            # Left side: groupedmessage, restricted to record_deleted = 0.
            left_node=TableJoinNode(
                table_name=groupedmessage_source.format_from(),
                columns=groupedmessage_source.get_columns(),
                mandatory_conditions=[
                    # TODO: This will be replaced as soon as expressions won't be strings
                    # thus we will be able to easily add an alias to a column in an
                    # expression.
                    (qualified_column("record_deleted",
                                      self.GROUPS_ALIAS), "=", 0)
                ],
                # Re-qualify each source prewhere candidate with this node's alias.
                prewhere_candidates=[
                    qualified_column(col, self.GROUPS_ALIAS)
                    for col in groupedmessage_source.get_prewhere_candidates()
                ],
                alias=self.GROUPS_ALIAS,
            ),
            # Right side: events, restricted to deleted = 0.
            right_node=TableJoinNode(
                table_name=events_source.format_from(),
                columns=events_source.get_columns(),
                mandatory_conditions=[
                    (qualified_column("deleted", self.EVENTS_ALIAS), "=", 0)
                ],
                prewhere_candidates=[
                    qualified_column(col, self.EVENTS_ALIAS)
                    for col in events_source.get_prewhere_candidates()
                ],
                alias=self.EVENTS_ALIAS,
            ),
            # Join keys: same project_id on both sides, and the events side
            # group_id pointing at the groups side id.
            mapping=[
                JoinCondition(
                    left=JoinConditionExpression(table_alias=self.GROUPS_ALIAS,
                                                 column="project_id"),
                    right=JoinConditionExpression(
                        table_alias=self.EVENTS_ALIAS, column="project_id"),
                ),
                JoinCondition(
                    left=JoinConditionExpression(table_alias=self.GROUPS_ALIAS,
                                                 column="id"),
                    right=JoinConditionExpression(
                        table_alias=self.EVENTS_ALIAS, column="group_id"),
                ),
            ],
            join_type=JoinType.LEFT,
        )

        schema = JoinedSchema(join_structure)
        # Read-only dataset: no write schema.
        dataset_schemas = DatasetSchemas(
            read_schema=schema,
            write_schema=None,
        )
        super().__init__(
            dataset_schemas=dataset_schemas,
            # "events.time" is a time-group alias over "events.timestamp".
            time_group_columns={"events.time": "events.timestamp"},
            time_parse_columns=[
                "events.timestamp",
                "events.received",
                "groups.last_seen",
                "groups.first_seen",
                "groups.active_at",
            ],
        )
Пример #12
0
    def __init__(self) -> None:
        """
        Wire up the joined entity: a LEFT join from the groupedmessages
        storage to the events storage on (project_id, group id).
        """
        self.__grouped_message = get_entity(EntityKey.GROUPEDMESSAGES)
        groupedmessage_source = (get_storage(
            StorageKey.GROUPEDMESSAGES).get_schema().get_data_source())

        self.__events = get_entity(EntityKey.EVENTS)
        events_source = get_storage(
            StorageKey.EVENTS).get_schema().get_data_source()

        join_structure = JoinClause(
            # Left side: groupedmessage, restricted to record_deleted = 0
            # (expressed as an AST condition rather than a string).
            left_node=TableJoinNode(
                table_name=groupedmessage_source.format_from(),
                columns=groupedmessage_source.get_columns(),
                mandatory_conditions=[
                    binary_condition(
                        None,
                        ConditionFunctions.EQ,
                        Column(None, self.GROUPS_ALIAS, "record_deleted"),
                        Literal(None, 0),
                    ),
                ],
                # Re-qualify each source prewhere candidate with this node's alias.
                prewhere_candidates=[
                    qualified_column(col, self.GROUPS_ALIAS)
                    for col in groupedmessage_source.get_prewhere_candidates()
                ],
                alias=self.GROUPS_ALIAS,
            ),
            # Right side: events, restricted to deleted = 0.
            right_node=TableJoinNode(
                table_name=events_source.format_from(),
                columns=events_source.get_columns(),
                mandatory_conditions=[
                    binary_condition(
                        None,
                        ConditionFunctions.EQ,
                        Column(None, self.EVENTS_ALIAS, "deleted"),
                        Literal(None, 0),
                    ),
                ],
                prewhere_candidates=[
                    qualified_column(col, self.EVENTS_ALIAS)
                    for col in events_source.get_prewhere_candidates()
                ],
                alias=self.EVENTS_ALIAS,
            ),
            # Join keys: same project_id on both sides, and the events side
            # group_id pointing at the groups side id.
            mapping=[
                JoinCondition(
                    left=JoinConditionExpression(table_alias=self.GROUPS_ALIAS,
                                                 column="project_id"),
                    right=JoinConditionExpression(
                        table_alias=self.EVENTS_ALIAS, column="project_id"),
                ),
                JoinCondition(
                    left=JoinConditionExpression(table_alias=self.GROUPS_ALIAS,
                                                 column="id"),
                    right=JoinConditionExpression(
                        table_alias=self.EVENTS_ALIAS, column="group_id"),
                ),
            ],
            join_type=JoinType.LEFT,
        )

        schema = JoinedSchema(join_structure)
        storage = JoinedStorage(StorageSetKey.EVENTS, join_structure)
        # Read-only entity: a single joined storage, no writable storage.
        super().__init__(
            storages=[storage],
            query_plan_builder=SingleStorageQueryPlanBuilder(storage=storage),
            abstract_column_set=schema.get_columns(),
            writable_storage=None,
        )
Пример #13
0
    def __tags_expr(
        self,
        parsed_col: ParsedNestedColumn,
        query: Query,
        parsing_context: ParsingContext,
        table_alias: str = "",
    ) -> str:
        """
        Return an expression that array-joins on tags to produce an output with one
        row per tag.

        It can also apply an arrayFilter in the arrayJoin if an equivalent condition
        is found in the query in order to reduce the size of the arrayJoin.
        """
        # col_name is expected to be "tags_key" or "tags_value", so this
        # split yields ("tags", "key") or ("tags", "value").
        col, k_or_v = parsed_col.col_name.split("_", 1)
        nested_tags_only = state.get_config("nested_tags_only", 1)

        qualified_col = qualified_column(col, table_alias)
        # Generate parallel lists of keys and values to arrayJoin on
        if nested_tags_only:
            key_list = "{}.key".format(qualified_col)
            val_list = "{}.value".format(qualified_col)
        else:
            # Prepend the promoted tags (stored as top level columns) so they
            # appear in the output alongside the nested ones.
            promoted = self.__promoted_columns[col]
            col_map = self.__column_tag_map[col]
            key_list = "arrayConcat([{}], {}.key)".format(
                ", ".join("'{}'".format(col_map.get(p, p)) for p in promoted),
                qualified_col,
            )
            val_list = "arrayConcat([{}], {}.value)".format(
                ", ".join(self.__string_col(p) for p in promoted),
                qualified_col)

        qualified_key = qualified_column("tags_key", table_alias)
        qualified_value = qualified_column("tags_value", table_alias)
        # Which of tags_key / tags_value does the query actually reference?
        cols_used = query.get_all_referenced_columns() & set(
            [qualified_key, qualified_value])

        # Tag keys the query filters on, rendered as quoted SQL literals;
        # used below to shrink the arrayJoin with an arrayFilter.
        filter_tags = ",".join(
            [f"'{tag}'" for tag in self.__get_filter_tags(query)])
        if len(cols_used) == 2:
            # If we use both tags_key and tags_value in this query, arrayjoin
            # on (key, value) tag tuples.
            mapping = f"arrayMap((x,y) -> [x,y], {key_list}, {val_list})"
            if filter_tags:
                # Keep only the pairs whose key (pair[1]) is a filtered tag.
                filtering = (
                    f"arrayFilter(pair -> pair[1] IN ({filter_tags}), {mapping})"
                )
            else:
                filtering = mapping

            expr = f"arrayJoin({filtering})"

            # put the all_tags expression in the alias cache so we can use the alias
            # to refer to it next time (eg. 'all_tags[1] AS tags_key'). instead of
            # expanding the whole tags expression again.
            expr = alias_expr(expr, "all_tags", parsing_context)
            return "({})[{}]".format(expr, 1 if k_or_v == "key" else 2)
        else:
            # If we are only ever going to use one of tags_key or tags_value, don't
            # bother creating the k/v tuples to arrayJoin on, or the all_tags alias
            # to re-use as we won't need it.
            if filter_tags:
                # NOTE(review): key_list is used unconditionally here; this is
                # presumably sound because filter tags derive from conditions on
                # tags_key, implying tags_key is the referenced column — confirm
                # against __get_filter_tags.
                return (
                    f"arrayJoin(arrayFilter(tag -> tag IN ({filter_tags}), {key_list}))"
                )
            else:
                return f"arrayJoin({key_list if k_or_v == 'key' else val_list})"
Пример #14
0
    def __init__(self) -> None:
        """
        Wire up the joined dataset: a LEFT join from the groupedmessage
        table to the events table on (project_id, group id).
        """
        # Only the read schemas are needed: this joined dataset has no
        # write schema (see DatasetSchemas below).
        self.__grouped_message = get_dataset("groupedmessage")
        groupedmessage_source = self.__grouped_message \
            .get_dataset_schemas() \
            .get_read_schema() \
            .get_data_source()

        self.__events = get_dataset("events")
        events_source = self.__events \
            .get_dataset_schemas() \
            .get_read_schema() \
            .get_data_source()

        join_structure = JoinClause(
            # Left side: groupedmessage, restricted to record_deleted = 0.
            left_node=TableJoinNode(
                table_name=groupedmessage_source.format_from(),
                columns=groupedmessage_source.get_columns(),
                mandatory_conditions=[
                    # TODO: This will be replaced as soon as expressions won't be strings
                    # thus we will be able to easily add an alias to a column in an
                    # expression.
                    (qualified_column('record_deleted',
                                      self.GROUPS_ALIAS), '=', 0)
                ],
                alias=self.GROUPS_ALIAS,
            ),
            # Right side: events, restricted to deleted = 0.
            right_node=TableJoinNode(
                table_name=events_source.format_from(),
                columns=events_source.get_columns(),
                mandatory_conditions=[
                    (qualified_column('deleted', self.EVENTS_ALIAS), '=', 0)
                ],
                alias=self.EVENTS_ALIAS,
            ),
            # Join keys: same project_id on both sides, and the events side
            # group_id pointing at the groups side id.
            mapping=[
                JoinCondition(
                    left=JoinConditionExpression(table_alias=self.GROUPS_ALIAS,
                                                 column="project_id"),
                    right=JoinConditionExpression(
                        table_alias=self.EVENTS_ALIAS, column="project_id"),
                ),
                JoinCondition(
                    left=JoinConditionExpression(table_alias=self.GROUPS_ALIAS,
                                                 column="id"),
                    right=JoinConditionExpression(
                        table_alias=self.EVENTS_ALIAS, column="group_id"),
                ),
            ],
            join_type=JoinType.LEFT,
        )

        schema = JoinedSchema(join_structure)
        # Read-only dataset: no write schema.
        dataset_schemas = DatasetSchemas(
            read_schema=schema,
            write_schema=None,
        )
        super().__init__(
            dataset_schemas=dataset_schemas,
            # 'events.time' is a time-group alias over 'events.timestamp'.
            time_group_columns={
                'events.time': 'events.timestamp',
            },
            time_parse_columns=['events.timestamp'],
        )
Пример #15
0
 def default_conditions(self, table_alias: str = "") -> Sequence[Condition]:
     """
     Conditions added to every query on this dataset: restrict to rows
     with deleted = 0.
     """
     deleted_col = qualified_column('deleted', table_alias)
     return [(deleted_col, '=', 0)]