Example #1
def detect_table(query: Query, events_only_columns: ColumnSet,
                 transactions_only_columns: ColumnSet) -> str:
    """
    Given a query, we attempt to guess whether it is better to fetch data from the
    "events" or "transactions" storage. This is going to be wrong in some cases.
    """
    # First check for a top-level condition that matches either
    # type = error or type = transaction.
    conditions = query.get_conditions()
    if conditions:
        for condition in conditions:
            if is_condition(condition):
                if tuple(condition) == ("type", "=", "error"):
                    return EVENTS
                elif tuple(condition) == ("type", "=", "transaction"):
                    return TRANSACTIONS

    # Check for any conditions that reference a table specific field
    condition_columns = query.get_columns_referenced_in_conditions()
    if any(events_only_columns.get(col) for col in condition_columns):
        return EVENTS
    if any(transactions_only_columns.get(col) for col in condition_columns):
        return TRANSACTIONS

    # Check for any other references to a table specific field
    all_referenced_columns = query.get_all_referenced_columns()
    if any(events_only_columns.get(col) for col in all_referenced_columns):
        return EVENTS
    if any(
            transactions_only_columns.get(col)
            for col in all_referenced_columns):
        return TRANSACTIONS

    # Use events by default
    return EVENTS
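
A minimal usage sketch of the function above. Everything here is a hypothetical stand-in defined just for the demo: EVENTS/TRANSACTIONS, is_condition, the StubQuery class, and plain dicts in place of ColumnSet (which works because detect_table only ever calls .get() on the column sets).

EVENTS, TRANSACTIONS = "events", "transactions"

def is_condition(cond):
    # Simplified stand-in: a flat (lhs, op, rhs) triple counts as a condition.
    return isinstance(cond, (list, tuple)) and len(cond) == 3

class StubQuery:
    def __init__(self, conditions, condition_columns=(), all_columns=()):
        self._conditions = conditions
        self._condition_columns = condition_columns
        self._all_columns = all_columns

    def get_conditions(self):
        return self._conditions

    def get_columns_referenced_in_conditions(self):
        return self._condition_columns

    def get_all_referenced_columns(self):
        return self._all_columns

events_only = {"group_id": True}        # events-only schema, trimmed
transactions_only = {"trace_id": True}  # transactions-only schema, trimmed

# A top-level type condition wins outright.
query = StubQuery([("type", "=", "transaction")])
assert detect_table(query, events_only, transactions_only) == TRANSACTIONS

# Without one, a table-specific column in a condition decides.
query = StubQuery([("trace_id", "=", "abc")], condition_columns=("trace_id",))
assert detect_table(query, events_only, transactions_only) == TRANSACTIONS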
Example #2
def track_bad_query(
    query: Query,
    selected_entity: EntityKey,
    events_only_columns: ColumnSet,
    transactions_only_columns: ColumnSet,
) -> None:
    event_columns = set()
    transaction_columns = set()
    for col in query.get_all_ast_referenced_columns():
        if events_only_columns.get(col.column_name):
            event_columns.add(col.column_name)
        elif transactions_only_columns.get(col.column_name):
            transaction_columns.add(col.column_name)

    for subscript in query.get_all_ast_referenced_subscripts():
        schema_col_name = subscript_key_column_name(subscript)
        if events_only_columns.get(schema_col_name):
            event_columns.add(schema_col_name)
        if transactions_only_columns.get(schema_col_name):
            transaction_columns.add(schema_col_name)

    event_mismatch = event_columns and selected_entity == TRANSACTIONS
    transaction_mismatch = transaction_columns and selected_entity in [
        EVENTS,
        EVENTS_AND_TRANSACTIONS,
    ]

    if event_mismatch or transaction_mismatch:
        missing_columns = ",".join(
            sorted(event_columns if event_mismatch else transaction_columns))
        selected_entity_str = (
            str(selected_entity.value)
            if isinstance(selected_entity, EntityKey)
            else selected_entity
        )

        metrics.increment(
            "query.impossible",
            tags={
                "selected_table": selected_entity_str,
                "missing_columns": missing_columns,
            },
        )

    if selected_entity == EVENTS_AND_TRANSACTIONS and (event_columns
                                                       or transaction_columns):
        # Not possible in future with merge table
        missing_events_columns = ",".join(sorted(event_columns))
        missing_transactions_columns = ",".join(sorted(transaction_columns))
        metrics.increment(
            "query.impossible-merge-table",
            tags={
                "missing_events_columns": missing_events_columns,
                "missing_transactions_columns": missing_transactions_columns,
            },
        )
    else:
        metrics.increment("query.success")
Example #3
    def validate(self, expression: Expression, schema: ColumnSet) -> None:
        match = COLUMN_PATTERN.match(expression)
        if match is None:
            return

        column_name = match.string("column_name")
        column = schema.get(column_name)
        if column is None:
            # TODO: We cannot raise exceptions if the column is not present
            # on the schema just yet because the current logical schemas are
            # sadly not complete. Fix them and then raise an exception in this
            # case.
            return

        nullable = column.type.has_modifier(Nullable)

        if not isinstance(column.type, tuple(self.__valid_types)) or (
            nullable and not self.__allow_nullable
        ):
            raise InvalidFunctionCall(
                f"Illegal type {'Nullable ' if nullable else ''}{column.type} "
                f"of argument `{column_name}`. Required types {self.__valid_types}"
            )
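
The guard reduces to a two-part acceptance rule. A hedged sketch with Snuba's type machinery abstracted away: column_type is a type instance, valid_types a collection of type classes, and nullability is passed in as a boolean rather than read off a modifier.

def accepts(column_type, nullable, valid_types, allow_nullable):
    # Mirrors the condition above: the type must be an instance of a valid
    # class, and Nullable passes only when the validator permits it.
    return isinstance(column_type, tuple(valid_types)) and (
        not nullable or allow_nullable
    )

assert accepts("x", nullable=False, valid_types=[str], allow_nullable=False)
assert not accepts("x", nullable=True, valid_types=[str], allow_nullable=False)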
Example #4
def match_query_to_entity(
    query: Query,
    events_only_columns: ColumnSet,
    transactions_only_columns: ColumnSet,
) -> EntityKey:
    # First check for a top level condition on the event type
    condition = query.get_condition_from_ast()
    event_types = set()
    if condition:
        top_level_conditions = get_first_level_and_conditions(condition)

        for cond in top_level_conditions:
            result = EVENT_CONDITION.match(cond)
            if not result:
                continue

            event_type_param = result.expression("event_type")

            if isinstance(event_type_param, Column):
                event_type = event_type_param.column_name
            elif isinstance(event_type_param, Literal):
                event_type = str(event_type_param.value)
            else:
                # The pattern should only capture a column or a literal;
                # skip anything else rather than leave event_type unbound.
                continue

            if result.string("function") == ConditionFunctions.EQ:
                event_types.add(event_type)
            elif result.string("function") == ConditionFunctions.NEQ:
                if event_type == "transaction":
                    return EVENTS

    if len(event_types) == 1 and "transaction" in event_types:
        return TRANSACTIONS

    if len(event_types) > 0 and "transaction" not in event_types:
        return EVENTS

    # If we cannot clearly pick an entity from the top level conditions, then
    # inspect the columns requested to infer a selection.
    has_event_columns = False
    has_transaction_columns = False
    for col in query.get_all_ast_referenced_columns():
        if events_only_columns.get(col.column_name):
            has_event_columns = True
        elif transactions_only_columns.get(col.column_name):
            has_transaction_columns = True

    for subscript in query.get_all_ast_referenced_subscripts():
        # Subscriptable references will not be properly recognized above
        # through get_all_ast_referenced_columns since the columns that
        # method will find will look like `tags` or `measurements`, while
        # the column sets contains `tags.key` and `tags.value`.
        schema_col_name = subscript_key_column_name(subscript)
        if events_only_columns.get(schema_col_name):
            has_event_columns = True
        if transactions_only_columns.get(schema_col_name):
            has_transaction_columns = True

    # Check for isHandled/notHandled
    if not has_event_columns:
        for expr in query.get_all_expressions():
            if EVENT_FUNCTIONS.match(expr):
                has_event_columns = True
                break

    # Check for apdex or failure rate
    if not has_transaction_columns:
        for expr in query.get_all_expressions():
            if TRANSACTION_FUNCTIONS.match(expr):
                has_transaction_columns = True
                break

    if has_event_columns and has_transaction_columns:
        # Impossible query, use the merge table
        return EVENTS_AND_TRANSACTIONS
    elif has_event_columns:
        return EVENTS
    elif has_transaction_columns:
        return TRANSACTIONS
    else:
        return EVENTS_AND_TRANSACTIONS
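
The top-level condition rules above can be summarized in a small decision function. A hedged sketch: eq_types/neq_types stand for the type literals the EVENT_CONDITION matcher would extract, and the constants are assumed stand-ins.

EVENTS, TRANSACTIONS = "events", "transactions"

def entity_from_type_conditions(eq_types, neq_types):
    if "transaction" in neq_types:
        return EVENTS                 # type != transaction
    if eq_types == {"transaction"}:
        return TRANSACTIONS           # only type = transaction seen
    if eq_types and "transaction" not in eq_types:
        return EVENTS                 # e.g. type = error
    return None                       # ambiguous: fall back to column inspection

assert entity_from_type_conditions({"transaction"}, set()) == TRANSACTIONS
assert entity_from_type_conditions({"error"}, set()) == EVENTS
assert entity_from_type_conditions(set(), set()) is None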
Example #5
class DiscoverEntity(Entity):
    """
    Entity for the Discover product that maps the columns of Events and
    Transactions into a standard format and sends a query to one of the 2 tables
    depending on the conditions detected.

    It is based on two storages. One for events and one for transactions.
    """

    def __init__(self) -> None:
        self.__common_columns = ColumnSet(
            [
                ("event_id", FixedString(32)),
                ("project_id", UInt(64)),
                ("type", Nullable(String())),
                ("timestamp", DateTime()),
                ("platform", Nullable(String())),
                ("environment", Nullable(String())),
                ("release", Nullable(String())),
                ("dist", Nullable(String())),
                ("user", Nullable(String())),
                ("transaction", Nullable(String())),
                ("message", Nullable(String())),
                ("title", Nullable(String())),
                # User
                ("user_id", Nullable(String())),
                ("username", Nullable(String())),
                ("email", Nullable(String())),
                ("ip_address", Nullable(String())),
                # SDK
                ("sdk_name", Nullable(String())),
                ("sdk_version", Nullable(String())),
                # geo location context
                ("geo_country_code", Nullable(String())),
                ("geo_region", Nullable(String())),
                ("geo_city", Nullable(String())),
                ("http_method", Nullable(String())),
                ("http_referer", Nullable(String())),
                # Other tags and context
                ("tags", Nested([("key", String()), ("value", String())])),
                ("contexts", Nested([("key", String()), ("value", String())])),
            ]
        )

        self.__events_columns = ColumnSet(
            [
                ("group_id", Nullable(UInt(64))),
                ("primary_hash", Nullable(FixedString(32))),
                # Promoted tags
                ("level", Nullable(String())),
                ("logger", Nullable(String())),
                ("server_name", Nullable(String())),
                ("site", Nullable(String())),
                ("url", Nullable(String())),
                ("search_message", Nullable(String())),
                ("location", Nullable(String())),
                ("culprit", Nullable(String())),
                ("received", Nullable(DateTime())),
                ("sdk_integrations", Nullable(Array(String()))),
                ("version", Nullable(String())),
                # exception interface
                (
                    "exception_stacks",
                    Nested(
                        [
                            ("type", Nullable(String())),
                            ("value", Nullable(String())),
                            ("mechanism_type", Nullable(String())),
                            ("mechanism_handled", Nullable(UInt(8))),
                        ]
                    ),
                ),
                (
                    "exception_frames",
                    Nested(
                        [
                            ("abs_path", Nullable(String())),
                            ("filename", Nullable(String())),
                            ("package", Nullable(String())),
                            ("module", Nullable(String())),
                            ("function", Nullable(String())),
                            ("in_app", Nullable(UInt(8))),
                            ("colno", Nullable(UInt(32))),
                            ("lineno", Nullable(UInt(32))),
                            ("stack_level", UInt(16)),
                        ]
                    ),
                ),
                ("modules", Nested([("name", String()), ("version", String())])),
            ]
        )

        self.__transactions_columns = ColumnSet(
            [
                ("trace_id", Nullable(UUID())),
                ("span_id", Nullable(UInt(64))),
                ("transaction_hash", Nullable(UInt(64))),
                ("transaction_op", Nullable(String())),
                ("transaction_status", Nullable(UInt(8))),
                ("duration", Nullable(UInt(32))),
                (
                    "measurements",
                    Nested([("key", LowCardinality(String())), ("value", Float(64))]),
                ),
            ]
        )

        events_storage = get_storage(StorageKey.EVENTS)
        events_ro_storage = get_storage(StorageKey.EVENTS_RO)
        transactions_storage = get_storage(StorageKey.TRANSACTIONS)

        self.__time_group_columns: Mapping[str, str] = {}
        self.__time_parse_columns = ("timestamp",)

        super().__init__(
            storages=[events_storage, transactions_storage],
            query_plan_builder=SelectedStorageQueryPlanBuilder(
                selector=DiscoverQueryStorageSelector(
                    events_table=events_storage,
                    events_ro_table=events_ro_storage,
                    abstract_events_columns=self.__events_columns,
                    transactions_table=transactions_storage,
                    abstract_transactions_columns=self.__transactions_columns,
                ),
            ),
            abstract_column_set=(
                self.__common_columns
                + self.__events_columns
                + self.__transactions_columns
            ),
            writable_storage=None,
        )

    def get_query_processors(self) -> Sequence[QueryProcessor]:
        columnset = self.get_data_model()
        return [
            TagsExpanderProcessor(),
            BasicFunctionsProcessor(),
            # Apdex and failure rate seem like strong candidates to be
            # defined by the Transaction entity once it exists, so they
            # would run before storage selection.
            apdex_processor(columnset),
            failure_rate_processor(columnset),
            HandledFunctionsProcessor("exception_stacks.mechanism_handled", columnset),
            TimeSeriesColumnProcessor({"time": "timestamp"}),
        ]

    def get_extensions(self) -> Mapping[str, QueryExtension]:
        return {
            "project": ProjectExtension(project_column="project_id"),
            "timeseries": TimeSeriesExtension(
                default_granularity=3600,
                default_window=timedelta(days=5),
                timestamp_column="timestamp",
            ),
        }

    def column_expr(
        self,
        column_name: str,
        query: Query,
        parsing_context: ParsingContext,
        table_alias: str = "",
    ) -> Optional[Any]:
        detected_entity = detect_table(
            query, self.__events_columns, self.__transactions_columns, False,
        )

        if detected_entity == TRANSACTIONS:
            if column_name == "group_id":
                # TODO: We return 0 here instead of NULL so conditions like group_id
                # in (1, 2, 3) will work, since Clickhouse won't run a query like:
                # SELECT (NULL AS group_id) FROM transactions WHERE group_id IN (1, 2, 3)
                # When we have the query AST, we should solve this by transforming the
                # nonsensical conditions instead.
                return "0"
            if self.__events_columns.get(column_name):
                return "NULL"
        else:
            if column_name == "release":
                column_name = "tags[sentry:release]"
            if column_name == "dist":
                column_name = "tags[sentry:dist]"
            if column_name == "user":
                column_name = "tags[sentry:user]"
            if self.__transactions_columns.get(column_name):
                return "NULL"

        return get_entity(detected_entity).column_expr(
            column_name, query, parsing_context
        )

    # TODO: This needs to be burned with fire, for so many reasons.
    # It's here now to reduce the scope of the initial entity changes
    # but can be moved to a processor if not removed entirely.
    def process_condition(
        self, condition: Tuple[str, str, Any]
    ) -> Tuple[str, str, Any]:
        lhs, op, lit = condition
        if (
            lhs in self.__time_parse_columns
            and op in (">", "<", ">=", "<=", "=", "!=")
            and isinstance(lit, str)
        ):
            lit = parse_datetime(lit)
        return lhs, op, lit
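
A hedged, standalone version of process_condition for illustration: string timestamps in comparison conditions are parsed into datetimes. dateutil's parser stands in for the real parse_datetime helper, which is an assumption.

from datetime import datetime
from dateutil.parser import parse as parse_datetime  # stand-in, see above

def process_condition(condition, time_parse_columns=("timestamp",)):
    lhs, op, lit = condition
    if (
        lhs in time_parse_columns
        and op in (">", "<", ">=", "<=", "=", "!=")
        and isinstance(lit, str)
    ):
        lit = parse_datetime(lit)  # string timestamps become datetimes
    return lhs, op, lit

assert process_condition(("timestamp", ">=", "2020-01-01"))[2] == datetime(2020, 1, 1)
assert process_condition(("duration", ">", "100"))[2] == "100"  # left untouched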
Example #6
def detect_table(
    query: Query,
    events_only_columns: ColumnSet,
    transactions_only_columns: ColumnSet,
    track_bad_queries: bool,
) -> EntityKey:
    """
    Given a query, we attempt to guess whether it is better to fetch data from the
    "events", "transactions" or future merged storage.

    The merged storage resolves to the events storage until errors and transactions
    are split into separate physical tables.
    """
    selected_table = match_query_to_table(
        query, events_only_columns, transactions_only_columns
    )

    if track_bad_queries:
        event_columns = set()
        transaction_columns = set()
        for col in query.get_all_ast_referenced_columns():
            if events_only_columns.get(col.column_name):
                event_columns.add(col.column_name)
            elif transactions_only_columns.get(col.column_name):
                transaction_columns.add(col.column_name)

        for subscript in query.get_all_ast_referenced_subscripts():
            schema_col_name = subscript_key_column_name(subscript)
            if events_only_columns.get(schema_col_name):
                event_columns.add(schema_col_name)
            if transactions_only_columns.get(schema_col_name):
                transaction_columns.add(schema_col_name)

        event_mismatch = event_columns and selected_table == TRANSACTIONS
        transaction_mismatch = transaction_columns and selected_table in [
            EVENTS,
            EVENTS_AND_TRANSACTIONS,
        ]

        if event_mismatch or transaction_mismatch:
            missing_columns = ",".join(
                sorted(event_columns if event_mismatch else transaction_columns)
            )
            metrics.increment(
                "query.impossible",
                tags={
                    "selected_table": (
                        str(selected_table.value)
                        if isinstance(selected_table, EntityKey)
                        else selected_table
                    ),
                    "missing_columns": missing_columns,
                },
            )
            logger.warning("Discover generated impossible query", exc_info=True)

        if selected_table == EVENTS_AND_TRANSACTIONS and (
            event_columns or transaction_columns
        ):
            # Not possible in future with merge table
            metrics.increment(
                "query.impossible-merge-table",
                tags={
                    "missing_events_columns": ",".join(sorted(event_columns)),
                    "missing_transactions_columns": ",".join(
                        sorted(transaction_columns)
                    ),
                },
            )
        else:
            metrics.increment("query.success")

    # Until the merge table exists, anything that is not clearly a
    # transactions query defaults to the events table.
    final_table = (
        EntityKey.EVENTS if selected_table != TRANSACTIONS else EntityKey.TRANSACTIONS
    )
    return final_table
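
A hedged sketch of that fallback; the trimmed EntityKey enum is an assumption standing in for the real one.

from enum import Enum

class EntityKey(Enum):  # trimmed stand-in for the real enum
    EVENTS = "events"
    TRANSACTIONS = "transactions"
    DISCOVER = "discover"  # the merged events + transactions entity

TRANSACTIONS = EntityKey.TRANSACTIONS

for selected_table in EntityKey:
    final_table = (
        EntityKey.EVENTS if selected_table != TRANSACTIONS else EntityKey.TRANSACTIONS
    )
    # EVENTS -> EVENTS, TRANSACTIONS -> TRANSACTIONS, DISCOVER -> EVENTS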
Example #7
class DiscoverDataset(TimeSeriesDataset):
    """
    Dataset for the Discover product that maps the columns of Events and
    Transactions into a standard format and sends a query to one of the 2 tables
    depending on the conditions detected.

    It is based on two storages. One for events and one for transactions.
    """
    def __init__(self) -> None:
        self.__common_columns = ColumnSet([
            ("event_id", FixedString(32)),
            ("project_id", UInt(64)),
            ("type", Nullable(String())),
            ("timestamp", DateTime()),
            ("platform", Nullable(String())),
            ("environment", Nullable(String())),
            ("release", Nullable(String())),
            ("dist", Nullable(String())),
            ("user", Nullable(String())),
            ("transaction", Nullable(String())),
            ("message", Nullable(String())),
            ("title", Nullable(String())),
            # User
            ("user_id", Nullable(String())),
            ("username", Nullable(String())),
            ("email", Nullable(String())),
            ("ip_address", Nullable(String())),
            # SDK
            ("sdk_name", Nullable(String())),
            ("sdk_version", Nullable(String())),
            # geo location context
            ("geo_country_code", Nullable(String())),
            ("geo_region", Nullable(String())),
            ("geo_city", Nullable(String())),
            # Other tags and context
            ("tags", Nested([("key", String()), ("value", String())])),
            ("contexts", Nested([("key", String()), ("value", String())])),
        ])

        self.__events_columns = ColumnSet([
            ("group_id", Nullable(UInt(64))),
            ("primary_hash", Nullable(FixedString(32))),
            # Promoted tags
            ("level", Nullable(String())),
            ("logger", Nullable(String())),
            ("server_name", Nullable(String())),
            ("site", Nullable(String())),
            ("url", Nullable(String())),
            ("search_message", Nullable(String())),
            ("location", Nullable(String())),
            ("culprit", Nullable(String())),
            ("received", Nullable(DateTime())),
            ("sdk_integrations", Nullable(Array(String()))),
            ("version", Nullable(String())),
            ("http_method", Nullable(String())),
            ("http_referer", Nullable(String())),
            # exception interface
            (
                "exception_stacks",
                Nested([
                    ("type", Nullable(String())),
                    ("value", Nullable(String())),
                    ("mechanism_type", Nullable(String())),
                    ("mechanism_handled", Nullable(UInt(8))),
                ]),
            ),
            (
                "exception_frames",
                Nested([
                    ("abs_path", Nullable(String())),
                    ("filename", Nullable(String())),
                    ("package", Nullable(String())),
                    ("module", Nullable(String())),
                    ("function", Nullable(String())),
                    ("in_app", Nullable(UInt(8))),
                    ("colno", Nullable(UInt(32))),
                    ("lineno", Nullable(UInt(32))),
                    ("stack_level", UInt(16)),
                ]),
            ),
            ("modules", Nested([("name", String()), ("version", String())])),
        ])

        self.__transactions_columns = ColumnSet([
            ("trace_id", Nullable(UUID())),
            ("span_id", Nullable(UInt(64))),
            ("transaction_hash", Nullable(UInt(64))),
            ("transaction_op", Nullable(String())),
            ("transaction_status", Nullable(UInt(8))),
            ("duration", Nullable(UInt(32))),
        ])

        events_storage = get_storage("events")
        transactions_storage = get_storage("transactions")

        super().__init__(
            storages=[events_storage, transactions_storage],
            query_plan_builder=SelectedStorageQueryPlanBuilder(
                selector=DiscoverQueryStorageSelector(
                    events_table=events_storage,
                    abstract_events_columns=self.__events_columns,
                    transactions_table=transactions_storage,
                    abstract_transactions_columns=self.__transactions_columns,
                ),
            ),
            abstract_column_set=(self.__common_columns +
                                 self.__events_columns +
                                 self.__transactions_columns),
            writable_storage=None,
            time_group_columns={},
            time_parse_columns=["timestamp"],
        )

    def get_query_processors(self) -> Sequence[QueryProcessor]:
        return [
            BasicFunctionsProcessor(),
            # Apdex and Impact seem like strong candidates to be defined
            # by the Transaction entity once it exists, so they would run
            # before storage selection.
            ApdexProcessor(),
            ImpactProcessor(),
            TimeSeriesColumnProcessor({}),
        ]

    def get_extensions(self) -> Mapping[str, QueryExtension]:
        return {
            "project": ProjectExtension(
                processor=ProjectWithGroupsProcessor(
                    project_column="project_id",
                    replacer_state_name=None,
                )
            ),
            "timeseries": TimeSeriesExtension(
                default_granularity=3600,
                default_window=timedelta(days=5),
                timestamp_column="timestamp",
            ),
        }

    def column_expr(
        self,
        column_name: str,
        query: Query,
        parsing_context: ParsingContext,
        table_alias: str = "",
    ) -> Optional[Any]:
        detected_dataset = detect_table(query, self.__events_columns,
                                        self.__transactions_columns)

        if detected_dataset == TRANSACTIONS:
            if column_name == "time":
                return self.time_expr("finish_ts", query.get_granularity(),
                                      table_alias)
            if column_name == "type":
                return "'transaction'"
            if column_name == "timestamp":
                return "finish_ts"
            if column_name == "username":
                return "user_name"
            if column_name == "email":
                return "user_email"
            if column_name == "transaction":
                return "transaction_name"
            if column_name == "message":
                return "transaction_name"
            if column_name == "title":
                return "transaction_name"
            if column_name == "group_id":
                # TODO: We return 0 here instead of NULL so conditions like group_id
                # in (1, 2, 3) will work, since Clickhouse won't run a query like:
                # SELECT (NULL AS group_id) FROM transactions WHERE group_id IN (1, 2, 3)
                # When we have the query AST, we should solve this by transforming the
                # nonsensical conditions instead.
                return "0"
            if column_name == "geo_country_code":
                column_name = "contexts[geo.country_code]"
            if column_name == "geo_region":
                column_name = "contexts[geo.region]"
            if column_name == "geo_city":
                column_name = "contexts[geo.city]"
            if self.__events_columns.get(column_name):
                return "NULL"
        else:
            if column_name == "time":
                return self.time_expr("timestamp", query.get_granularity(),
                                      table_alias)
            if column_name == "release":
                column_name = "tags[sentry:release]"
            if column_name == "dist":
                column_name = "tags[sentry:dist]"
            if column_name == "user":
                column_name = "tags[sentry:user]"
            if self.__transactions_columns.get(column_name):
                return "NULL"

        return get_dataset(detected_dataset).column_expr(
            column_name, query, parsing_context)
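
A hedged illustration of the transaction-side remapping above, as plain data with a hypothetical helper (no Snuba imports): shared logical columns are renamed to their physical counterparts, and events-only columns collapse to NULL.

TRANSACTION_RENAMES = {
    "timestamp": "finish_ts",
    "username": "user_name",
    "email": "user_email",
    "transaction": "transaction_name",
    "message": "transaction_name",
    "title": "transaction_name",
}
EVENTS_ONLY = {"group_id", "primary_hash", "level"}  # trimmed for the sketch

def transactions_expr(column_name: str) -> str:
    # Hypothetical helper mirroring the transactions branch of column_expr.
    if column_name == "type":
        return "'transaction'"
    if column_name == "group_id":
        return "0"  # see the TODO above: 0 keeps IN conditions valid
    if column_name in EVENTS_ONLY:
        return "NULL"
    return TRANSACTION_RENAMES.get(column_name, column_name)

assert transactions_expr("title") == "transaction_name"
assert transactions_expr("level") == "NULL"
assert transactions_expr("duration") == "duration"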