def detect_table(
    query: Query,
    events_only_columns: ColumnSet,
    transactions_only_columns: ColumnSet,
) -> str:
    """
    Guess whether a query is better served by the "events" or the
    "transactions" storage.

    The decision is a heuristic and will be wrong for some queries. Checks
    are applied in this order, the first hit wins:

    1. a top level condition that is exactly ``type = error`` (events) or
       ``type = transaction`` (transactions);
    2. a column referenced in the conditions that belongs to only one table;
    3. any other referenced column that belongs to only one table.

    When nothing is conclusive, events is returned.
    """

    def _table_for(columns):
        # Events-only columns take precedence over transactions-only ones.
        if any(events_only_columns.get(name) for name in columns):
            return EVENTS
        if any(transactions_only_columns.get(name) for name in columns):
            return TRANSACTIONS
        return None

    # 1. Explicit condition on the event type.
    for candidate in query.get_conditions() or ():
        if not is_condition(candidate):
            continue
        as_tuple = tuple(candidate)
        if as_tuple == ("type", "=", "error"):
            return EVENTS
        if as_tuple == ("type", "=", "transaction"):
            return TRANSACTIONS

    # 2. Table specific columns used in the conditions.
    verdict = _table_for(query.get_columns_referenced_in_conditions())
    if verdict is None:
        # 3. Table specific columns used anywhere else in the query.
        verdict = _table_for(query.get_all_referenced_columns())

    # Events is the default.
    return EVENTS if verdict is None else verdict
def track_bad_query(
    query: Query,
    selected_entity: EntityKey,
    events_only_columns: ColumnSet,
    transactions_only_columns: ColumnSet,
) -> None:
    """
    Emit metrics describing whether the entity selected for a query can
    actually provide the entity-specific columns the query references.

    Counters incremented:

    - ``query.impossible`` when events-only columns are referenced while
      TRANSACTIONS is selected, or transactions-only columns are referenced
      while EVENTS or EVENTS_AND_TRANSACTIONS is selected;
    - ``query.impossible-merge-table`` when EVENTS_AND_TRANSACTIONS is
      selected and any entity-specific column is referenced;
    - ``query.success`` otherwise (see the note near the end).
    """
    event_columns = set()
    transaction_columns = set()

    # Plain column references: a name is counted for at most one entity.
    for col in query.get_all_ast_referenced_columns():
        name = col.column_name
        if events_only_columns.get(name):
            event_columns.add(name)
        elif transactions_only_columns.get(name):
            transaction_columns.add(name)

    # Subscript references are looked up through their key column name and
    # are checked against both column sets independently.
    for subscript in query.get_all_ast_referenced_subscripts():
        schema_col_name = subscript_key_column_name(subscript)
        if events_only_columns.get(schema_col_name):
            event_columns.add(schema_col_name)
        if transactions_only_columns.get(schema_col_name):
            transaction_columns.add(schema_col_name)

    event_mismatch = bool(event_columns) and selected_entity == TRANSACTIONS
    transaction_mismatch = bool(transaction_columns) and selected_entity in (
        EVENTS,
        EVENTS_AND_TRANSACTIONS,
    )

    if event_mismatch or transaction_mismatch:
        offending = event_columns if event_mismatch else transaction_columns
        missing_columns = ",".join(sorted(offending))
        if isinstance(selected_entity, EntityKey):
            selected_entity_str = str(selected_entity.value)
        else:
            selected_entity_str = selected_entity
        metrics.increment(
            "query.impossible",
            tags={
                "selected_table": selected_entity_str,
                "missing_columns": missing_columns,
            },
        )

    uses_specific_columns = event_columns or transaction_columns
    if selected_entity == EVENTS_AND_TRANSACTIONS and uses_specific_columns:
        # Not possible in future with merge table
        metrics.increment(
            "query.impossible-merge-table",
            tags={
                "missing_events_columns": ",".join(sorted(event_columns)),
                "missing_transactions_columns": ",".join(
                    sorted(transaction_columns)
                ),
            },
        )
    else:
        # NOTE(review): this branch only depends on the merge-table check, so
        # "query.success" can be emitted together with "query.impossible"
        # above - confirm that is intended.
        metrics.increment("query.success")
def validate(self, expression: Expression, schema: ColumnSet) -> None:
    """
    Check that the column referenced by `expression` has an accepted type.

    Nothing is checked when `expression` does not match COLUMN_PATTERN or
    when the column it names is not found in `schema`.

    Raises InvalidFunctionCall when the column type is not an instance of
    one of the valid types, or when the column is Nullable and nullable
    columns are not allowed.
    """
    matched = COLUMN_PATTERN.match(expression)
    if matched is None:
        return

    column_name = matched.string("column_name")
    column = schema.get(column_name)
    if column is None:
        # TODO: We cannot raise exceptions if the column is not present
        # on the schema just yet because the current logical schemas are
        # sadly not complete. Fix them and then raise an exception in this
        # case.
        return

    nullable = column.type.has_modifier(Nullable)
    if isinstance(column.type, tuple(self.__valid_types)) and (
        not nullable or self.__allow_nullable
    ):
        # Accepted type and acceptable nullability: nothing to report.
        return

    message = (
        f"Illegal type {'Nullable ' if nullable else ''}{str(column.type)} "
        f"of argument `{column_name}`. Required types {self.__valid_types}"
    )
    raise InvalidFunctionCall(message)
def match_query_to_entity(
    query: Query,
    events_only_columns: ColumnSet,
    transactions_only_columns: ColumnSet,
) -> EntityKey:
    """
    Pick the entity a Discover query should run against.

    The decision is taken in two stages:

    1. Top level (AND-ed) conditions on the event type. An inequality
       against "transaction" selects EVENTS right away. Otherwise the
       equality values are collected: exactly {"transaction"} selects
       TRANSACTIONS, any non-empty set without "transaction" selects EVENTS.
    2. If that is inconclusive, the referenced columns, subscripts and
       entity-specific functions are inspected. Only event references
       select EVENTS, only transaction references select TRANSACTIONS, and
       both or neither select EVENTS_AND_TRANSACTIONS.

    :param query: the query to inspect.
    :param events_only_columns: columns that exist only for events.
    :param transactions_only_columns: columns that exist only for
        transactions.
    :return: EVENTS, TRANSACTIONS or EVENTS_AND_TRANSACTIONS.
    """
    # First check for a top level condition on the event type
    condition = query.get_condition_from_ast()
    event_types = set()
    if condition:
        for cond in get_first_level_and_conditions(condition):
            result = EVENT_CONDITION.match(cond)
            if not result:
                continue

            event_type_param = result.expression("event_type")
            if isinstance(event_type_param, Column):
                event_type = event_type_param.column_name
            elif isinstance(event_type_param, Literal):
                event_type = str(event_type_param.value)
            else:
                # Neither a column nor a literal: there is no event type to
                # read. Skip the condition instead of reusing the value left
                # over from a previous iteration (or failing on an unbound
                # local on the first one).
                continue

            function = result.string("function")
            if function == ConditionFunctions.EQ:
                event_types.add(event_type)
            elif function == ConditionFunctions.NEQ:
                if event_type == "transaction":
                    return EVENTS

    if event_types == {"transaction"}:
        return TRANSACTIONS

    if event_types and "transaction" not in event_types:
        return EVENTS

    # If we cannot clearly pick an entity from the top level conditions, then
    # inspect the columns requested to infer a selection.
    has_event_columns = False
    has_transaction_columns = False
    for col in query.get_all_ast_referenced_columns():
        if events_only_columns.get(col.column_name):
            has_event_columns = True
        elif transactions_only_columns.get(col.column_name):
            has_transaction_columns = True

    for subscript in query.get_all_ast_referenced_subscripts():
        # Subscriptable references will not be properly recognized above
        # through get_all_ast_referenced_columns since the columns that
        # method will find will look like `tags` or `measurements`, while
        # the column sets contains `tags.key` and `tags.value`.
        schema_col_name = subscript_key_column_name(subscript)
        if events_only_columns.get(schema_col_name):
            has_event_columns = True
        if transactions_only_columns.get(schema_col_name):
            has_transaction_columns = True

    # Check for isHandled/notHandled. any() stops at the first match, one
    # match is all that is needed.
    if not has_event_columns:
        has_event_columns = any(
            EVENT_FUNCTIONS.match(expr) for expr in query.get_all_expressions()
        )

    # Check for apdex or failure rate
    if not has_transaction_columns:
        has_transaction_columns = any(
            TRANSACTION_FUNCTIONS.match(expr)
            for expr in query.get_all_expressions()
        )

    if has_event_columns and has_transaction_columns:
        # Impossible query, use the merge table
        return EVENTS_AND_TRANSACTIONS
    if has_event_columns:
        return EVENTS
    if has_transaction_columns:
        return TRANSACTIONS
    return EVENTS_AND_TRANSACTIONS
class DiscoverEntity(Entity):
    """
    Entity for the Discover product. It presents the columns of Events and
    Transactions under one standard schema and sends each query to one of
    the two tables depending on what is detected in the query.

    It relies on two storages: one for events and one for transactions.
    """

    def __init__(self) -> None:
        # Columns exposed whichever table ends up being queried.
        self.__common_columns = ColumnSet(
            [
                ("event_id", FixedString(32)),
                ("project_id", UInt(64)),
                ("type", Nullable(String())),
                ("timestamp", DateTime()),
                ("platform", Nullable(String())),
                ("environment", Nullable(String())),
                ("release", Nullable(String())),
                ("dist", Nullable(String())),
                ("user", Nullable(String())),
                ("transaction", Nullable(String())),
                ("message", Nullable(String())),
                ("title", Nullable(String())),
                # User
                ("user_id", Nullable(String())),
                ("username", Nullable(String())),
                ("email", Nullable(String())),
                ("ip_address", Nullable(String())),
                # SDK
                ("sdk_name", Nullable(String())),
                ("sdk_version", Nullable(String())),
                # geo location context
                ("geo_country_code", Nullable(String())),
                ("geo_region", Nullable(String())),
                ("geo_city", Nullable(String())),
                ("http_method", Nullable(String())),
                ("http_referer", Nullable(String())),
                # Other tags and context
                ("tags", Nested([("key", String()), ("value", String())])),
                ("contexts", Nested([("key", String()), ("value", String())])),
            ]
        )

        # Columns that only make sense for events.
        self.__events_columns = ColumnSet(
            [
                ("group_id", Nullable(UInt(64))),
                ("primary_hash", Nullable(FixedString(32))),
                # Promoted tags
                ("level", Nullable(String())),
                ("logger", Nullable(String())),
                ("server_name", Nullable(String())),
                ("site", Nullable(String())),
                ("url", Nullable(String())),
                ("search_message", Nullable(String())),
                ("location", Nullable(String())),
                ("culprit", Nullable(String())),
                ("received", Nullable(DateTime())),
                ("sdk_integrations", Nullable(Array(String()))),
                ("version", Nullable(String())),
                # exception interface
                (
                    "exception_stacks",
                    Nested(
                        [
                            ("type", Nullable(String())),
                            ("value", Nullable(String())),
                            ("mechanism_type", Nullable(String())),
                            ("mechanism_handled", Nullable(UInt(8))),
                        ]
                    ),
                ),
                (
                    "exception_frames",
                    Nested(
                        [
                            ("abs_path", Nullable(String())),
                            ("filename", Nullable(String())),
                            ("package", Nullable(String())),
                            ("module", Nullable(String())),
                            ("function", Nullable(String())),
                            ("in_app", Nullable(UInt(8))),
                            ("colno", Nullable(UInt(32))),
                            ("lineno", Nullable(UInt(32))),
                            ("stack_level", UInt(16)),
                        ]
                    ),
                ),
                ("modules", Nested([("name", String()), ("version", String())])),
            ]
        )

        # Columns that only make sense for transactions.
        self.__transactions_columns = ColumnSet(
            [
                ("trace_id", Nullable(UUID())),
                ("span_id", Nullable(UInt(64))),
                ("transaction_hash", Nullable(UInt(64))),
                ("transaction_op", Nullable(String())),
                ("transaction_status", Nullable(UInt(8))),
                ("duration", Nullable(UInt(32))),
                (
                    "measurements",
                    Nested([("key", LowCardinality(String())), ("value", Float(64))]),
                ),
            ]
        )

        events_storage = get_storage(StorageKey.EVENTS)
        events_ro_storage = get_storage(StorageKey.EVENTS_RO)
        transactions_storage = get_storage(StorageKey.TRANSACTIONS)

        self.__time_group_columns: Mapping[str, str] = {}
        self.__time_parse_columns = ("timestamp",)

        # The selector is handed both entity-specific column sets, plus the
        # read-only events storage in addition to the two registered below.
        storage_selector = DiscoverQueryStorageSelector(
            events_table=events_storage,
            events_ro_table=events_ro_storage,
            abstract_events_columns=self.__events_columns,
            transactions_table=transactions_storage,
            abstract_transactions_columns=self.__transactions_columns,
        )
        super().__init__(
            storages=[events_storage, transactions_storage],
            query_plan_builder=SelectedStorageQueryPlanBuilder(
                selector=storage_selector
            ),
            abstract_column_set=(
                self.__common_columns
                + self.__events_columns
                + self.__transactions_columns
            ),
            writable_storage=None,
        )

    def get_query_processors(self) -> Sequence[QueryProcessor]:
        """Return the query processors of this entity, in list order."""
        columnset = self.get_data_model()
        processors = [
            TagsExpanderProcessor(),
            BasicFunctionsProcessor(),
            # Apdex and Impact seem very good candidates for
            # being defined by the Transaction entity when it will
            # exist, so it would run before Storage selection.
            apdex_processor(columnset),
            failure_rate_processor(columnset),
            HandledFunctionsProcessor("exception_stacks.mechanism_handled", columnset),
            TimeSeriesColumnProcessor({"time": "timestamp"}),
        ]
        return processors

    def get_extensions(self) -> Mapping[str, QueryExtension]:
        """Return the "project" and "timeseries" query extensions."""
        project_extension = ProjectExtension(project_column="project_id")
        timeseries_extension = TimeSeriesExtension(
            default_granularity=3600,
            default_window=timedelta(days=5),
            timestamp_column="timestamp",
        )
        return {
            "project": project_extension,
            "timeseries": timeseries_extension,
        }

    def column_expr(
        self,
        column_name: str,
        query: Query,
        parsing_context: ParsingContext,
        table_alias: str = "",
    ) -> Union[None, Any]:
        """
        Return the expression for `column_name` in the entity detected for
        `query`.

        Columns that belong only to the other entity resolve to "NULL"
        ("0" for `group_id` on transactions). Everything else is delegated
        to the detected entity.
        """
        detected_entity = detect_table(
            query, self.__events_columns, self.__transactions_columns, False,
        )

        if detected_entity == TRANSACTIONS:
            if column_name == "group_id":
                # TODO: We return 0 here instead of NULL so conditions like group_id
                # in (1, 2, 3) will work, since Clickhouse won't run a query like:
                # SELECT (NULL AS group_id) FROM transactions WHERE group_id IN (1, 2, 3)
                # When we have the query AST, we should solve this by transforming the
                # nonsensical conditions instead.
                return "0"
            if self.__events_columns.get(column_name):
                return "NULL"
        else:
            # These names are rewritten to their tag form when not querying
            # transactions.
            promoted_tags = {
                "release": "tags[sentry:release]",
                "dist": "tags[sentry:dist]",
                "user": "tags[sentry:user]",
            }
            column_name = promoted_tags.get(column_name, column_name)
            if self.__transactions_columns.get(column_name):
                return "NULL"

        delegate = get_entity(detected_entity)
        return delegate.column_expr(column_name, query, parsing_context)

    # TODO: This needs to be burned with fire, for so many reasons.
    # It's here now to reduce the scope of the initial entity changes
    # but can be moved to a processor if not removed entirely.
    def process_condition(
        self, condition: Tuple[str, str, Any]
    ) -> Tuple[str, str, Any]:
        """
        Parse the string literal of a comparison on a time column into a
        datetime; any other condition is returned with its parts unchanged.
        """
        lhs, op, lit = condition
        comparison_ops = (">", "<", ">=", "<=", "=", "!=")
        needs_parsing = (
            lhs in self.__time_parse_columns
            and op in comparison_ops
            and isinstance(lit, str)
        )
        if needs_parsing:
            return lhs, op, parse_datetime(lit)
        return lhs, op, lit
def detect_table(
    query: Query,
    events_only_columns: ColumnSet,
    transactions_only_columns: ColumnSet,
    track_bad_queries: bool,
) -> EntityKey:
    """
    Guess which of the "events", "transactions" or future merged storage is
    the better source for a query, and return the entity to use.

    The merged storage resolves to the events storage until errors and
    transactions are split into separate physical tables, so the result is
    EntityKey.TRANSACTIONS only when transactions was selected and
    EntityKey.EVENTS in every other case.

    When `track_bad_queries` is true, metrics are emitted about queries that
    reference columns the selected table cannot provide.
    """
    selected_table = match_query_to_table(
        query, events_only_columns, transactions_only_columns
    )

    if track_bad_queries:
        event_columns = set()
        transaction_columns = set()

        # Plain column references: a name is counted for at most one table.
        for col in query.get_all_ast_referenced_columns():
            name = col.column_name
            if events_only_columns.get(name):
                event_columns.add(name)
            elif transactions_only_columns.get(name):
                transaction_columns.add(name)

        # Subscripts are looked up through their key column name and are
        # checked against both column sets independently.
        for subscript in query.get_all_ast_referenced_subscripts():
            schema_col_name = subscript_key_column_name(subscript)
            if events_only_columns.get(schema_col_name):
                event_columns.add(schema_col_name)
            if transactions_only_columns.get(schema_col_name):
                transaction_columns.add(schema_col_name)

        event_mismatch = bool(event_columns) and selected_table == TRANSACTIONS
        transaction_mismatch = bool(transaction_columns) and selected_table in (
            EVENTS,
            EVENTS_AND_TRANSACTIONS,
        )

        if event_mismatch or transaction_mismatch:
            offending = event_columns if event_mismatch else transaction_columns
            missing_columns = ",".join(sorted(offending))
            if isinstance(selected_table, EntityKey):
                table_tag = str(selected_table.value)
            else:
                table_tag = selected_table
            metrics.increment(
                "query.impossible",
                tags={
                    "selected_table": table_tag,
                    "missing_columns": missing_columns,
                },
            )
            # NOTE(review): exc_info=True is passed although no exception is
            # being handled at this point - confirm this is intended.
            logger.warning("Discover generated impossible query", exc_info=True)

        uses_specific_columns = event_columns or transaction_columns
        if selected_table == EVENTS_AND_TRANSACTIONS and uses_specific_columns:
            # Not possible in future with merge table
            metrics.increment(
                "query.impossible-merge-table",
                tags={
                    "missing_events_columns": ",".join(sorted(event_columns)),
                    "missing_transactions_columns": ",".join(
                        sorted(transaction_columns)
                    ),
                },
            )
        else:
            metrics.increment("query.success")

    # Default for events and transactions is events
    if selected_table != TRANSACTIONS:
        return EntityKey.EVENTS
    return EntityKey.TRANSACTIONS
class DiscoverDataset(TimeSeriesDataset):
    """
    Dataset for the Discover product. It presents the columns of Events and
    Transactions under one standard schema and sends each query to one of
    the two tables depending on what is detected in the query.

    It relies on two storages: one for events and one for transactions.
    """

    def __init__(self) -> None:
        # Columns exposed whichever table ends up being queried.
        self.__common_columns = ColumnSet([
            ("event_id", FixedString(32)),
            ("project_id", UInt(64)),
            ("type", Nullable(String())),
            ("timestamp", DateTime()),
            ("platform", Nullable(String())),
            ("environment", Nullable(String())),
            ("release", Nullable(String())),
            ("dist", Nullable(String())),
            ("user", Nullable(String())),
            ("transaction", Nullable(String())),
            ("message", Nullable(String())),
            ("title", Nullable(String())),
            # User
            ("user_id", Nullable(String())),
            ("username", Nullable(String())),
            ("email", Nullable(String())),
            ("ip_address", Nullable(String())),
            # SDK
            ("sdk_name", Nullable(String())),
            ("sdk_version", Nullable(String())),
            # geo location context
            ("geo_country_code", Nullable(String())),
            ("geo_region", Nullable(String())),
            ("geo_city", Nullable(String())),
            # Other tags and context
            ("tags", Nested([("key", String()), ("value", String())])),
            ("contexts", Nested([("key", String()), ("value", String())])),
        ])

        # Columns that only make sense for events.
        self.__events_columns = ColumnSet([
            ("group_id", Nullable(UInt(64))),
            ("primary_hash", Nullable(FixedString(32))),
            # Promoted tags
            ("level", Nullable(String())),
            ("logger", Nullable(String())),
            ("server_name", Nullable(String())),
            ("site", Nullable(String())),
            ("url", Nullable(String())),
            ("search_message", Nullable(String())),
            ("location", Nullable(String())),
            ("culprit", Nullable(String())),
            ("received", Nullable(DateTime())),
            ("sdk_integrations", Nullable(Array(String()))),
            ("version", Nullable(String())),
            ("http_method", Nullable(String())),
            ("http_referer", Nullable(String())),
            # exception interface
            (
                "exception_stacks",
                Nested([
                    ("type", Nullable(String())),
                    ("value", Nullable(String())),
                    ("mechanism_type", Nullable(String())),
                    ("mechanism_handled", Nullable(UInt(8))),
                ]),
            ),
            (
                "exception_frames",
                Nested([
                    ("abs_path", Nullable(String())),
                    ("filename", Nullable(String())),
                    ("package", Nullable(String())),
                    ("module", Nullable(String())),
                    ("function", Nullable(String())),
                    ("in_app", Nullable(UInt(8))),
                    ("colno", Nullable(UInt(32))),
                    ("lineno", Nullable(UInt(32))),
                    ("stack_level", UInt(16)),
                ]),
            ),
            ("modules", Nested([("name", String()), ("version", String())])),
        ])

        # Columns that only make sense for transactions.
        self.__transactions_columns = ColumnSet([
            ("trace_id", Nullable(UUID())),
            ("span_id", Nullable(UInt(64))),
            ("transaction_hash", Nullable(UInt(64))),
            ("transaction_op", Nullable(String())),
            ("transaction_status", Nullable(UInt(8))),
            ("duration", Nullable(UInt(32))),
        ])

        events_storage = get_storage("events")
        transactions_storage = get_storage("transactions")

        # The selector is handed both entity-specific column sets.
        storage_selector = DiscoverQueryStorageSelector(
            events_table=events_storage,
            abstract_events_columns=self.__events_columns,
            transactions_table=transactions_storage,
            abstract_transactions_columns=self.__transactions_columns,
        )
        super().__init__(
            storages=[events_storage, transactions_storage],
            query_plan_builder=SelectedStorageQueryPlanBuilder(
                selector=storage_selector,
            ),
            abstract_column_set=(
                self.__common_columns
                + self.__events_columns
                + self.__transactions_columns
            ),
            writable_storage=None,
            time_group_columns={},
            time_parse_columns=["timestamp"],
        )

    def get_query_processors(self) -> Sequence[QueryProcessor]:
        """Return the query processors of this dataset, in list order."""
        processors = [
            BasicFunctionsProcessor(),
            # Apdex and Impact seem very good candidates for
            # being defined by the Transaction entity when it will
            # exist, so it would run before Storage selection.
            ApdexProcessor(),
            ImpactProcessor(),
            TimeSeriesColumnProcessor({}),
        ]
        return processors

    def get_extensions(self) -> Mapping[str, QueryExtension]:
        """Return the "project" and "timeseries" query extensions."""
        project_extension = ProjectExtension(
            processor=ProjectWithGroupsProcessor(
                project_column="project_id",
                replacer_state_name=None,
            ),
        )
        timeseries_extension = TimeSeriesExtension(
            default_granularity=3600,
            default_window=timedelta(days=5),
            timestamp_column="timestamp",
        )
        return {
            "project": project_extension,
            "timeseries": timeseries_extension,
        }

    def column_expr(
        self,
        column_name,
        query: Query,
        parsing_context: ParsingContext,
        table_alias: str = "",
    ):
        """
        Return the expression for `column_name` in the dataset detected for
        `query`.

        On transactions, several common names map straight to a fixed
        expression and the geo columns are rewritten to context lookups. On
        the other table, release/dist/user are rewritten to tag lookups.
        Columns that belong only to the other table resolve to "NULL"; the
        rest is delegated to the detected dataset.
        """
        detected_dataset = detect_table(
            query, self.__events_columns, self.__transactions_columns
        )

        if detected_dataset == TRANSACTIONS:
            if column_name == "time":
                return self.time_expr(
                    "finish_ts", query.get_granularity(), table_alias
                )

            # Names answered directly with a fixed expression.
            fixed_expressions = {
                "type": "'transaction'",
                "timestamp": "finish_ts",
                "username": "user_name",
                "email": "user_email",
                "transaction": "transaction_name",
                "message": "transaction_name",
                "title": "transaction_name",
                # TODO: We return 0 here instead of NULL so conditions like group_id
                # in (1, 2, 3) will work, since Clickhouse won't run a query like:
                # SELECT (NULL AS group_id) FROM transactions WHERE group_id IN (1, 2, 3)
                # When we have the query AST, we should solve this by transforming the
                # nonsensical conditions instead.
                "group_id": "0",
            }
            if column_name in fixed_expressions:
                return fixed_expressions[column_name]

            # Names rewritten before being resolved further.
            context_lookups = {
                "geo_country_code": "contexts[geo.country_code]",
                "geo_region": "contexts[geo.region]",
                "geo_city": "contexts[geo.city]",
            }
            column_name = context_lookups.get(column_name, column_name)
            if self.__events_columns.get(column_name):
                return "NULL"
        else:
            if column_name == "time":
                return self.time_expr(
                    "timestamp", query.get_granularity(), table_alias
                )

            tag_lookups = {
                "release": "tags[sentry:release]",
                "dist": "tags[sentry:dist]",
                "user": "tags[sentry:user]",
            }
            column_name = tag_lookups.get(column_name, column_name)
            if self.__transactions_columns.get(column_name):
                return "NULL"

        delegate = get_dataset(detected_dataset)
        return delegate.column_expr(column_name, query, parsing_context)