def test_get_entity_subscription_for_metrics_dataset_for_users(self) -> None:
    aggregate = "percentage(users_crashed, users) AS _crash_rate_alert_aggregate"
    entity_subscription = get_entity_subscription_for_dataset(
        dataset=QueryDatasets.METRICS,
        aggregate=aggregate,
        time_window=3600,
        extra_fields={"org_id": self.organization.id},
    )
    assert isinstance(entity_subscription, MetricsSetsEntitySubscription)
    assert entity_subscription.aggregate == aggregate
    groupby = [resolve_tag_key("session.status")]
    assert entity_subscription.get_entity_extra_params() == {
        "organization": self.organization.id,
        "groupby": groupby,
        "granularity": 10,
    }
    assert entity_subscription.entity_key == EntityKey.MetricsSets
    assert entity_subscription.time_col == ENTITY_TIME_COLUMNS[EntityKey.MetricsSets]
    assert entity_subscription.dataset == QueryDatasets.METRICS
    session_status = resolve_tag_key("session.status")
    session_status_tag_values = resolve_many_weak(["crashed", "init"])
    snuba_filter = entity_subscription.build_snuba_filter("", None, None)
    assert snuba_filter
    assert snuba_filter.aggregations == [["uniq(value)", None, "value"]]
    assert snuba_filter.conditions == [
        ["metric_id", "=", resolve(SessionMetricKey.USER.value)],
        [session_status, "IN", session_status_tag_values],
    ]
    assert snuba_filter.groupby == groupby
    assert snuba_filter.rollup == entity_subscription.get_granularity()
def _flatten_data(org_id: int, data: _SnubaDataByMetric) -> _DataPoints:
    """Unite snuba data from multiple queries into a single key-value map for easier access"""
    data_points = {}

    # It greatly simplifies code if we just assume that these two tags exist:
    # TODO: Can we get away with that assumption?
    tag_key_release = resolve_tag_key("release")
    tag_key_environment = resolve_tag_key("environment")
    tag_key_session_status = resolve_tag_key("session.status")

    for metric_key, metric_data in data:
        for row in metric_data:
            raw_session_status = row.pop(tag_key_session_status, None) or None
            flat_key = _DataPointKey(
                metric_key=metric_key,
                raw_session_status=raw_session_status,
                release=row.pop(tag_key_release, None),
                environment=row.pop(tag_key_environment, None),
                bucketed_time=row.pop("bucketed_time", None),
                project_id=row.pop("project_id", None),
            )
            # The percentile column expands into multiple "virtual" columns:
            if "percentiles" in row:
                # TODO: Use percentile enum
                percentiles = row.pop("percentiles")
                for i, percentile in enumerate(["p50", "p75", "p90", "p95", "p99"]):
                    percentile_key = replace(flat_key, column=percentile)
                    data_points[percentile_key] = percentiles[i]

            # Check for special group-by-status columns
            for col in list(row.keys()):
                if col.startswith("sessions_"):
                    # Map column back to metric key
                    new_key = replace(
                        flat_key, metric_key=MetricKey.SESSION, raw_session_status=col[9:]
                    )
                    data_points[new_key] = row.pop(col) or 0
                elif col.startswith("users_"):
                    # Map column back to metric key
                    new_key = replace(
                        flat_key, metric_key=MetricKey.USER, raw_session_status=col[6:]
                    )
                    data_points[new_key] = row.pop(col) or 0

            # The remaining entries are simple columns:
            for col in list(row.keys()):
                assert col in ("avg", "max", "value")
                data_points[replace(flat_key, column=col)] = row.pop(col)

            assert row == {}

    return data_points
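# A hedged sketch of the flattening above (values are illustrative, and
# _DataPointKey fields not shown keep their defaults): conditional-aggregate
# columns such as "sessions_crashed" are mapped back to per-status data points.
#
#     data = [(MetricKey.SESSION, [{"sessions_init": 10, "sessions_crashed": 2}])]
#     _flatten_data(org_id, data)
#     # => {_DataPointKey(metric_key=MetricKey.SESSION, raw_session_status="init"): 10,
#     #     _DataPointKey(metric_key=MetricKey.SESSION, raw_session_status="crashed"): 2}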
def _translate_conditions(org_id: int, input_: Any) -> Any:
    if isinstance(input_, Column):
        # The only filterable tag keys are release and environment.
        assert input_.name in ("release", "environment")
        # It greatly simplifies code if we just assume that they exist.
        # Alternative would be:
        #   * if tag key or value does not exist in AND-clause, return no data
        #   * if tag key or value does not exist in OR-clause, remove condition
        return Column(resolve_tag_key(input_.name))

    if isinstance(input_, str):
        # Assuming this is the right-hand side, we need to fetch a tag value.
        # It's OK if the tag value resolves to None, the snuba query will then
        # return no results, as is intended behavior
        return indexer.resolve(input_)

    if isinstance(input_, Function):
        return Function(
            function=input_.function,
            parameters=_translate_conditions(org_id, input_.parameters),
        )

    if isinstance(input_, Condition):
        return Condition(
            lhs=_translate_conditions(org_id, input_.lhs),
            op=input_.op,
            rhs=_translate_conditions(org_id, input_.rhs),
        )

    if isinstance(input_, (int, float)):
        return input_

    assert isinstance(input_, (tuple, list)), input_
    return [_translate_conditions(org_id, item) for item in input_]
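# For illustration (hedged; the resolved integers below are made up): a filter
# on a raw tag name has its key swapped for the indexed column and its value
# for the indexer integer.
#
#     _translate_conditions(org_id, Condition(Column("release"), Op.EQ, "[email protected]"))
#     # => Condition(Column("tags[11]"), Op.EQ, 37)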
def test_simple_users_for_metrics(self):
    org_id = self.organization.id
    for tag in [SessionMetricKey.USER.value, "session.status", "crashed", "init"]:
        indexer.record(org_id, tag)
    entity_subscription = get_entity_subscription_for_dataset(
        dataset=QueryDatasets.METRICS,
        time_window=3600,
        aggregate="percentage(users_crashed, users) AS _crash_rate_alert_aggregate",
        extra_fields={"org_id": org_id},
    )
    snuba_filter = build_snuba_filter(
        entity_subscription,
        query="",
        environment=None,
    )
    session_status = resolve_tag_key("session.status")
    session_status_tag_values = resolve_many_weak(["crashed", "init"])
    assert snuba_filter
    assert snuba_filter.aggregations == [["uniq(value)", None, "value"]]
    assert snuba_filter.conditions == [
        ["metric_id", "=", resolve(SessionMetricKey.USER.value)],
        [session_status, "IN", session_status_tag_values],
    ]
    assert snuba_filter.groupby == [session_status]
def resolve_tags(input_: Any) -> Any:
    """Translate tags in snuba condition

    This assumes that all strings are either tag names or tag values, so do not
    pass Column("metric_id") or Column("project_id") into this function.
    """
    if isinstance(input_, list):
        return [resolve_tags(item) for item in input_]
    if isinstance(input_, Function):
        if input_.function == "ifNull":
            # This was wrapped automatically by QueryBuilder, remove the wrapper
            return resolve_tags(input_.parameters[0])
        return Function(
            function=input_.function,
            parameters=input_.parameters
            and [resolve_tags(item) for item in input_.parameters],
        )
    if isinstance(input_, Condition):
        return Condition(lhs=resolve_tags(input_.lhs), op=input_.op, rhs=resolve_tags(input_.rhs))
    if isinstance(input_, BooleanCondition):
        return input_.__class__(conditions=[resolve_tags(item) for item in input_.conditions])
    if isinstance(input_, Column):
        # HACK: Some tags already take the form "tags[...]" in discover, take that into account:
        if input_.subscriptable == "tags":
            name = input_.key
        else:
            name = input_.name
        return Column(name=resolve_tag_key(name))
    if isinstance(input_, str):
        return resolve_weak(input_)

    return input_
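# Usage sketch (hedged; tag and value indexes are illustrative). Note that the
# ifNull(...) wrapper added automatically by QueryBuilder is stripped before
# the inner column is resolved:
#
#     cond = Condition(Function("ifNull", [Column("tags[environment]"), ""]), Op.EQ, "prod")
#     resolve_tags(cond)
#     # => Condition(Column("tags[5]"), Op.EQ, 17)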
def _build_groupby(self, query_definition: QueryDefinition) -> List[Column]:
    # ToDo: ensure we cannot add any other cols than tags and groupBy as columns
    return [
        Column(resolve_tag_key(field))
        if field not in ALLOWED_GROUPBY_COLUMNS
        else Column(field)
        for field in query_definition.groupby
    ]
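# Hedged example: with query_definition.groupby == ["project_id", "environment"]
# and "project_id" in ALLOWED_GROUPBY_COLUMNS, this yields roughly
# [Column("project_id"), Column("tags[5]")] (the tag index is illustrative).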
def test_query_and_environment_users_metrics(self):
    env = self.create_environment(self.project, name="development")
    org_id = self.organization.id
    for tag in [
        SessionMetricKey.USER.value,
        "session.status",
        "environment",
        "development",
        "init",
        "crashed",
        "release",
        "[email protected]",
    ]:
        indexer.record(org_id, tag)
    entity_subscription = get_entity_subscription_for_dataset(
        dataset=QueryDatasets.METRICS,
        time_window=3600,
        aggregate="percentage(users_crashed, users) AS _crash_rate_alert_aggregate",
        extra_fields={"org_id": org_id},
    )
    snuba_filter = build_snuba_filter(
        entity_subscription,
        query="release:[email protected]",
        environment=env,
    )
    assert snuba_filter
    assert snuba_filter.aggregations == [["uniq(value)", None, "value"]]
    assert snuba_filter.groupby == [resolve_tag_key("session.status")]
    assert snuba_filter.conditions == [
        ["metric_id", "=", resolve(SessionMetricKey.USER.value)],
        [
            resolve_tag_key("session.status"),
            "IN",
            resolve_many_weak(["crashed", "init"]),
        ],
        [resolve_tag_key("environment"), "=", resolve_weak("development")],
        [resolve_tag_key("release"), "=", resolve_weak("[email protected]")],
    ]
def build_snuba_filter(
    self,
    query: str,
    environment: Optional[Environment],
    params: Optional[Mapping[str, Any]] = None,
) -> Filter:
    snuba_filter = get_filter(query, params=params)
    conditions = copy(snuba_filter.conditions)
    session_status_tag_values = resolve_many_weak(["crashed", "init"])
    snuba_filter.update_with(
        {
            "aggregations": [[f"{self.aggregation_func}(value)", None, "value"]],
            "conditions": [
                ["metric_id", "=", resolve(self.metric_key.value)],
                [self.session_status, "IN", session_status_tag_values],
            ],
            "groupby": self.get_query_groupby(),
            "rollup": self.get_granularity(),
        }
    )
    if environment:
        snuba_filter.conditions.append(
            [resolve_tag_key("environment"), "=", resolve_weak(environment.name)]
        )
    if query and len(conditions) > 0:
        release_conditions = [
            condition for condition in conditions if condition[0] == "release"
        ]
        for release_condition in release_conditions:
            snuba_filter.conditions.append(
                [
                    resolve_tag_key(release_condition[0]),
                    release_condition[1],
                    resolve_weak(release_condition[2]),
                ]
            )
    return snuba_filter
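# Hedged sketch of the resulting filter for query='release:[email protected]' and an
# environment, mirroring the test above (the integers come from the indexer):
#
#     snuba_filter.conditions == [
#         ["metric_id", "=", <resolved metric id>],
#         [<tags[session.status]>, "IN", [<crashed>, <init>]],
#         [<tags[environment]>, "=", <development>],
#         [<tags[release]>, "=", <[email protected]>],
#     ]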
def __init__(
    self, aggregate: str, time_window: int, extra_fields: Optional[_EntitySpecificParams] = None
):
    super().__init__(aggregate, time_window, extra_fields)
    self.aggregate = aggregate
    if not extra_fields or "org_id" not in extra_fields:
        raise InvalidQuerySubscription(
            "org_id is a required param when building snuba filter for a metrics subscription"
        )
    self.org_id = extra_fields["org_id"]
    self.session_status = resolve_tag_key("session.status")
    self.time_window = time_window
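# Hedged usage note: omitting extra_fields={"org_id": ...} raises
# InvalidQuerySubscription before any snuba filter is built. "SubscriptionClass"
# stands in for a concrete subclass, e.g. MetricsSetsEntitySubscription from
# the tests above:
#
#     SubscriptionClass(aggregate="...", time_window=3600)  # raises
#     SubscriptionClass(aggregate="...", time_window=3600,
#                       extra_fields={"org_id": org.id})    # ok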
def translate_sessions_tag_keys_and_values(
    data: List[Dict[str, Any]], org_id: int, alias: Optional[str] = None
) -> Tuple[int, int]:
    value_col_name = alias if alias else "value"
    try:
        translated_data: Dict[str, Any] = {}
        session_status = resolve_tag_key("session.status")
        for row in data:
            tag_value = reverse_resolve(row[session_status])
            translated_data[tag_value] = row[value_col_name]

        total_session_count = translated_data.get("init", 0)
        crash_count = translated_data.get("crashed", 0)
    except MetricIndexNotFound:
        metrics.incr("incidents.entity_subscription.metric_index_not_found")
        total_session_count = crash_count = 0
    return total_session_count, crash_count
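# Hedged sketch (tag and value indexes are illustrative): rows keyed by the
# resolved session.status tag are reverse-resolved back to strings before the
# counts are read out.
#
#     data = [{"tags[8]": 100, "value": 42},   # 100 reverse-resolves to "init"
#             {"tags[8]": 101, "value": 7}]    # 101 reverse-resolves to "crashed"
#     translate_sessions_tag_keys_and_values(data, org_id)  # -> (42, 7)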
def _fetch_data_for_field(
    org_id: int,
    query: QueryDefinition,
    raw_field: SessionsQueryFunction,
    limit_state: _LimitState,
    columns_fetched: Set[SelectableExpression],  # output param
) -> Tuple[_SnubaDataByMetric, MutableMapping[Tuple[MetricKey, _VirtualColumnName], _OutputField]]:
    tag_key_session_status = resolve_tag_key("session.status")

    data: _SnubaDataByMetric = []

    # Find the field that needs a specific column in a specific metric
    metric_to_output_field: MutableMapping[
        Tuple[MetricKey, _VirtualColumnName], _OutputField
    ] = {}

    group_by_status = "session.status" in query.raw_groupby

    # We limit the number of groups returned, but because session status
    # groups in the response are actually composed of multiple groups in storage,
    # we need to make sure we get them all. For this, use conditional aggregates:
    def get_column_for_status(function_name: str, prefix: str, status: str) -> Function:
        return Function(
            f"{function_name}If",
            [
                Column("value"),
                Function(
                    "equals",
                    [Column(tag_key_session_status), indexer.resolve(status)],
                ),
            ],
            alias=f"{prefix}_{status}",
        )

    if "count_unique(user)" == raw_field:
        metric_id = indexer.resolve(MetricKey.USER.value)
        if metric_id is not None:
            if group_by_status:
                data.extend(
                    _get_snuba_query_data(
                        org_id,
                        query,
                        EntityKey.MetricsSets,
                        MetricKey.USER,
                        metric_id,
                        [
                            # The order of these columns is important, because
                            # the first column might get used in order by
                            get_column_for_status("uniq", "users", "init"),
                            get_column_for_status("uniq", "users", "abnormal"),
                            get_column_for_status("uniq", "users", "crashed"),
                            get_column_for_status("uniq", "users", "errored"),
                        ],
                        limit_state,
                    )
                )
            else:
                data.extend(
                    _get_snuba_query_data(
                        org_id,
                        query,
                        EntityKey.MetricsSets,
                        MetricKey.USER,
                        metric_id,
                        [Function("uniq", [Column("value")], "value")],
                        limit_state,
                    )
                )
            metric_to_output_field[(MetricKey.USER, "value")] = _UserField()

    if raw_field in _DURATION_FIELDS:
        metric_id = indexer.resolve(MetricKey.SESSION_DURATION.value)
        if metric_id is not None:

            def get_virtual_column(field: SessionsQueryFunction) -> _VirtualColumnName:
                return cast(_VirtualColumnName, field[:3])

            # Filter down to healthy sessions, because that's what sessions_v2 exposes:
            healthy = indexer.resolve("exited")
            if healthy is None:
                # There are no healthy sessions, return:
                return [], {}
            column_condition = Function("equals", (Column(tag_key_session_status), healthy))

            snuba_column = _to_column(raw_field, column_condition)

            if snuba_column not in columns_fetched:
                data.extend(
                    _get_snuba_query_data(
                        org_id,
                        query,
                        EntityKey.MetricsDistributions,
                        MetricKey.SESSION_DURATION,
                        metric_id,
                        [snuba_column],
                        limit_state,
                    )
                )
                columns_fetched.add(snuba_column)

            col = get_virtual_column(raw_field)
            metric_to_output_field[(MetricKey.SESSION_DURATION, col)] = _SessionDurationField(
                raw_field, col, group_by_status
            )

    if "sum(session)" == raw_field:
        metric_id = indexer.resolve(MetricKey.SESSION.value)
        if metric_id is not None:
            if group_by_status:
                # We need session counters grouped by status, as well as the
                # number of errored sessions.

                # 1: session counters
                data.extend(
                    _get_snuba_query_data(
                        org_id,
                        query,
                        EntityKey.MetricsCounters,
                        MetricKey.SESSION,
                        metric_id,
                        [
                            # The order of these columns is important, because
                            # the first column might get used in order by
                            get_column_for_status("sum", "sessions", "init"),
                            get_column_for_status("sum", "sessions", "abnormal"),
                            get_column_for_status("sum", "sessions", "crashed"),
                            get_column_for_status("sum", "sessions", "errored_preaggr"),
                        ],
                        limit_state,
                    )
                )

                # 2: session.error
                error_metric_id = indexer.resolve(MetricKey.SESSION_ERROR.value)
                if error_metric_id is not None:
                    # Should not limit session.error to session.status=X,
                    # because that tag does not exist for this metric
                    limit_state.skip_columns.add(Column(tag_key_session_status))
                    data.extend(
                        _get_snuba_query_data(
                            org_id,
                            query,
                            EntityKey.MetricsSets,
                            MetricKey.SESSION_ERROR,
                            error_metric_id,
                            [Function("uniq", [Column("value")], "value")],
                            limit_state,
                        )
                    )
                    # Remove skip_column again:
                    limit_state.skip_columns.remove(Column(tag_key_session_status))
            else:
                # Simply count the number of started sessions:
                init = indexer.resolve("init")
                if tag_key_session_status is not None and init is not None:
                    extra_conditions = [Condition(Column(tag_key_session_status), Op.EQ, init)]
                    data.extend(
                        _get_snuba_query_data(
                            org_id,
                            query,
                            EntityKey.MetricsCounters,
                            MetricKey.SESSION,
                            metric_id,
                            [Function("sum", [Column("value")], "value")],
                            limit_state,
                            extra_conditions,
                        )
                    )

            metric_to_output_field[(MetricKey.SESSION, "value")] = _SumSessionField()

    return data, metric_to_output_field
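# For illustration (hedged; the tag index is made up):
# get_column_for_status("sum", "sessions", "crashed") builds a conditional
# aggregate roughly equivalent to this ClickHouse expression:
#
#     sumIf(value, equals(tags[8], <index of "crashed">)) AS sessions_crashed
#
# which is why a single storage entity can serve all session-status groups in
# one query instead of one query per status.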
def _get_snuba_query(
    org_id: int,
    query: QueryDefinition,
    entity_key: EntityKey,
    metric_id: int,
    columns: List[SelectableExpression],
    series: bool,
    limit_state: _LimitState,
    extra_conditions: List[Condition],
) -> Optional[Query]:
    """Build the snuba query

    Return None if the results of the initial totals query were empty.
    """
    conditions = [
        Condition(Column("org_id"), Op.EQ, org_id),
        Condition(Column("project_id"), Op.IN, query.filter_keys["project_id"]),
        Condition(Column("metric_id"), Op.EQ, metric_id),
        Condition(Column(TS_COL_QUERY), Op.GTE, query.start),
        Condition(Column(TS_COL_QUERY), Op.LT, query.end),
    ]
    conditions += _get_filter_conditions(org_id, query.conditions)
    conditions += extra_conditions

    groupby = {}
    for field in query.raw_groupby:
        if field == "session.status":
            # This will be handled by conditional aggregates
            continue
        if field == "project":
            groupby["project"] = Column("project_id")
            continue
        try:
            groupby[field] = Column(resolve_tag_key(field))
        except MetricIndexNotFound:
            # Exclude unresolved keys from the groupby
            pass

    full_groupby = list(set(groupby.values()))

    if series:
        full_groupby.append(Column(TS_COL_GROUP))

    query_args = dict(
        dataset=Dataset.Metrics.value,
        match=Entity(entity_key.value),
        select=columns,
        groupby=full_groupby,
        where=conditions,
        granularity=Granularity(query.rollup),
    )

    # In case of a group by, either set a limit or use the groups from the
    # first query to limit the results:
    if query.raw_groupby:
        if not limit_state.initialized:
            # Set limit and order by to be consistent with sessions_v2
            max_groups = SNUBA_LIMIT // len(get_timestamps(query))
            query_args["limit"] = Limit(max_groups)
            query_args["orderby"] = [OrderBy(columns[0], Direction.DESC)]
        else:
            if limit_state.limiting_conditions is None:
                # The initial query returned no results, so there is no need
                # to run any more queries:
                return None
            query_args["where"] += limit_state.limiting_conditions
            query_args["limit"] = Limit(SNUBA_LIMIT)

    return Query(**query_args)
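# A hedged sketch of the SnQL this builds for a series query grouped by one tag
# (identifiers, ids, and the tag index are illustrative):
#
#     MATCH (metrics_counters)
#     SELECT sum(value) AS value BY tags[11], bucketed_time
#     WHERE org_id = 1 AND project_id IN (...) AND metric_id = 9
#       AND timestamp >= <start> AND timestamp < <end>
#     GRANULARITY 3600
#     LIMIT <SNUBA_LIMIT // number of timestamps>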
def get_series(projects: Sequence[Project], query: QueryDefinition) -> dict:
    """Get time series for the given query"""
    intervals = list(get_intervals(query))
    results = {}

    if not query.groupby:
        # When there are no groupBy columns specified, we don't want to go through running an
        # initial query first to get the groups, because there are no groups and it becomes just
        # one group, which is basically identical to eliminating the orderBy altogether
        query.orderby = None

    if query.orderby is not None:
        # ToDo(ahmed): Now that we have conditional aggregates as select statements, we might be
        #  able to shave off a query here. We only need the other queries for fields spanning
        #  other entities; if all the fields belong to one entity then there is no need.
        # There is a known limitation: since we make two queries and use the results of the
        # first query to filter down the results of the second query, if the field used to
        # order by has no values for certain transactions (for example in the case of the
        # performance table), we might end up showing fewer transactions than there actually
        # are if we choose to order by it. We are limited by the rows available for the field
        # used in the orderBy.

        # Multi-field select with order by functionality. Currently only supports the
        # performance table.
        original_query_fields = copy(query.fields)

        # The initial query has to contain only one field, which is the same as the order by
        # field
        orderby_field = [
            key for key, value in query.fields.items() if value == query.orderby[0]
        ][0]
        query.fields = {orderby_field: parse_field(orderby_field)}

        snuba_queries = SnubaQueryBuilder(projects, query).get_snuba_queries()
        if len(snuba_queries) > 1:
            # Currently accepting an order by field that spans multiple entities is not
            # supported, but it might change in the future. Even then, it might be better
            # handled on the snuba side of things
            raise InvalidParams(
                "Order by queries over multiple entities are not supported in "
                "multi-field select with order by clause queries"
            )

        try:
            # This query contains an order by clause, and so we are only interested in the
            # "totals" query
            initial_snuba_query = next(iter(snuba_queries.values()))["totals"]

            initial_query_results = raw_snql_query(
                initial_snuba_query, use_cache=False, referrer="api.metrics.totals.initial_query"
            )["data"]

        except StopIteration:
            # This can occur when requesting a list of derived metrics that have no data
            # for the passed projects
            initial_query_results = []

        # If we do not get any results from the first query, then there is no point in making
        # the second query
        if initial_query_results:
            # We no longer want the order by in the 2nd query, because we already have the
            # order of the group by tags from the first query, so we remove the order by
            # columns and reset the query fields to the original fields because in the second
            # query we want to query for all the metrics in the request api call
            query.orderby = None
            query.fields = original_query_fields

            snuba_queries = SnubaQueryBuilder(projects, query).get_snuba_queries()

            # Translate the groupby fields of the query into their tag keys because these
            # fields will be used to filter down and order the results of the 2nd query.
            # For example, (project_id, transaction) is translated to (project_id, tags[3])
            groupby_tags = tuple(
                resolve_tag_key(field) if field not in ALLOWED_GROUPBY_COLUMNS else field
                for field in query.groupby
            )

            # Dictionary that contains the conditions that are required to be added to the
            # where clause of the second query. In addition to filtering down on the tuple
            # combination of the fields in the group by columns, we need a separate condition
            # for each of the columns in the group by with their respective values, so
            # Clickhouse can filter the results down before checking for the group by column
            # combinations.
            ordered_tag_conditions = {
                col: list({data_elem[col] for data_elem in initial_query_results})
                for col in groupby_tags
            }
            ordered_tag_conditions[groupby_tags] = [
                tuple(data_elem[col] for col in groupby_tags)
                for data_elem in initial_query_results
            ]

            for entity, queries in snuba_queries.items():
                results.setdefault(entity, {})
                # This loop has constant time complexity as it will always have a maximum of
                # three queries corresponding to the three available entities
                # ["metrics_sets", "metrics_distributions", "metrics_counters"]
                for key, snuba_query in queries.items():
                    results[entity].setdefault(key, {"data": []})

                    # If the query is grouped by project_id, then we should remove the original
                    # project_id condition, because it might be more relaxed than the
                    # project_id condition in the second query
                    where = []
                    for condition in snuba_query.where:
                        if not (
                            isinstance(condition.lhs, Column)
                            and condition.lhs.name == "project_id"
                            and "project_id" in groupby_tags
                        ):
                            where += [condition]

                    # Add the conditions obtained from the previous query
                    for condition_key, condition_value in ordered_tag_conditions.items():
                        if not condition_key or not condition_value:
                            # Safeguard to prevent adding empty conditions to the where clause
                            continue

                        lhs_condition = (
                            Function("tuple", [Column(col) for col in condition_key])
                            if isinstance(condition_key, tuple)
                            else Column(condition_key)
                        )
                        where += [
                            Condition(lhs_condition, Op.IN, Function("tuple", condition_value))
                        ]
                    snuba_query = snuba_query.set_where(where)

                    # Set the limit of the second query to be the provided limit multiplied by
                    # the number of the metrics requested in the query in this specific entity
                    snuba_query = snuba_query.set_limit(
                        snuba_query.limit.limit * len(snuba_query.select)
                    )
                    snuba_query = snuba_query.set_offset(0)

                    snuba_query_res = raw_snql_query(
                        snuba_query, use_cache=False, referrer=f"api.metrics.{key}.second_query"
                    )
                    # Create a dictionary keyed by the ordered-by tuples from the initial
                    # query, so that we are able to order it easily in the next code block.
                    # If, for example, we are grouping by (project_id, transaction), then this
                    # logic will output a dictionary that looks something like the following,
                    # where `tags[1]` represents transaction:
                    # {
                    #     (3, 2): [{"metric_id": 4, "project_id": 3, "tags[1]": 2, "p50": [11.0]}],
                    #     (3, 3): [{"metric_id": 4, "project_id": 3, "tags[1]": 3, "p50": [5.0]}],
                    # }
                    snuba_query_data_dict = {}
                    for data_elem in snuba_query_res["data"]:
                        snuba_query_data_dict.setdefault(
                            tuple(data_elem[col] for col in groupby_tags), []
                        ).append(data_elem)

                    # Order the results according to the results of the initial query, so that
                    # when the results dict is passed on to `SnubaResultsConverter`, it comes
                    # out ordered. Ordered conditions might for example look something like
                    # {..., ('project_id', 'tags[1]'): [(3, 3), (3, 2)]}; then we end up with
                    # {
                    #     "totals": {
                    #         "data": [
                    #             {
                    #                 "metric_id": 5, "project_id": 3, "tags[1]": 3, "count_unique": 5
                    #             },
                    #             {
                    #                 "metric_id": 5, "project_id": 3, "tags[1]": 2, "count_unique": 1
                    #             },
                    #         ]
                    #     }
                    # }
                    for group_tuple in ordered_tag_conditions[groupby_tags]:
                        results[entity][key]["data"] += snuba_query_data_dict.get(
                            group_tuple, []
                        )
    else:
        snuba_queries = SnubaQueryBuilder(projects, query).get_snuba_queries()
        for entity, queries in snuba_queries.items():
            results.setdefault(entity, {})
            for key, snuba_query in queries.items():
                if snuba_query is None:
                    continue
                results[entity][key] = raw_snql_query(
                    snuba_query, use_cache=False, referrer=f"api.metrics.{key}"
                )

    assert projects
    converter = SnubaResultConverter(projects[0].organization_id, query, intervals, results)

    return {
        "start": query.start,
        "end": query.end,
        "query": query.query,
        "intervals": intervals,
        "groups": converter.translate_results(),
    }
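# A hedged sketch of the payload get_series returns (values illustrative; the
# exact group shape comes from SnubaResultConverter.translate_results()):
#
#     {
#         "start": <datetime>, "end": <datetime>, "query": "...",
#         "intervals": [<datetime>, ...],
#         "groups": [{"by": {...}, "totals": {...}, "series": {...}}, ...],
#     }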