def run_metrics_query(
    *,
    entity_key: EntityKey,
    select: List[Column],
    where: List[Condition],
    groupby: List[Column],
    projects: Sequence[Project],
    org_id: int,
    referrer: str,
) -> Mapping[str, Any]:
    # Round timestamp to minute to get cache efficiency:
    now = datetime.now().replace(second=0, microsecond=0)

    query = Query(
        dataset=Dataset.Metrics.value,
        match=Entity(entity_key.value),
        select=select,
        groupby=groupby,
        where=[
            Condition(Column("org_id"), Op.EQ, org_id),
            Condition(Column("project_id"), Op.IN, [p.id for p in projects]),
            Condition(Column(TS_COL_QUERY), Op.GTE, now - timedelta(hours=24)),
            Condition(Column(TS_COL_QUERY), Op.LT, now),
        ]
        + where,
        granularity=Granularity(GRANULARITY),
    )
    result = raw_snql_query(query, referrer, use_cache=True)
    return result["data"]

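# --- Hedged usage sketch (not from the original source) ---
# A minimal example of how run_metrics_query might be called. The entity key,
# metric id, and referrer below are illustrative assumptions, not values taken
# from the function above.
rows = run_metrics_query(
    entity_key=EntityKey.MetricsCounters,               # assumed entity
    select=[Column("value")],
    where=[Condition(Column("metric_id"), Op.EQ, 42)],  # assumed metric id
    groupby=[Column("project_id")],
    projects=projects,                                  # Sequence[Project] supplied by the caller
    org_id=org_id,
    referrer="api.metrics.example",                     # assumed referrer string
)
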
def _get_hash_for_parent_level(group: Group, id: int, levels_overview: LevelsOverview) -> str:
    # If this is violated, there cannot be a 1:1 mapping between level and hash.
    assert 0 <= id < levels_overview.current_level

    # This cache never needs explicit invalidation because during every level
    # change, the group ID changes.
    #
    # No idea if the query is slow, caching just because I can.
    cache_key = f"group-parent-level-hash:{group.id}:{id}"

    return_hash: str = cache.get(cache_key)

    if return_hash is None:
        query = (
            Query("events", Entity("events"))
            .set_select(
                [Function("arrayElement", [Column("hierarchical_hashes"), id + 1], "hash")]
            )
            .set_where(_get_group_filters(group))
            .set_limit(1)
        )

        return_hash: str = get_path(snuba.raw_snql_query(query), "data", 0, "hash")  # type: ignore
        cache.set(cache_key, return_hash)

    assert return_hash
    return return_hash

def data_fn(offset, limit):
    if use_snql:
        trend_query.offset = Offset(offset)
        trend_query.limit = Limit(limit)
        result = raw_snql_query(
            trend_query.get_snql_query(),
            referrer="api.trends.get-percentage-change.wip-snql",
        )
        result = discover.transform_results(
            result, trend_query.function_alias_map, {}, None
        )
        return result
    else:
        return discover.query(
            selected_columns=selected_columns + trend_columns,
            query=query,
            params=params,
            orderby=orderby,
            offset=offset,
            limit=limit,
            referrer="api.trends.get-percentage-change",
            auto_fields=True,
            auto_aggregations=True,
            use_aggregate_conditions=True,
        )

def get_event_stats(
    query_columns: Sequence[str],
    query: str,
    params: Dict[str, str],
    rollup: int,
    zerofill_results: bool,
    comparison_delta: Optional[datetime] = None,
) -> SnubaTSResult:
    with sentry_sdk.start_span(op="discover.discover", description="timeseries.filter_transform"):
        builder = TimeseriesQueryBuilder(
            Dataset.Discover,
            params,
            rollup,
            query=query,
            selected_columns=query_columns,
            functions_acl=["array_join", "percentileArray", "sumArray"],
        )

        span_op_column = builder.resolve_function("array_join(spans_op)")
        span_group_column = builder.resolve_function("array_join(spans_group)")

        # Add spans.op and spans.group to the group by because we need them in
        # the query to help the array join optimizer in snuba take effect, but
        # the TimeseriesQueryBuilder removes all non-aggregates from the select
        # clause.
        builder.groupby.extend([span_op_column, span_group_column])

        builder.add_conditions(
            [
                Condition(
                    Function("tuple", [span_op_column, span_group_column]),
                    Op.IN,
                    Function("tuple", [Function("tuple", [span.op, span.group])]),
                ),
            ]
        )

        snql_query = builder.get_snql_query()
        results = raw_snql_query(snql_query, "api.organization-events-spans-performance-stats")

    with sentry_sdk.start_span(op="discover.discover", description="timeseries.transform_results"):
        result = discover.zerofill(
            results["data"],
            params["start"],
            params["end"],
            rollup,
            "time",
        )

    return SnubaTSResult({"data": result}, params["start"], params["end"], rollup)

def _get_snuba_query_data(
    org_id: int,
    query: QueryDefinition,
    entity_key: EntityKey,
    metric_name: _MetricName,
    metric_id: int,
    columns: Sequence[str],
    extra_conditions: Optional[List[Condition]] = None,
    remove_groupby: Optional[Set[Column]] = None,
) -> Generator[Tuple[_MetricName, _SnubaData], None, None]:
    """Get data from snuba"""
    if extra_conditions is None:
        extra_conditions = []
    if remove_groupby is None:
        remove_groupby = set()

    for query_type in ("series", "totals"):
        snuba_query = _get_snuba_query(
            org_id,
            query,
            entity_key,
            metric_id,
            columns,
            series=query_type == "series",
            extra_conditions=extra_conditions,
            remove_groupby=remove_groupby,
        )
        referrer = REFERRERS[metric_name][query_type]
        query_data = raw_snql_query(snuba_query, referrer=referrer)["data"]
        yield (metric_name, query_data)

def _check_releases_have_health_data(
    organization_id: int,
    project_ids: List[int],
    release_versions: List[str],
    start: datetime,
    end: datetime,
) -> Set[str]:
    """
    Returns a set of all release versions that have health data within a given period of time.
    """
    if not release_versions:
        return set()

    query = Query(
        dataset="sessions",
        match=Entity("sessions"),
        select=[Column("release")],
        groupby=[Column("release")],
        where=[
            Condition(Column("started"), Op.GTE, start),
            Condition(Column("started"), Op.LT, end),
            Condition(Column("org_id"), Op.EQ, organization_id),
            Condition(Column("project_id"), Op.IN, project_ids),
            Condition(Column("release"), Op.IN, release_versions),
        ],
    )
    data = snuba.raw_snql_query(query, referrer="snuba.sessions.check_releases_have_health_data")[
        "data"
    ]
    return {row["release"] for row in data}

def get_levels_overview(group):
    query = (
        Query("events", Entity("events"))
        .set_select(
            [
                Column("primary_hash"),
                Function(
                    "max", [Function("length", [Column("hierarchical_hashes")])], "num_levels"
                ),
                _current_level_expr(group),
            ]
        )
        .set_where(_get_group_filters(group))
        .set_groupby([Column("primary_hash")])
    )

    res = snuba.raw_snql_query(query, referrer="api.group_hashes_levels.get_levels_overview")

    if not res["data"]:
        raise NoEvents()

    if len(res["data"]) > 1:
        raise MergedIssues()

    assert len(res["data"]) == 1

    fields = res["data"][0]

    if fields["num_levels"] <= 0:
        raise NotHierarchical()

    # TODO: Cache this if it takes too long. This is called from multiple
    # places, grouping overview and then again in the new-issues endpoint.

    return LevelsOverview(
        current_level=fields["current_level"] - 1,
        only_primary_hash=fields["primary_hash"],
        num_levels=fields["num_levels"],
    )

def _get_full_hierarchical_hashes(group: Group, hash: str) -> Optional[Sequence[str]]:
    query = (
        Query("events", Entity("events"))
        .set_select(
            [
                Column("hierarchical_hashes"),
            ]
        )
        .set_where(
            _get_group_filters(group)
            + [
                Condition(
                    Function(
                        "has",
                        [Column("hierarchical_hashes"), hash],
                    ),
                    Op.EQ,
                    1,
                ),
            ]
        )
    )

    data = snuba.raw_snql_query(query, referrer="group_split.get_full_hierarchical_hashes")["data"]
    if not data:
        return None

    return data[0]["hierarchical_hashes"]

def get_series(self, project: Project, query: QueryDefinition) -> dict:
    """Get time series for the given query"""
    intervals = list(query.get_intervals())

    snuba_queries = SnubaQueryBuilder(project, query).get_snuba_queries()
    results = {
        entity: {
            # TODO: Should we use cache?
            key: raw_snql_query(query, use_cache=False, referrer=f"api.metrics.{key}")
            for key, query in queries.items()
        }
        for entity, queries in snuba_queries.items()
    }

    converter = SnubaResultConverter(project.organization_id, query, intervals, results)

    return {
        "start": query.start,
        "end": query.end,
        "query": query.query,
        "intervals": intervals,
        "groups": converter.translate_results(),
    }

def _get_data(entity_key: EntityKey, metric_name: str) -> Tuple[int, int]:
    total = 0
    crashed = 0
    metric_id = try_get_string_index(org_id, metric_name)
    if metric_id is not None:
        where = conditions + [
            Condition(Column("metric_id"), Op.EQ, metric_id),
            Condition(Column("timestamp"), Op.LT, end),
        ]
        data = raw_snql_query(
            Query(
                dataset=Dataset.Metrics.value,
                match=Entity(entity_key.value),
                select=[Column("value")],
                where=where,
                groupby=[Column(status_key)],
            ),
            referrer="release_health.metrics.crash-free-breakdown.session",
        )["data"]
        for row in data:
            if row[status_key] == status_init:
                total = int(row["value"])
            elif row[status_key] == status_crashed:
                crashed = int(row["value"])
    return total, crashed

def _get_snuba_query_data(
    org_id: int,
    query: QueryDefinition,
    entity_key: EntityKey,
    metric_key: MetricKey,
    metric_id: int,
    columns: List[SelectableExpression],
    limit_state: _LimitState,
    extra_conditions: Optional[List[Condition]] = None,
) -> Generator[Tuple[MetricKey, _SnubaData], None, None]:
    """Get data from snuba"""
    for query_type in ("totals", "series"):
        snuba_query = _get_snuba_query(
            org_id,
            query,
            entity_key,
            metric_id,
            columns,
            series=query_type == "series",
            limit_state=limit_state,
            extra_conditions=extra_conditions or [],
        )
        referrer = REFERRERS[metric_key][query_type]
        if snuba_query is None:
            query_data = []
        else:
            query_data = raw_snql_query(snuba_query, referrer=referrer)["data"]

        limit_state.update(snuba_query.groupby, query_data)
        yield (metric_key, query_data)

def wip_snql_query(
    selected_columns,
    query,
    params,
    equations=None,
    orderby=None,
    offset=None,
    limit=50,
    referrer=None,
    auto_fields=False,
    auto_aggregations=False,
    use_aggregate_conditions=False,
    conditions=None,
    functions_acl=None,
):
    """
    Replacement API for query using snql, this function is still a work in
    progress and is not ready for use in production
    """
    builder = QueryBuilder(
        Dataset.Discover,
        params,
        query=query,
        selected_columns=selected_columns,
        orderby=orderby,
        use_aggregate_conditions=use_aggregate_conditions,
        limit=limit,
    )
    snql_query = builder.get_snql_query()
    results = raw_snql_query(snql_query, referrer)
    return results

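# --- Hedged usage sketch (not from the original source) ---
# How wip_snql_query might be invoked. The params keys (start/end/project_id/
# organization_id) are assumptions based on how discover-style helpers are
# usually parameterized; they are not defined by the snippet above.
results = wip_snql_query(
    selected_columns=["transaction", "count()"],
    query="event.type:transaction",
    params={
        "start": start,               # assumed datetime bounds
        "end": end,
        "project_id": [project.id],   # assumed project scoping
        "organization_id": org_id,
    },
    orderby=["-count()"],
    limit=10,
    referrer="api.example.wip-snql",  # assumed referrer
)
rows = results["data"]
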
def monitor_release_adoption(**kwargs):
    metrics.incr("sentry.tasks.monitor_release_adoption.start", sample_rate=1.0)
    # 1. Query snuba for all project ids that have sessions.
    with metrics.timer(
        "sentry.tasks.monitor_release_adoption.aggregate_projects.loop", sample_rate=1.0
    ):
        aggregated_projects = defaultdict(list)
        start_time = time.time()
        offset = 0
        while (time.time() - start_time) < MAX_SECONDS:
            query = (
                Query(
                    dataset="sessions",
                    match=Entity("org_sessions"),
                    select=[
                        Column("org_id"),
                        Column("project_id"),
                    ],
                    groupby=[Column("org_id"), Column("project_id")],
                    where=[
                        Condition(
                            Column("started"), Op.GTE, datetime.utcnow() - timedelta(hours=6)
                        ),
                        Condition(Column("started"), Op.LT, datetime.utcnow()),
                    ],
                    granularity=Granularity(3600),
                    orderby=[
                        OrderBy(Column("org_id"), Direction.ASC),
                        OrderBy(Column("project_id"), Direction.ASC),
                    ],
                )
                .set_limit(CHUNK_SIZE + 1)
                .set_offset(offset)
            )
            data = snuba.raw_snql_query(query, referrer="tasks.monitor_release_adoption")["data"]
            count = len(data)
            more_results = count > CHUNK_SIZE
            offset += CHUNK_SIZE

            if more_results:
                data = data[:-1]

            for row in data:
                aggregated_projects[row["org_id"]].append(row["project_id"])

            if not more_results:
                break

        else:
            logger.info(
                "monitor_release_adoption.loop_timeout",
                sample_rate=1.0,
                extra={"offset": offset},
            )

    with metrics.timer(
        "sentry.tasks.monitor_release_adoption.process_projects_with_sessions", sample_rate=1.0
    ):
        for org_id in aggregated_projects:
            process_projects_with_sessions.delay(org_id, aggregated_projects[org_id])

def build_project_usage_outcomes(start__stop, project):
    start, stop = start__stop

    # XXX(epurkhiser): Tsdb used to use day buckets, where the end would
    # represent a whole day. Snuba queries are more accurate, thus we must
    # capture the entire last day.
    end = stop + timedelta(days=1)

    query = Query(
        dataset=Dataset.Outcomes.value,
        match=Entity("outcomes"),
        select=[
            Column("outcome"),
            Column("category"),
            Function("sum", [Column("quantity")], "total"),
        ],
        where=[
            Condition(Column("timestamp"), Op.GTE, start),
            Condition(Column("timestamp"), Op.LT, end),
            Condition(Column("project_id"), Op.EQ, project.id),
            Condition(Column("org_id"), Op.EQ, project.organization_id),
            Condition(
                Column("outcome"),
                Op.IN,
                [Outcome.ACCEPTED, Outcome.FILTERED, Outcome.RATE_LIMITED],
            ),
            Condition(
                Column("category"),
                Op.IN,
                [*DataCategory.error_categories(), DataCategory.TRANSACTION],
            ),
        ],
        groupby=[Column("outcome"), Column("category")],
        granularity=Granularity(ONE_DAY),
    )
    data = raw_snql_query(query, referrer="reports.outcomes")["data"]

    return (
        # Accepted errors
        sum(
            row["total"]
            for row in data
            if row["category"] in DataCategory.error_categories()
            and row["outcome"] == Outcome.ACCEPTED
        ),
        # Dropped errors
        sum(
            row["total"]
            for row in data
            if row["category"] in DataCategory.error_categories()
            and row["outcome"] == Outcome.RATE_LIMITED
        ),
        # Accepted transactions
        sum(
            row["total"]
            for row in data
            if row["category"] == DataCategory.TRANSACTION
            and row["outcome"] == Outcome.ACCEPTED
        ),
        # Dropped transactions
        sum(
            row["total"]
            for row in data
            if row["category"] == DataCategory.TRANSACTION
            and row["outcome"] == Outcome.RATE_LIMITED
        ),
    )

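# --- Hedged usage sketch (not from the original source) ---
# build_project_usage_outcomes returns a fixed 4-tuple; a caller might unpack
# it like this. The variable names are illustrative only.
(
    accepted_errors,
    dropped_errors,
    accepted_transactions,
    dropped_transactions,
) = build_project_usage_outcomes((start, stop), project)
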
def run_outcomes_query_timeseries(query: QueryDefinition) -> ResultSet:
    snql_query = Query(
        dataset=query.dataset.value,
        match=Entity(query.match),
        select=query.select_params,
        groupby=query.group_by + [Column(TS_COL)],
        where=query.conditions,
        limit=Limit(10000),
        offset=Offset(0),
        granularity=Granularity(query.rollup),
    )
    result_timeseries = raw_snql_query(snql_query, referrer="outcomes.timeseries")
    return _format_rows(result_timeseries["data"], query)

def get_changed_project_release_model_adoptions(
    self,
    project_ids: Sequence[ProjectId],
) -> Sequence[ProjectRelease]:

    now = datetime.now(pytz.utc)
    start = now - timedelta(days=3)

    projects_ids = list(project_ids)

    if len(projects_ids) == 0:
        return []

    org_id = self._get_org_id(project_ids)
    release_column_name = tag_key(org_id, "release")

    query_cols = [Column("project_id"), Column(release_column_name)]
    group_by = query_cols

    where_clause = [
        Condition(Column("org_id"), Op.EQ, org_id),
        Condition(Column("project_id"), Op.IN, project_ids),
        Condition(Column("metric_id"), Op.EQ, metric_id(org_id, "session")),
        Condition(Column("timestamp"), Op.GTE, start),
        Condition(Column("timestamp"), Op.LT, now),
    ]

    query = Query(
        dataset=Dataset.Metrics.value,
        match=Entity("metrics_counters"),
        select=query_cols,
        where=where_clause,
        groupby=group_by,
    )
    result = raw_snql_query(
        query,
        referrer="release_health.metrics.get_changed_project_release_model_adoptions",
        use_cache=False,
    )

    def extract_row_info(row: Mapping[str, Union[OrganizationId, str]]) -> ProjectRelease:
        return row.get("project_id"), reverse_tag_value(
            org_id, row.get(release_column_name)
        )  # type: ignore

    return [extract_row_info(row) for row in result["data"]]

def test_basic(self) -> None:
    now = datetime.now()
    self._insert_event_for_time(now)

    query = (
        Query(dataset="events", match=Entity("events"))
        .set_select([Function("count", [], "count")])
        .set_groupby([Column("project_id")])
        .set_where(
            [
                Condition(Column("project_id"), Op.EQ, self.project.id),
                Condition(Column("timestamp"), Op.GTE, now - timedelta(days=1)),
                Condition(Column("timestamp"), Op.LT, now + timedelta(days=1)),
            ]
        )
    )

    result = snuba.raw_snql_query(query)
    assert len(result["data"]) == 1
    assert result["data"][0] == {"count": 1, "project_id": self.project.id}

def _get_project_releases_count(
    organization_id: int,
    project_ids: Sequence[int],
    scope: str,
    stats_period: Optional[str] = None,
    environments: Optional[Sequence[str]] = None,
) -> int:
    """
    Fetches the total count of releases/project combinations
    """
    if stats_period is None:
        stats_period = "24h"

    # Special rule that we support sorting by the last 24h only.
    if scope.endswith("_24h"):
        stats_period = "24h"

    _, stats_start, _ = get_rollup_starts_and_buckets(stats_period)

    where = [
        Condition(Column("started"), Op.GTE, stats_start),
        Condition(Column("started"), Op.LT, datetime.now()),
        Condition(Column("project_id"), Op.IN, project_ids),
        Condition(Column("org_id"), Op.EQ, organization_id),
    ]
    if environments is not None:
        where.append(Condition(Column("environment"), Op.IN, environments))

    having = []
    # Filter out releases with zero users when sorting by either `users` or `crash_free_users`
    if scope in ["users", "crash_free_users"]:
        having.append(Condition(Column("users"), Op.GT, 0))

    query = Query(
        dataset="sessions",
        match=Entity("sessions"),
        select=[
            Function("uniqExact", [Column("release"), Column("project_id")], alias="count")
        ],
        where=where,
        having=having,
    )
    data = snuba.raw_snql_query(query, referrer="snuba.sessions.get_project_releases_count")[
        "data"
    ]
    return data[0]["count"] if data else 0

def data_fn(offset: int, limit: int) -> Any:
    builder = QueryBuilder(
        dataset=Dataset.Discover,
        params=params,
        selected_columns=["spans_op", "count()"],
        array_join="spans_op",
        query=query,
        limit=limit,
        offset=offset,
        orderby="-count",
    )
    snql_query = builder.get_snql_query()
    results = raw_snql_query(snql_query, "api.organization-events-span-ops")
    return [
        SpanOp(op=row["spans_op"], count=row["count"]) for row in results["data"]
    ]

def check_releases_have_health_data(
    self,
    organization_id: OrganizationId,
    project_ids: Sequence[ProjectId],
    release_versions: Sequence[ReleaseName],
    start: datetime,
    end: datetime,
) -> Set[ReleaseName]:

    release_column_name = tag_key(organization_id, "release")
    releases_ids = [
        release_id
        for release_id in [
            try_get_string_index(organization_id, release) for release in release_versions
        ]
        if release_id is not None
    ]
    query = Query(
        dataset=Dataset.Metrics.value,
        match=Entity("metrics_counters"),
        select=[Column(release_column_name)],
        where=[
            Condition(Column("org_id"), Op.EQ, organization_id),
            Condition(Column("project_id"), Op.IN, project_ids),
            Condition(Column("metric_id"), Op.EQ, metric_id(organization_id, "session")),
            Condition(Column(release_column_name), Op.IN, releases_ids),
            Condition(Column("timestamp"), Op.GTE, start),
            Condition(Column("timestamp"), Op.LT, end),
        ],
        groupby=[Column(release_column_name)],
    )

    result = raw_snql_query(
        query,
        referrer="release_health.metrics.check_releases_have_health_data",
        use_cache=False,
    )

    def extract_row_info(row: Mapping[str, Union[OrganizationId, str]]) -> ReleaseName:
        return reverse_tag_value(organization_id, row.get(release_column_name))  # type: ignore

    return {extract_row_info(row) for row in result["data"]}

def test_cache(self):
    """Minimal test to verify if use_cache works"""
    results = snuba.raw_snql_query(
        Query(
            "events",
            Entity("events"),
            select=[Column("event_id")],
            where=[
                Condition(Column("project_id"), Op.EQ, self.project.id),
                Condition(Column("timestamp"), Op.GTE, timezone.now() - timedelta(days=1)),
                Condition(Column("timestamp"), Op.LT, timezone.now()),
            ],
            limit=Limit(1),
        ),
        use_cache=True,
    )
    assert results["data"] == []

def _get_crash_free_rate_data(
    org_id: int,
    project_ids: Sequence[int],
    start: datetime,
    end: datetime,
    rollup: int,
) -> Dict[int, Dict[str, float]]:

    data: Dict[int, Dict[str, float]] = {}

    session_status = tag_key(org_id, "session.status")

    count_query = Query(
        dataset=Dataset.Metrics.value,
        match=Entity(EntityKey.MetricsCounters.value),
        select=[Column("value")],
        where=[
            Condition(Column("org_id"), Op.EQ, org_id),
            Condition(Column("project_id"), Op.IN, project_ids),
            Condition(Column("metric_id"), Op.EQ, metric_id(org_id, "session")),
            Condition(Column("timestamp"), Op.GTE, start),
            Condition(Column("timestamp"), Op.LT, end),
        ],
        groupby=[
            Column("project_id"),
            Column(session_status),
        ],
        granularity=Granularity(rollup),
    )

    count_data = raw_snql_query(
        count_query, referrer="release_health.metrics.get_crash_free_data", use_cache=False
    )["data"]

    for row in count_data:
        project_data = data.setdefault(row["project_id"], {})
        tag_value = reverse_tag_value(org_id, row[session_status])
        project_data[tag_value] = row["value"]

    return data

def _get_snuba_query_data(
    org_id: int,
    query: QueryDefinition,
    entity_key: EntityKey,
    metric_key: MetricKey,
    metric_id: int,
    columns: List[SelectableExpression],
    limit_state: _LimitState,
    extra_conditions: Optional[List[Condition]] = None,
) -> Generator[Tuple[MetricKey, _SnubaData], None, None]:
    """Get data from snuba"""
    for query_type in ("totals", "series"):
        snuba_query = _get_snuba_query(
            org_id,
            query,
            entity_key,
            metric_id,
            columns,
            series=query_type == "series",
            limit_state=limit_state,
            extra_conditions=extra_conditions or [],
        )
        referrer = REFERRERS[metric_key][query_type]
        if snuba_query is None:
            query_data = []
        else:
            query_data = raw_snql_query(snuba_query, referrer=referrer)["data"]

        if not query_data:
            # If the first totals query returned empty results,
            # 1. there is no need to query time series,
            # 2. we do not update the LimitState. This gives the next query
            #    the chance to populate the groups.
            #    For example: if the first totals query fetches count_uniq(users),
            #    but a project does not track users at all, we should order by
            #    the results of the second totals query instead.
            break

        assert snuba_query is not None
        limit_state.update(snuba_query.groupby, query_data)

        yield (metric_key, query_data)

def _count_users(total: bool, referrer: str) -> Dict[Any, int]:
    query = Query(
        dataset=Dataset.Metrics.value,
        match=Entity(EntityKey.MetricsSets.value),
        select=[Column("value")],
        where=_get_common_where(total)
        + [
            Condition(Column("metric_id"), Op.EQ, metric_id(org_id, "user")),
        ],
        groupby=_get_common_groupby(total),
    )

    return _convert_results(
        raw_snql_query(
            query,
            referrer=referrer,
            use_cache=False,
        )["data"],
        total=total,
    )

def query_p95(interval):
    start, stop = interval
    query = Query(
        dataset=Dataset.Transactions.value,
        match=Entity("transactions"),
        select=[
            Column("transaction_name"),
            Function("quantile(0.95)", [Column("duration")], "p95"),
        ],
        where=[
            Condition(Column("finish_ts"), Op.GTE, start),
            Condition(Column("finish_ts"), Op.LT, stop + timedelta(days=1)),
            Condition(Column("transaction_name"), Op.IN, transaction_names),
            Condition(Column("project_id"), Op.EQ, project.id),
        ],
        groupby=[Column("transaction_name")],
    )
    return raw_snql_query(query, referrer="reports.key_transactions.p95")

def build_key_errors(interval, project):
    start, stop = interval

    # Take the 3 most frequently occurring events
    query = Query(
        dataset=Dataset.Events.value,
        match=Entity("events"),
        select=[Column("group_id"), Function("count", [])],
        where=[
            Condition(Column("timestamp"), Op.GTE, start),
            Condition(Column("timestamp"), Op.LT, stop + timedelta(days=1)),
            Condition(Column("project_id"), Op.EQ, project.id),
        ],
        groupby=[Column("group_id")],
        orderby=[OrderBy(Function("count", []), Direction.DESC)],
        limit=Limit(3),
    )
    query_result = raw_snql_query(query, referrer="reports.key_errors")
    key_errors = query_result["data"]
    return [(e["group_id"], e["count()"]) for e in key_errors]

def get_oldest_health_data_for_releases(
    self,
    project_releases: Sequence[ProjectRelease],
) -> Mapping[ProjectRelease, str]:

    now = datetime.now(pytz.utc)
    start = now - timedelta(days=90)

    project_ids: List[ProjectId] = [x[0] for x in project_releases]
    org_id = self._get_org_id(project_ids)
    release_column_name = tag_key(org_id, "release")
    releases = [x[1] for x in project_releases]
    releases_ids = [
        release_id
        for release_id in [try_get_string_index(org_id, release) for release in releases]
        if release_id is not None
    ]

    query_cols = [
        Column("project_id"),
        Column(release_column_name),
        Function("min", [Column("bucketed_time")], "oldest"),
    ]

    group_by = [
        Column("project_id"),
        Column(release_column_name),
    ]

    where_clause = [
        Condition(Column("org_id"), Op.EQ, org_id),
        Condition(Column("project_id"), Op.IN, project_ids),
        Condition(Column("metric_id"), Op.EQ, metric_id(org_id, "session")),
        Condition(Column("timestamp"), Op.GTE, start),
        Condition(Column("timestamp"), Op.LT, now),
        Condition(Column(release_column_name), Op.IN, releases_ids),
    ]

    query = Query(
        dataset=Dataset.Metrics.value,
        match=Entity("metrics_counters"),
        select=query_cols,
        where=where_clause,
        groupby=group_by,
        granularity=Granularity(3600),
    )
    rows = raw_snql_query(
        query,
        referrer="release_health.metrics.get_oldest_health_data_for_releases",
        use_cache=False,
    )["data"]

    result = {}

    for row in rows:
        result[row["project_id"], reverse_tag_value(org_id, row[release_column_name])] = row[
            "oldest"
        ]

    return result

def check_has_health_data(
    self, projects_list: Sequence[ProjectOrRelease]
) -> Set[ProjectOrRelease]:
    now = datetime.now(pytz.utc)
    start = now - timedelta(days=3)

    projects_list = list(projects_list)

    if len(projects_list) == 0:
        return set()

    includes_releases = isinstance(projects_list[0], tuple)

    if includes_releases:
        project_ids: List[ProjectId] = [x[0] for x in projects_list]  # type: ignore
    else:
        project_ids = projects_list  # type: ignore

    org_id = self._get_org_id(project_ids)

    where_clause = [
        Condition(Column("org_id"), Op.EQ, org_id),
        Condition(Column("project_id"), Op.IN, project_ids),
        Condition(Column("metric_id"), Op.EQ, metric_id(org_id, "session")),
        Condition(Column("timestamp"), Op.GTE, start),
        Condition(Column("timestamp"), Op.LT, now),
    ]

    if includes_releases:
        releases = [x[1] for x in projects_list]  # type: ignore
        release_column_name = tag_key(org_id, "release")
        releases_ids = get_tag_values_list(org_id, releases)
        where_clause.append(Condition(Column(release_column_name), Op.IN, releases_ids))
        column_names = ["project_id", release_column_name]
    else:
        column_names = ["project_id"]

    def extract_row_info_func(
        include_releases: bool,
    ) -> Callable[[Mapping[str, Union[int, str]]], ProjectOrRelease]:
        def f(row: Mapping[str, Union[int, str]]) -> ProjectOrRelease:
            if include_releases:
                return row["project_id"], reverse_tag_value(
                    org_id, row.get(release_column_name)
                )  # type: ignore
            else:
                return row["project_id"]  # type: ignore

        return f

    extract_row_info = extract_row_info_func(includes_releases)

    query_cols = [Column(column_name) for column_name in column_names]
    group_by_clause = query_cols

    query = Query(
        dataset=Dataset.Metrics.value,
        match=Entity(EntityKey.MetricsCounters.value),
        select=query_cols,
        where=where_clause,
        groupby=group_by_clause,
    )

    result = raw_snql_query(
        query, referrer="release_health.metrics.check_has_health_data", use_cache=False
    )

    return {extract_row_info(row) for row in result["data"]}

def get_release_sessions_time_bounds(
    self,
    project_id: ProjectId,
    release: ReleaseName,
    org_id: OrganizationId,
    environments: Optional[Sequence[EnvironmentName]] = None,
) -> ReleaseSessionsTimeBounds:
    select: List[SelectableExpression] = [
        Function("min", [Column("timestamp")], "min"),
        Function("max", [Column("timestamp")], "max"),
    ]

    try:
        where: List[Union[BooleanCondition, Condition]] = [
            Condition(Column("org_id"), Op.EQ, org_id),
            Condition(Column("project_id"), Op.EQ, project_id),
            Condition(Column(tag_key(org_id, "release")), Op.EQ, tag_value(org_id, release)),
            Condition(Column("timestamp"), Op.GTE, datetime.min),
            Condition(Column("timestamp"), Op.LT, datetime.now(pytz.utc)),
        ]

        if environments is not None:
            env_filter = get_tag_values_list(org_id, environments)
            if not env_filter:
                raise MetricIndexNotFound()

            where.append(Condition(Column(tag_key(org_id, "environment")), Op.IN, env_filter))
    except MetricIndexNotFound:
        # Some filter condition can't be constructed and therefore can't be
        # satisfied.
        #
        # Ignore return type because of https://github.com/python/mypy/issues/8533
        return {"sessions_lower_bound": None, "sessions_upper_bound": None}  # type: ignore

    # XXX(markus): We know that this combination of queries is not fully
    # equivalent to the sessions-table based backend. Example:
    #
    # 1. Session sid=x is started with timestamp started=n
    # 2. Same sid=x is updated with new payload with timestamp started=n - 1
    #
    # Old sessions backend would return [n - 1 ; n - 1] as range.
    # New metrics backend would return [n ; n - 1] as range.
    #
    # We don't yet know if this case is relevant. Session's started
    # timestamp shouldn't really change as session status is updated
    # though.

    try:
        # Take care of initial values for session.started by querying the
        # init counter. This should take care of most cases on its own.
        init_sessions_query = Query(
            dataset=Dataset.Metrics.value,
            match=Entity(EntityKey.MetricsCounters.value),
            select=select,
            where=where
            + [
                Condition(Column("metric_id"), Op.EQ, metric_id(org_id, "session")),
                Condition(
                    Column(tag_key(org_id, "session.status")), Op.EQ, tag_value(org_id, "init")
                ),
            ],
        )
        rows = raw_snql_query(
            init_sessions_query,
            referrer="release_health.metrics.get_release_sessions_time_bounds.init_sessions",
            use_cache=False,
        )["data"]
    except MetricIndexNotFound:
        rows = []

    try:
        # Take care of potential timestamp updates by looking at the metric
        # for session duration, which is emitted once a session is closed ("terminal state").
        #
        # There is a testcase checked in that tests specifically for a
        # session update that lowers session.started. We don't know if that
        # testcase matters particularly.
        terminal_sessions_query = Query(
            dataset=Dataset.Metrics.value,
            match=Entity(EntityKey.MetricsDistributions.value),
            select=select,
            where=where
            + [
                Condition(Column("metric_id"), Op.EQ, metric_id(org_id, "session.duration")),
            ],
        )
        rows.extend(
            raw_snql_query(
                terminal_sessions_query,
                referrer="release_health.metrics.get_release_sessions_time_bounds.terminal_sessions",
                use_cache=False,
            )["data"]
        )
    except MetricIndexNotFound:
        pass

    # This check is added because if there are no sessions found, then the
    # aggregations query returns both the sessions_lower_bound and the
    # sessions_upper_bound as the `0` timestamp, and we do not want that
    # behaviour by default.
    # P.S. To avoid confusion: the `0` timestamp, which is '1970-01-01 00:00:00',
    # is rendered as '0000-00-00 00:00:00' in the clickhouse shell.
    formatted_unix_start_time = datetime.utcfromtimestamp(0).strftime("%Y-%m-%dT%H:%M:%S+00:00")

    lower_bound: Optional[str] = None
    upper_bound: Optional[str] = None

    for row in rows:
        if set(row.values()) == {formatted_unix_start_time}:
            continue
        if lower_bound is None or row["min"] < lower_bound:
            lower_bound = row["min"]
        if upper_bound is None or row["max"] > upper_bound:
            upper_bound = row["max"]

    if lower_bound is None or upper_bound is None:
        return {"sessions_lower_bound": None, "sessions_upper_bound": None}  # type: ignore

    def iso_format_snuba_datetime(date: str) -> str:
        return datetime.strptime(date, "%Y-%m-%dT%H:%M:%S+00:00").isoformat()[:19] + "Z"

    return {  # type: ignore
        "sessions_lower_bound": iso_format_snuba_datetime(lower_bound),
        "sessions_upper_bound": iso_format_snuba_datetime(upper_bound),
    }

def query(
    self,
    projects: Sequence[Project],
    retention_window_start: Optional[datetime],
    group_queryset: QuerySet,
    environments: Sequence[Environment],
    sort_by: str,
    limit: int,
    cursor: Optional[Cursor],
    count_hits: bool,
    paginator_options: Mapping[str, Any],
    search_filters: Sequence[SearchFilter],
    date_from: Optional[datetime],
    date_to: Optional[datetime],
    max_hits=None,
) -> CursorResult:

    if not validate_cdc_search_filters(search_filters):
        raise InvalidQueryForExecutor("Search filters invalid for this query executor")

    start, end, retention_date = self.calculate_start_end(
        retention_window_start, search_filters, date_from, date_to
    )

    if start == retention_date and end == retention_date:
        # Both `start` and `end` must have been trimmed to `retention_date`,
        # so this entire search was against a time range that is outside of
        # retention. We'll return empty results to maintain backwards compatibility
        # with Django search (for now).
        return self.empty_result

    if start >= end:
        # TODO: This maintains backwards compatibility with Django search, but
        # in the future we should find a way to notify the user that their search
        # is invalid.
        return self.empty_result

    e_event = self.entities["event"]
    e_group = self.entities["group"]

    where_conditions = [
        Condition(Column("project_id", e_event), Op.IN, [p.id for p in projects]),
        Condition(Column("timestamp", e_event), Op.GTE, start),
        Condition(Column("timestamp", e_event), Op.LT, end),
    ]
    # TODO: This is still basically only handling status, handle this better once we introduce
    # more conditions.
    for search_filter in search_filters:
        where_conditions.append(
            Condition(
                Column(search_filter.key.name, e_group), Op.IN, search_filter.value.raw_value
            )
        )

    if environments:
        # TODO: Should this be handled via filter_keys, once we have a snql compatible version?
        where_conditions.append(
            Condition(Column("environment", e_event), Op.IN, [e.name for e in environments])
        )

    sort_func = self.aggregation_defs[self.sort_strategies[sort_by]]

    having = []
    if cursor is not None:
        op = Op.GTE if cursor.is_prev else Op.LTE
        having.append(Condition(sort_func, op, cursor.value))

    query = Query(
        "events",
        match=Join([Relationship(e_event, "grouped", e_group)]),
        select=[
            Column("id", e_group),
            replace(sort_func, alias="score"),
        ],
        where=where_conditions,
        groupby=[Column("id", e_group)],
        having=having,
        orderby=[OrderBy(sort_func, direction=Direction.DESC)],
        limit=Limit(limit + 1),
    )
    data = snuba.raw_snql_query(query, referrer="search.snuba.cdc_search.query")["data"]

    hits_query = Query(
        "events",
        match=Join([Relationship(e_event, "grouped", e_group)]),
        select=[
            Function("uniq", [Column("id", e_group)], alias="count"),
        ],
        where=where_conditions,
    )
    hits = None
    if count_hits:
        hits = snuba.raw_snql_query(hits_query, referrer="search.snuba.cdc_search.hits")["data"][
            0
        ]["count"]

    paginator_results = SequencePaginator(
        [(row["score"], row["g.id"]) for row in data],
        reverse=True,
        **paginator_options,
    ).get_result(limit, cursor, known_hits=hits, max_hits=max_hits)

    # We filter against `group_queryset` here so that we recheck all conditions in Postgres.
    # Since replay between Postgres and Clickhouse can happen, we might get back results that
    # have changed state in Postgres. By rechecking them we guarantee that any returned results
    # have the correct state.
    # TODO: This can result in us returning less than a full page of results, but shouldn't
    # affect cursors. If we want to, we can iterate and query snuba until we manage to get a
    # full page. In practice, this will likely only skip a couple of results at worst, and
    # probably not be noticeable to the user, so holding off for now to reduce complexity.
    groups = group_queryset.in_bulk(paginator_results.results)
    paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

    return paginator_results