def _extract_data(self, entity, data, groups):
    tags = tuple(
        (key, data[key]) for key in sorted(data.keys()) if key.startswith("tags[")
    )
    metric_name = reverse_resolve(data["metric_id"])
    ops = self._ops_by_metric[metric_name]
    tag_data = groups.setdefault(
        tags,
        {
            "totals": {},
        },
    )

    timestamp = data.pop(TS_COL_GROUP, None)
    if timestamp is not None:
        timestamp = parse_snuba_datetime(timestamp)

    for op in ops:
        key = f"{op}({metric_name})"

        field = _OP_TO_FIELD[entity][op].snuba_alias
        value = data[field]
        if field == "percentiles":
            value = value[Percentile[op].value]

        # If this is time series data, add it to the appropriate series.
        # Else, add to totals
        if timestamp is None:
            tag_data["totals"][key] = finite_or_none(value)
        else:
            series = tag_data.setdefault("series", {}).setdefault(
                key, len(self._intervals) * [_DEFAULT_AGGREGATES[op]]
            )
            series_index = self._timestamp_index[timestamp]
            series[series_index] = finite_or_none(value)
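
# Illustrative sketch of the grouping key built in _extract_data above, using a
# hypothetical result row (the column names and values are assumptions, not
# source data): sorted "tags[...]" columns become a hashable tuple, so result
# rows that share the same tag values accumulate into the same `groups` entry.
example_row = {"tags[environment]": "prod", "tags[release]": "1.0", "metric_id": 7}
example_key = tuple(
    (key, example_row[key]) for key in sorted(example_row.keys()) if key.startswith("tags[")
)
assert example_key == (("tags[environment]", "prod"), ("tags[release]", "1.0"))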
def get_project_release_stats(project_id, release, stat, rollup, start, end, environments=None):
    assert stat in ("users", "sessions")
    filter_keys = {"project_id": [project_id]}
    conditions = [["release", "=", release]]
    if environments is not None:
        conditions.append(["environment", "IN", environments])

    buckets = int((end - start).total_seconds() / rollup)
    stats = _make_stats(start, rollup, buckets, default=None)

    for rv in raw_query(
        dataset=Dataset.Sessions,
        selected_columns=[
            "bucketed_started",
            "release",
            stat,
            stat + "_crashed",
            stat + "_abnormal",
            stat + "_errored",
            "duration_quantiles",
        ],
        groupby=["bucketed_started", "release", "project_id"],
        start=start,
        end=end,
        rollup=rollup,
        conditions=conditions,
        filter_keys=filter_keys,
    )["data"]:
        ts = parse_snuba_datetime(rv["bucketed_started"])
        # Index buckets forward from `start`; `_make_stats` builds the series
        # starting at `start`, so indexing by (end - ts) would be reversed and
        # could run one past the end of the list when ts == start.
        bucket = int((ts - start).total_seconds() / rollup)
        stats[bucket][1] = {
            stat: rv[stat],
            stat + "_crashed": rv[stat + "_crashed"],
            stat + "_abnormal": rv[stat + "_abnormal"],
            stat + "_errored": rv[stat + "_errored"] - rv[stat + "_crashed"],
            "duration_p50": _convert_duration(rv["duration_quantiles"][0]),
            "duration_p90": _convert_duration(rv["duration_quantiles"][1]),
        }

    for idx, bucket in enumerate(stats):
        if bucket[1] is None:
            stats[idx][1] = {
                stat: 0,
                stat + "_crashed": 0,
                stat + "_abnormal": 0,
                stat + "_errored": 0,
                "duration_p50": None,
                "duration_p90": None,
            }

    return stats
def _extract_data(self, data, groups):
    tags = tuple(
        (key, data[key])
        for key in sorted(data.keys())
        if (key.startswith("tags[") or key in ALLOWED_GROUPBY_COLUMNS)
    )
    tag_data = groups.setdefault(
        tags,
        {"totals": {}, "series": {}},
    )

    bucketed_time = data.pop(TS_COL_GROUP, None)
    if bucketed_time is not None:
        bucketed_time = parse_snuba_datetime(bucketed_time)

    # We query the union of the query_definition fields and the
    # fields_in_entities from the QueryBuilder. This is necessary because it
    # contains the constituent instances of SingularEntityDerivedMetric for
    # instances of CompositeEntityDerivedMetric.
    for op, metric_name in self._set_of_constituent_queries:
        key = f"{op}({metric_name})" if op else metric_name
        default_null_value = metric_object_factory(op, metric_name).generate_default_null_values()
        try:
            value = data[key]
        except KeyError:
            # This can occur with derived metrics that are generated from
            # post-query operations (and so have no direct mapping to the
            # query results), or with raw metrics that don't exist in
            # ClickHouse yet.
            cleaned_value = default_null_value
        else:
            if op in OPERATIONS_PERCENTILES:
                value = value[0]
            cleaned_value = finite_or_none(value)

        if bucketed_time is None:
            # Only update the value when the key does not exist yet or it
            # still holds the default.
            if key not in tag_data["totals"] or tag_data["totals"][key] == default_null_value:
                tag_data["totals"][key] = cleaned_value

        if bucketed_time is not None or tag_data["totals"][key] == default_null_value:
            empty_values = len(self._intervals) * [default_null_value]
            series = tag_data["series"].setdefault(key, empty_values)

            if bucketed_time is not None:
                series_index = self._timestamp_index[bucketed_time]
                if series[series_index] == default_null_value:
                    series[series_index] = cleaned_value
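
# Hedged sketch of how the result keys above are derived; the op/metric pairs
# below are hypothetical examples of what self._set_of_constituent_queries
# might contain, not values taken from the source. A plain aggregate produces
# "op(metric)", while a derived metric with no op is keyed by its bare name.
for example_op, example_metric in [
    ("sum", "sentry.sessions.session"),
    (None, "session.crash_free_rate"),
]:
    example_key = f"{example_op}({example_metric})" if example_op else example_metric
    print(example_key)  # "sum(sentry.sessions.session)", then "session.crash_free_rate"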
def build_project_series(start__stop, project):
    start, stop = start__stop
    rollup = ONE_DAY

    resolution, series = tsdb.get_optimal_rollup_series(start, stop, rollup)
    assert resolution == rollup, "resolution does not match requested value"

    clean = partial(clean_series, start, stop, rollup)

    def zerofill_clean(data):
        return clean(zerofill(data, start, stop, rollup, fill_default=0))

    # Note: this section can be removed
    issue_ids = project.group_set.filter(
        status=GroupStatus.RESOLVED, resolved_at__gte=start, resolved_at__lt=stop
    ).values_list("id", flat=True)

    # TODO: The TSDB calls could be replaced with a SnQL call here
    tsdb_range_resolved = _query_tsdb_groups_chunked(tsdb.get_range, issue_ids, start, stop, rollup)
    resolved_error_series = reduce(
        merge_series,
        map(clean, tsdb_range_resolved.values()),
        clean([(timestamp, 0) for timestamp in series]),
    )
    # end

    # Use outcomes to compute total errors and transactions
    outcomes_query = Query(
        dataset=Dataset.Outcomes.value,
        match=Entity("outcomes"),
        select=[
            Column("time"),
            Column("category"),
            Function("sum", [Column("quantity")], "total"),
        ],
        where=[
            Condition(Column("timestamp"), Op.GTE, start),
            Condition(Column("timestamp"), Op.LT, stop + timedelta(days=1)),
            Condition(Column("project_id"), Op.EQ, project.id),
            Condition(Column("org_id"), Op.EQ, project.organization_id),
            Condition(Column("outcome"), Op.EQ, Outcome.ACCEPTED),
            Condition(
                Column("category"),
                Op.IN,
                [*DataCategory.error_categories(), DataCategory.TRANSACTION],
            ),
        ],
        groupby=[Column("time"), Column("category")],
        granularity=Granularity(rollup),
        orderby=[OrderBy(Column("time"), Direction.ASC)],
    )
    outcome_series = raw_snql_query(outcomes_query, referrer="reports.outcome_series")

    total_error_series = OrderedDict()
    for v in outcome_series["data"]:
        if v["category"] in DataCategory.error_categories():
            timestamp = int(to_timestamp(parse_snuba_datetime(v["time"])))
            total_error_series[timestamp] = total_error_series.get(timestamp, 0) + v["total"]
    total_error_series = zerofill_clean(list(total_error_series.items()))

    transaction_series = [
        (int(to_timestamp(parse_snuba_datetime(v["time"]))), v["total"])
        for v in outcome_series["data"]
        if v["category"] == DataCategory.TRANSACTION
    ]
    transaction_series = zerofill_clean(transaction_series)

    error_series = merge_series(
        resolved_error_series,
        total_error_series,
        lambda resolved, total: (resolved, total - resolved),  # Resolved, Unresolved
    )

    # Format of this series: [(resolved, unresolved, transactions)]
    return merge_series(
        error_series,
        transaction_series,
        lambda errors, transactions: errors + (transactions,),
    )
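
# A minimal sketch of the merge_series contract assumed above: two
# timestamp-aligned series are zipped and their values combined pointwise.
# The helper below re-implements that assumed contract for illustration only;
# the timestamps and counts are made up.
def _merge_series_sketch(left, right, function):
    return [(ts_l, function(v_l, v_r)) for (ts_l, v_l), (ts_r, v_r) in zip(left, right)]

assert _merge_series_sketch(
    [(1000, (2, 5))],  # (resolved, unresolved) errors in one bucket
    [(1000, 9)],       # transactions in the same bucket
    lambda errors, transactions: errors + (transactions,),
) == [(1000, (2, 5, 9))]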
def get_project_release_stats(project_id, release, stat, rollup, start, end, environments=None):
    assert stat in ("users", "sessions")
    filter_keys = {"project_id": [project_id]}
    conditions = [["release", "=", release]]
    if environments is not None:
        conditions.append(["environment", "IN", environments])

    buckets = int((end - start).total_seconds() / rollup)
    stats = _make_stats(start, rollup, buckets, default=None)
    totals = {stat: 0, stat + "_crashed": 0, stat + "_abnormal": 0, stat + "_errored": 0}

    for rv in raw_query(
        dataset=Dataset.Sessions,
        selected_columns=[
            "bucketed_started",
            stat,
            stat + "_crashed",
            stat + "_abnormal",
            stat + "_errored",
            "duration_quantiles",
        ],
        groupby=["bucketed_started"],
        start=start,
        end=end,
        rollup=rollup,
        conditions=conditions,
        filter_keys=filter_keys,
    )["data"]:
        ts = parse_snuba_datetime(rv["bucketed_started"])
        bucket = int((ts - start).total_seconds() / rollup)
        stats[bucket][1] = {
            stat: rv[stat],
            stat + "_crashed": rv[stat + "_crashed"],
            stat + "_abnormal": rv[stat + "_abnormal"],
            stat + "_errored": rv[stat + "_errored"] - rv[stat + "_crashed"],
            "duration_p50": _convert_duration(rv["duration_quantiles"][0]),
            "duration_p90": _convert_duration(rv["duration_quantiles"][1]),
        }

        # Session stats we can sum up directly without another query
        # as the data becomes available.
        if stat == "sessions":
            for k in totals:
                totals[k] += rv[k]

    for idx, bucket in enumerate(stats):
        if bucket[1] is None:
            stats[idx][1] = {
                stat: 0,
                stat + "_crashed": 0,
                stat + "_abnormal": 0,
                stat + "_errored": 0,
                "duration_p50": None,
                "duration_p90": None,
            }

    # For users we need a secondary query over the entire time range
    if stat == "users":
        rows = raw_query(
            dataset=Dataset.Sessions,
            selected_columns=["users", "users_crashed", "users_abnormal", "users_errored"],
            start=start,
            end=end,
            conditions=conditions,
            filter_keys=filter_keys,
        )["data"]
        if rows:
            rv = rows[0]
            totals = {
                "users": rv["users"],
                "users_crashed": rv["users_crashed"],
                "users_abnormal": rv["users_abnormal"],
                "users_errored": rv["users_errored"] - rv["users_crashed"],
            }

    return stats, totals
def get_release_health_data_overview(
    project_releases,
    environments=None,
    summary_stats_period=None,
    health_stats_period=None,
    stat=None,
):
    """Returns an overview of the health data for the given project releases.

    The argument is a sequence of `(project_id, release_name)` tuples.  The
    return value is a dictionary mapping each of those tuples to its summary
    health data.
    """
    if stat is None:
        stat = "sessions"
    assert stat in ("sessions", "users")

    _, summary_start, _ = get_rollup_starts_and_buckets(summary_stats_period or "24h")
    conditions, filter_keys = _get_conditions_and_filter_keys(project_releases, environments)

    stats_rollup, stats_start, stats_buckets = get_rollup_starts_and_buckets(health_stats_period)

    missing_releases = set(project_releases)
    rv = {}
    for x in raw_query(
        dataset=Dataset.Sessions,
        selected_columns=[
            "release",
            "project_id",
            "duration_quantiles",
            "users",
            "sessions",
            "sessions_errored",
            "sessions_crashed",
            "users_crashed",
        ],
        groupby=["release", "project_id"],
        start=summary_start,
        conditions=conditions,
        filter_keys=filter_keys,
    )["data"]:
        rp = {
            "duration_p50": _convert_duration(x["duration_quantiles"][0]),
            "duration_p90": _convert_duration(x["duration_quantiles"][1]),
            "crash_free_users": (
                100 - x["users_crashed"] / float(x["users"]) * 100 if x["users"] else None
            ),
            "crash_free_sessions": (
                100 - x["sessions_crashed"] / float(x["sessions"]) * 100
                if x["sessions"]
                else None
            ),
            "total_users": x["users"],
            "total_sessions": x["sessions"],
            "sessions_crashed": x["sessions_crashed"],
            "sessions_errored": x["sessions_errored"],
            "has_health_data": True,
        }
        if health_stats_period:
            rp["stats"] = {
                health_stats_period: _make_stats(stats_start, stats_rollup, stats_buckets)
            }
        rv[x["project_id"], x["release"]] = rp
        missing_releases.discard((x["project_id"], x["release"]))

    # Add releases without data points
    if missing_releases:
        # If we're already looking at a 90 day horizon we don't need to
        # fire another query, we can already assume there is no data.
        if summary_stats_period != "90d":
            has_health_data = check_has_health_data(missing_releases)
        else:
            has_health_data = ()
        for key in missing_releases:
            rv[key] = {
                "duration_p50": None,
                "duration_p90": None,
                "crash_free_users": None,
                "crash_free_sessions": None,
                "total_users": 0,
                "total_sessions": 0,
                "sessions_crashed": 0,
                "sessions_errored": 0,
                "has_health_data": key in has_health_data,
            }
            if health_stats_period:
                rv[key]["stats"] = {
                    health_stats_period: _make_stats(stats_start, stats_rollup, stats_buckets)
                }

    # Fill in release adoption
    release_adoption = get_release_adoption(project_releases, environments)
    for key in rv:
        adoption_info = release_adoption.get(key) or {}
        rv[key]["adoption"] = adoption_info.get("adoption")
        rv[key]["total_users_24h"] = adoption_info.get("users_24h")
        rv[key]["total_sessions_24h"] = adoption_info.get("sessions_24h")

    if health_stats_period:
        for x in raw_query(
            dataset=Dataset.Sessions,
            selected_columns=["release", "project_id", "bucketed_started", stat],
            groupby=["release", "project_id", "bucketed_started"],
            rollup=stats_rollup,
            start=stats_start,
            conditions=conditions,
            filter_keys=filter_keys,
        )["data"]:
            time_bucket = int(
                (parse_snuba_datetime(x["bucketed_started"]) - stats_start).total_seconds()
                / stats_rollup
            )
            rv[x["project_id"], x["release"]]["stats"][health_stats_period][time_bucket][1] = x[
                stat
            ]

    return rv
def get_project_release_stats(project_id, release, stat, rollup, start, end, environments=None):
    assert stat in ("users", "sessions")

    # Since snuba end queries are exclusive of the time and we're bucketing
    # to a full hour, we need to round up to the next full hour.
    end = to_datetime((to_timestamp(end) // DATASET_BUCKET + 1) * DATASET_BUCKET)

    filter_keys = {"project_id": [project_id]}
    conditions = [["release", "=", release]]
    if environments is not None:
        conditions.append(["environment", "IN", environments])

    buckets = int((end - start).total_seconds() / rollup)
    stats = _make_stats(start, rollup, buckets, default=None)

    # Due to the nature of the probabilistic data structures some
    # subtractions can become negative.  As such we're making sure a number
    # never goes below zero to avoid confusion.
    totals = {
        stat: 0,
        stat + "_healthy": 0,
        stat + "_crashed": 0,
        stat + "_abnormal": 0,
        stat + "_errored": 0,
    }

    for rv in raw_query(
        dataset=Dataset.Sessions,
        selected_columns=[
            "bucketed_started",
            stat,
            stat + "_crashed",
            stat + "_abnormal",
            stat + "_errored",
            "duration_quantiles",
        ],
        groupby=["bucketed_started"],
        start=start,
        end=end,
        rollup=rollup,
        conditions=conditions,
        filter_keys=filter_keys,
        referrer="sessions.release-stats-details",
    )["data"]:
        ts = parse_snuba_datetime(rv["bucketed_started"])
        bucket = int((ts - start).total_seconds() / rollup)
        stats[bucket][1] = {
            stat: rv[stat],
            stat + "_healthy": max(0, rv[stat] - rv[stat + "_errored"]),
            stat + "_crashed": rv[stat + "_crashed"],
            stat + "_abnormal": rv[stat + "_abnormal"],
            stat + "_errored": max(
                0, rv[stat + "_errored"] - rv[stat + "_crashed"] - rv[stat + "_abnormal"]
            ),
            "duration_p50": _convert_duration(rv["duration_quantiles"][0]),
            "duration_p90": _convert_duration(rv["duration_quantiles"][1]),
        }

        # Session stats we can sum up directly without another query
        # as the data becomes available.
        if stat == "sessions":
            for k in totals:
                totals[k] += stats[bucket][1][k]

    for idx, bucket in enumerate(stats):
        if bucket[1] is None:
            stats[idx][1] = {
                stat: 0,
                stat + "_healthy": 0,
                stat + "_crashed": 0,
                stat + "_abnormal": 0,
                stat + "_errored": 0,
                "duration_p50": None,
                "duration_p90": None,
            }

    # For users we need a secondary query over the entire time range
    if stat == "users":
        rows = raw_query(
            dataset=Dataset.Sessions,
            selected_columns=["users", "users_crashed", "users_abnormal", "users_errored"],
            start=start,
            end=end,
            conditions=conditions,
            filter_keys=filter_keys,
            referrer="sessions.crash-free-breakdown-users",
        )["data"]
        if rows:
            rv = rows[0]
            totals = {
                "users": rv["users"],
                "users_healthy": max(0, rv["users"] - rv["users_errored"]),
                "users_crashed": rv["users_crashed"],
                "users_abnormal": rv["users_abnormal"],
                "users_errored": max(
                    0, rv["users_errored"] - rv["users_crashed"] - rv["users_abnormal"]
                ),
            }

    return stats, totals
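
# Hypothetical call of the function above; the project id, release name, and
# time range are placeholders, and running it requires a live snuba backend:
#
#   stats, totals = get_project_release_stats(
#       project_id=42,
#       release="myapp@1.0.0",
#       stat="sessions",
#       rollup=3600,  # one bucket per hour
#       start=datetime(2021, 6, 1, tzinfo=pytz.utc),
#       end=datetime(2021, 6, 2, tzinfo=pytz.utc),
#   )
#   # stats  -> [[bucket_ts, {"sessions": ..., "sessions_healthy": ..., ...}], ...]
#   # totals -> {"sessions": ..., "sessions_healthy": ..., ...}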
def _get_release_health_data_overview(
    project_releases,
    environments=None,
    summary_stats_period=None,
    health_stats_period=None,
    stat=None,
    now=None,
):
    """Returns an overview of the health data for the given project releases.

    The argument is a sequence of `(project_id, release_name)` tuples.  The
    return value is a dictionary mapping each of those tuples to its summary
    health data.
    """
    if stat is None:
        stat = "sessions"
    assert stat in ("sessions", "users")

    _, summary_start, _ = get_rollup_starts_and_buckets(summary_stats_period or "24h", now=now)
    conditions, filter_keys = _get_conditions_and_filter_keys(project_releases, environments)

    stats_rollup, stats_start, stats_buckets = get_rollup_starts_and_buckets(
        health_stats_period, now=now
    )

    missing_releases = set(project_releases)
    rv = {}
    for x in raw_query(
        dataset=Dataset.Sessions,
        selected_columns=[
            "release",
            "project_id",
            "duration_quantiles",
            "sessions",
            "sessions_errored",
            "sessions_crashed",
            "sessions_abnormal",
            "users",
            "users_crashed",
        ],
        groupby=["release", "project_id"],
        start=summary_start,
        conditions=conditions,
        filter_keys=filter_keys,
        referrer="sessions.release-overview",
    )["data"]:
        rp = {
            "crash_free_users": (
                100 - x["users_crashed"] / float(x["users"]) * 100 if x["users"] else None
            ),
            "crash_free_sessions": (
                100 - x["sessions_crashed"] / float(x["sessions"]) * 100
                if x["sessions"]
                else None
            ),
            "total_users": x["users"],
            "total_sessions": x["sessions"],
            "sessions_crashed": x["sessions_crashed"],
            "sessions_errored": max(
                0, x["sessions_errored"] - x["sessions_crashed"] - x["sessions_abnormal"]
            ),
            "has_health_data": True,
        }
        rp.update(extract_duration_quantiles(x))
        if health_stats_period:
            rp["stats"] = {
                health_stats_period: _make_stats(stats_start, stats_rollup, stats_buckets)
            }
        rv[x["project_id"], x["release"]] = rp
        missing_releases.discard((x["project_id"], x["release"]))

    # Add releases without data points
    if missing_releases:
        # If we're already looking at a 90 day horizon we don't need to
        # fire another query, we can already assume there is no data.
        if summary_stats_period != "90d":
            has_health_data = release_health.check_has_health_data(missing_releases)
        else:
            has_health_data = ()
        for key in missing_releases:
            rv[key] = {
                "duration_p50": None,
                "duration_p90": None,
                "crash_free_users": None,
                "crash_free_sessions": None,
                "total_users": 0,
                "total_sessions": 0,
                "sessions_crashed": 0,
                "sessions_errored": 0,
                "has_health_data": key in has_health_data,
            }
            if health_stats_period:
                rv[key]["stats"] = {
                    health_stats_period: _make_stats(stats_start, stats_rollup, stats_buckets)
                }

    # Fill in release adoption
    release_adoption = release_health.get_release_adoption(project_releases, environments)
    for key in rv:
        adoption_info = release_adoption.get(key) or {}
        rv[key]["adoption"] = adoption_info.get("adoption")
        rv[key]["sessions_adoption"] = adoption_info.get("sessions_adoption")
        rv[key]["total_users_24h"] = adoption_info.get("users_24h")
        rv[key]["total_project_users_24h"] = adoption_info.get("project_users_24h")
        rv[key]["total_sessions_24h"] = adoption_info.get("sessions_24h")
        rv[key]["total_project_sessions_24h"] = adoption_info.get("project_sessions_24h")

    if health_stats_period:
        for x in raw_query(
            dataset=Dataset.Sessions,
            selected_columns=["release", "project_id", "bucketed_started", stat],
            groupby=["release", "project_id", "bucketed_started"],
            rollup=stats_rollup,
            start=stats_start,
            conditions=conditions,
            filter_keys=filter_keys,
            referrer="sessions.release-stats",
        )["data"]:
            time_bucket = int(
                (parse_snuba_datetime(x["bucketed_started"]) - stats_start).total_seconds()
                / stats_rollup
            )
            key = (x["project_id"], x["release"])
            # Sometimes this might return a release we haven't seen yet or it
            # might return a time bucket that did not exist yet at the time of
            # the initial query.  In that case, just skip it.
            if key in rv and time_bucket < len(rv[key]["stats"][health_stats_period]):
                rv[key]["stats"][health_stats_period][time_bucket][1] = x[stat]

    return rv
def get_release_health_data_overview(project_releases, environments=None, stats_period=None):
    """Returns an overview of the health data for the given project releases.

    The argument is a sequence of `(project_id, release_name)` tuples.  The
    return value is a dictionary mapping each of those tuples to its summary
    health data.
    """

    def _nan_as_none(val):
        return None if val != val else val

    yesterday = datetime.now(pytz.utc) - timedelta(days=1)
    conditions, filter_keys = _get_conditions_and_filter_keys(project_releases, environments)

    if stats_period == "24h":
        stats_rollup = 3600
        stats_start = yesterday
        stats_buckets = 24
    elif stats_period == "14d":
        stats_rollup = 86400
        stats_start = datetime.now(pytz.utc) - timedelta(days=14)
        stats_buckets = 14
    elif not stats_period:
        stats_rollup = None
        stats_start = None
    else:
        raise TypeError("Invalid stats period")

    total_users_24h = {}
    for x in raw_query(
        dataset=Dataset.Sessions,
        selected_columns=["release", "users"],
        groupby=["release", "project_id"],
        start=yesterday,
        conditions=conditions,
        filter_keys=filter_keys,
    )["data"]:
        total_users_24h[x["project_id"]] = x["users"]

    rv = {}
    for x in raw_query(
        dataset=Dataset.Sessions,
        selected_columns=[
            "release",
            "project_id",
            "duration_quantiles",
            "users",
            "sessions",
            "sessions_errored",
            "sessions_crashed",
            "users_crashed",
        ],
        groupby=["release", "project_id"],
        start=yesterday,
        conditions=conditions,
        filter_keys=filter_keys,
    )["data"]:
        total_users = total_users_24h.get(x["project_id"])
        rp = {
            "duration_p50": _nan_as_none(x["duration_quantiles"][0]),
            "duration_p90": _nan_as_none(x["duration_quantiles"][1]),
            "crash_free_users": (
                100 - x["users_crashed"] / float(x["users"]) * 100 if x["users"] else None
            ),
            "crash_free_sessions": (
                100 - x["sessions_crashed"] / float(x["sessions"]) * 100
                if x["sessions"]
                else None
            ),
            "total_users": x["users"],
            "total_sessions": x["sessions"],
            "sessions_crashed": x["sessions_crashed"],
            "sessions_errored": x["sessions_errored"],
            "adoption": x["users"] / total_users * 100 if total_users and x["users"] else None,
        }
        if stats_period:
            rp["stats"] = {stats_period: _make_stats(stats_start, stats_rollup, stats_buckets)}
        rv[x["project_id"], x["release"]] = rp

    if stats_period:
        for x in raw_query(
            dataset=Dataset.Sessions,
            selected_columns=["release", "project_id", "bucketed_started", "sessions"],
            groupby=["release", "project_id", "bucketed_started"],
            rollup=stats_rollup,
            start=stats_start,
            conditions=conditions,
            filter_keys=filter_keys,
        )["data"]:
            time_bucket = int(
                (parse_snuba_datetime(x["bucketed_started"]) - stats_start).total_seconds()
                / stats_rollup
            )
            rv[x["project_id"], x["release"]]["stats"][stats_period][time_bucket][1] = x["sessions"]

    return rv
def get_release_health_data_overview(
    project_releases, environments=None, summary_stats_period=None, health_stats_period=None
):
    """Returns an overview of the health data for the given project releases.

    The argument is a sequence of `(project_id, release_name)` tuples.  The
    return value is a dictionary mapping each of those tuples to its summary
    health data.
    """

    def _nan_as_none(val):
        return None if val != val else val

    _, summary_start, _ = get_rollup_starts_and_buckets(summary_stats_period or "24h")
    conditions, filter_keys = _get_conditions_and_filter_keys(project_releases, environments)

    stats_rollup, stats_start, stats_buckets = get_rollup_starts_and_buckets(health_stats_period)

    total_users = {}
    for x in raw_query(
        dataset=Dataset.Sessions,
        selected_columns=["release", "users"],
        groupby=["release", "project_id"],
        start=summary_start,
        conditions=conditions,
        filter_keys=filter_keys,
    )["data"]:
        total_users[x["project_id"]] = x["users"]

    missing_releases = set(project_releases)
    rv = {}
    for x in raw_query(
        dataset=Dataset.Sessions,
        selected_columns=[
            "release",
            "project_id",
            "duration_quantiles",
            "users",
            "sessions",
            "sessions_errored",
            "sessions_crashed",
            "users_crashed",
        ],
        groupby=["release", "project_id"],
        start=summary_start,
        conditions=conditions,
        filter_keys=filter_keys,
    )["data"]:
        x_total_users = total_users.get(x["project_id"])
        rp = {
            "duration_p50": _nan_as_none(x["duration_quantiles"][0]),
            "duration_p90": _nan_as_none(x["duration_quantiles"][1]),
            "crash_free_users": (
                100 - x["users_crashed"] / float(x["users"]) * 100 if x["users"] else None
            ),
            "crash_free_sessions": (
                100 - x["sessions_crashed"] / float(x["sessions"]) * 100
                if x["sessions"]
                else None
            ),
            "total_users": x["users"],
            "total_sessions": x["sessions"],
            "sessions_crashed": x["sessions_crashed"],
            "sessions_errored": x["sessions_errored"],
            "adoption": (
                x["users"] / x_total_users * 100 if x_total_users and x["users"] else None
            ),
            "has_health_data": True,
        }
        if health_stats_period:
            rp["stats"] = {
                health_stats_period: _make_stats(stats_start, stats_rollup, stats_buckets)
            }
        rv[x["project_id"], x["release"]] = rp
        missing_releases.discard((x["project_id"], x["release"]))

    # Add releases without data points
    if missing_releases:
        has_health_data = check_has_health_data(missing_releases)
        for key in missing_releases:
            rv[key] = {
                "duration_p50": None,
                "duration_p90": None,
                "crash_free_users": None,
                "crash_free_sessions": None,
                "total_users": 0,
                "total_sessions": 0,
                "sessions_crashed": 0,
                "sessions_errored": 0,
                "adoption": None,
                "has_health_data": key in has_health_data,
            }
            if health_stats_period:
                rv[key]["stats"] = {
                    health_stats_period: _make_stats(stats_start, stats_rollup, stats_buckets)
                }

    if health_stats_period:
        for x in raw_query(
            dataset=Dataset.Sessions,
            selected_columns=["release", "project_id", "bucketed_started", "sessions"],
            groupby=["release", "project_id", "bucketed_started"],
            rollup=stats_rollup,
            start=stats_start,
            conditions=conditions,
            filter_keys=filter_keys,
        )["data"]:
            time_bucket = int(
                (parse_snuba_datetime(x["bucketed_started"]) - stats_start).total_seconds()
                / stats_rollup
            )
            rv[x["project_id"], x["release"]]["stats"][health_stats_period][time_bucket][1] = x[
                "sessions"
            ]

    return rv
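
# Worked example of the crash-free percentage computed in the overviews above,
# using made-up counts: 3 crashed users out of 200 total.
users, users_crashed = 200, 3
crash_free_users = 100 - users_crashed / float(users) * 100 if users else None
assert crash_free_users == 98.5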