def get(self, request, organization):
    """Find the event id with the closest value to an aggregate for a given query.

    Query parameters read from ``request.GET``:

    * ``baselineFunction`` -- aggregate used as the baseline, defaults to ``p50()``.
    * ``baselineValue`` -- an already computed baseline; when present the
      aggregate query is skipped.
    * ``query`` -- filter query forwarded to both discover queries.

    Responds with 404 when the feature is disabled, when the request resolves
    to no projects, when no baseline value can be determined, or when no
    transaction matches. Otherwise responds with the selected transaction row,
    extended with the baseline value stored under the baseline function's alias.
    """
    if not self.has_feature(organization, request):
        return Response(status=404)

    try:
        params = self.get_snuba_params(request, organization)
    except NoProjects:
        return Response(status=404)

    # Assumption is that users will want the 50th percentile
    baseline_function = request.GET.get("baselineFunction", "p50()")
    # If the baseline was calculated already save ourselves a query
    baseline_value = request.GET.get("baselineValue")
    baseline_alias = get_function_alias(baseline_function)
    with self.handle_query_errors():
        if baseline_value is None:
            result = discover.query(
                selected_columns=[baseline_function],
                params=params,
                query=request.GET.get("query"),
                limit=1,
                referrer="api.transaction-baseline.get_value",
            )
            # A missing *or empty* "data" list both mean there is no baseline;
            # indexing an empty list here used to raise IndexError instead of
            # producing the intended 404.
            rows = result.get("data") or []
            baseline_value = rows[0].get(baseline_alias) if rows else None
            if baseline_value is None:
                return Response(status=404)

        delta_column = f"absolute_delta(transaction.duration,{baseline_value})"
        result = discover.query(
            selected_columns=[
                "project",
                "timestamp",
                "id",
                "transaction.duration",
                delta_column,
            ],
            # Find the most recent transaction that's closest to the baseline value
            # id is the last item of the orderby for consistent results
            orderby=[get_function_alias(delta_column), "-timestamp", "id"],
            params=params,
            query=request.GET.get("query"),
            limit=1,
            referrer="api.transaction-baseline.get_id",
        )
        if len(result["data"]) == 0:
            return Response(status=404)

    baseline_data = result["data"][0]
    baseline_data[baseline_alias] = baseline_value
    return Response(baseline_data)
def normalize_histogram_results(fields, key_column, histogram_params, results, array_column):
    """
    Reshape raw histogram rows into per-field lists of ``{"bin", "count"}`` rows,
    zero-filling every bucket that the query did not return.

    :param [str] fields: The fields the histograms are generated for.
    :param str key_column: The column holding the key name, or None for a
        single-field histogram.
    :param histogram_params: The histogram parameters used; ``num_buckets``,
        ``start_offset``, ``bucket_size`` and ``multiplier`` are read from it.
    :param any results: The histogram query results; rows are read from
        ``results["data"]`` and may be missing bins.
    :param str array_column: Array column prefix.
    :return: A dict mapping each field to its ordered list of bucket rows.
    """
    # The key alias is only relevant for multi histograms, where each row
    # names the array value (e.g. the measurement) it belongs to.
    key_name = get_function_alias(key_column) if key_column is not None else None
    bin_name = get_function_alias(
        get_histogram_column(fields, key_column, histogram_params, array_column)
    )

    # First pass: collect the counts that were actually returned, per field.
    counts_by_field = {field: {} for field in fields}
    for result_row in results["data"]:
        if key_name is None:
            # Single histogram: every row belongs to the first field.
            target = fields[0]
        else:
            # Multi histogram: rebuild the public "<prefix>.<name>" field name.
            prefix = get_array_column_alias(array_column)
            suffix = get_array_column_field(array_column, result_row[key_name])
            target = f"{prefix}.{suffix}"

        # Bins are expected to be integral because floating point values are
        # rounded during the calculation.
        bin_start = int(result_row[bin_name])

        # Rows for keys that were not requested are dropped.
        if target in counts_by_field:
            counts_by_field[target][bin_start] = result_row["count"]

    # Second pass: emit every expected bucket, defaulting absent ones to zero.
    normalized = {field: [] for field in fields}
    for position in range(histogram_params.num_buckets):
        bin_start = histogram_params.start_offset + histogram_params.bucket_size * position
        for field in fields:
            entry = {
                "bin": bin_start,
                "count": counts_by_field[field].get(bin_start, 0),
            }
            # Undo the precision scaling that was applied to keep bins integral.
            if histogram_params.multiplier > 1:
                entry["bin"] = entry["bin"] / float(histogram_params.multiplier)
            normalized[field].append(entry)

    return normalized
def __init__(self, organization_id, discover_query):
    """Prepare the query parameters, header fields and data function.

    :param organization_id: Id of the organization the query runs against.
    :param discover_query: Mapping describing the query. ``"field"`` and
        ``"query"`` are required keys; ``"equations"`` and ``"sort"`` are
        optional. It is also handed to the project, environment and date
        range helpers.
    """
    self.projects = self.get_projects(organization_id, discover_query)
    self.environments = self.get_environments(organization_id, discover_query)
    self.start, self.end = get_date_range_from_params(discover_query)
    self.params = {
        "organization_id": organization_id,
        "project_id": [project.id for project in self.projects],
        "start": self.start,
        "end": self.end,
    }
    # make sure to only include environment if any are given
    # an empty list DOES NOT work
    if self.environments:
        self.params["environment"] = self.environments

    equations = discover_query.get("equations", [])
    # Build a real list: on Python 3 `map()` returns an iterator, and
    # `map(...) + equations` raises TypeError.
    self.header_fields = [
        get_function_alias(field) for field in discover_query["field"]
    ] + equations
    # Maps the positional result alias ("equation[0]", ...) back to the
    # equation text it stands for.
    self.equation_aliases = {
        f"equation[{index}]": equation for index, equation in enumerate(equations)
    }
    self.data_fn = self.get_data_fn(
        fields=discover_query["field"],
        equations=equations,
        query=discover_query["query"],
        params=self.params,
        sort=discover_query.get("sort"),
    )
def visit_function_value(self, node, children):
    """Validate a function used in an equation and return its alias.

    The function name is taken from the text of the first child, the full
    function expression from the text of ``node``. The expression is recorded
    in ``self.functions``.

    :raises ArithmeticValidationError: if the function name is not in
        ``self.function_allowlist``.
    """
    name_node, *_rest = children
    name = name_node.text
    expression = node.text

    allowed = name in self.function_allowlist
    if not allowed:
        raise ArithmeticValidationError(f"{name} not allowed in arithmetic")

    self.functions.add(expression)
    # Arithmetic refers to functions through their alias.
    alias = get_function_alias(expression)
    return alias
def test_function_values(a, op, b):
    """Check that ``a <op> b`` parses correctly, both bare and wrapped as ``(...) + 5``.

    Integer operands are expected verbatim in the parsed operation; any other
    operand is expected as its function alias. String operands must be
    reported in the parsed function set, and no plain fields may be reported.
    """
    bare_equation = f"{a}{op}{b}"
    expected_lhs = a if isinstance(a, int) else get_function_alias(a)
    expected_rhs = b if isinstance(b, int) else get_function_alias(b)

    for with_brackets in (False, True):
        equation = f"({bare_equation}) + 5" if with_brackets else bare_equation
        parsed, fields, functions = parse_arithmetic(equation)

        if with_brackets:
            # The outer operation is the "+ 5"; the equation under test is its lhs.
            assert parsed.operator == "plus"
            assert isinstance(parsed.lhs, Operation)
            assert parsed.rhs == 5.0
            parsed = parsed.lhs

        assert parsed.operator == op_map[op.strip()], equation
        assert parsed.lhs == expected_lhs, equation
        assert parsed.rhs == expected_rhs, equation
        assert len(fields) == 0
        for operand in (a, b):
            if isinstance(operand, str):
                assert operand in functions, equation
def visit_function_value(self, node, children):
    """Validate a function used in an equation and return how to reference it.

    The function name is taken from the text of the first child, the full
    function expression from the text of ``node``. The expression is recorded
    in ``self.functions``. When ``self.use_snql`` is truthy the expression is
    returned unchanged; otherwise its alias is returned.

    :raises ArithmeticValidationError: if the function name is not in
        ``self.function_allowlist``.
    """
    name_node, *_rest = children
    name = name_node.text
    expression = node.text

    allowed = name in self.function_allowlist
    if not allowed:
        raise ArithmeticValidationError(f"{name} not allowed in arithmetic")

    self.functions.add(expression)
    if self.use_snql:
        return expression

    # Without snql, arithmetic refers to functions through their alias.
    # TODO(snql): once fully on snql no longer need the alias
    from sentry.search.events.fields import get_function_alias

    return get_function_alias(expression)
def serialize_multiple_axis(self, serializer, event_result, columns, query_columns, allow_partial_buckets):
    """Serialize one result once per requested axis.

    Each entry of ``query_columns`` is serialized under its function alias and
    stored under the requested axis name found at the same position in
    ``columns``; the position is also passed to the serializer as ``order``.
    If ``event_result.data`` carries an ``"order"`` entry it is copied onto
    the returned dict.
    """
    # Keyed by the requested yAxis name.
    serialized = {}
    for position, query_column in enumerate(query_columns):
        axis_name = columns[position]
        serialized[axis_name] = serializer.serialize(
            event_result,
            get_function_alias(query_column),
            order=position,
            allow_partial_buckets=allow_partial_buckets,
        )

    # Carry the ordering over for multi-axis + top events.
    if "order" in event_result.data:
        serialized["order"] = event_result.data["order"]
    return serialized
def validate(self, data):
    """Validate a widget query by building the query it would run.

    On creation (no ``id`` in ``data``) every key in ``self.required_for_create``
    must be present. The conditions are then turned into a filter and the
    fields (with any equations) are resolved against it.

    :returns: ``data`` unchanged when validation passes.
    :raises serializers.ValidationError: when required keys are missing, the
        conditions are invalid, or the fields are invalid.
    """
    is_create = not data.get("id")
    if is_create:
        missing = self.required_for_create - set(data.keys())
        if missing:
            raise serializers.ValidationError(
                {
                    "fields": "fields are required during creation.",
                    "conditions": "conditions are required during creation.",
                }
            )

    # Validate the query that would be created when run.
    conditions = self._get_attr(data, "conditions", "")
    fields = self._get_attr(data, "fields", []).copy()
    orderby = self._get_attr(data, "orderby", "")
    equations, fields = categorize_columns(fields)

    resolved_equations = []
    if equations is not None:
        resolved_equations, _ = resolve_equation_list(equations, fields)

    try:
        # eps/epm need either an interval argument or a start/end from which
        # the interval can be computed. The actual values do not matter for
        # validation, so a fixed one day window is used.
        params = {
            "start": datetime.now() - timedelta(days=1),
            "end": datetime.now(),
            "project_id": [p.id for p in self.context.get("projects")],
            "organization_id": self.context.get("organization").id,
        }
        snuba_filter = get_filter(conditions, params=params)
    except InvalidSearchQuery as err:
        raise serializers.ValidationError({"conditions": f"Invalid conditions: {err}"})

    if orderby:
        snuba_filter.orderby = get_function_alias(orderby)

    try:
        resolve_field_list(fields, snuba_filter, resolved_equations=resolved_equations)
    except InvalidSearchQuery as err:
        raise serializers.ValidationError({"fields": f"Invalid fields: {err}"})

    return data
def resolve_axis_column(column: str, index: int = 0) -> str:
    """Return the result key for an axis column.

    Equations are referenced positionally as ``equation[<index>]``; any other
    column is referenced by its function alias.
    """
    if is_equation(column):
        resolved = f"equation[{index}]"
    else:
        resolved = get_function_alias(column)
    return cast(str, resolved)
def get(self, request: Request, organization) -> Response:
    """Return per-threshold counts, the total and the p75 for each requested vital.

    The vitals are read from the repeated ``vital`` query parameter and
    lower-cased. For every vital one ``count_at_least`` column per configured
    threshold plus a ``p75`` column is queried in a single discover query.

    Responds with 404 when the feature is disabled and with an empty list when
    the request resolves to no projects.

    :raises ParseError: when no vital is given or a vital is not in ``self.VITALS``.
    """
    if not self.has_feature(organization, request):
        return Response(status=404)

    with sentry_sdk.start_span(op="discover.endpoint", description="parse params"):
        try:
            params = self.get_snuba_params(request, organization)
        except NoProjects:
            return Response([])

        vitals = [name.lower() for name in request.GET.getlist("vital", [])]
        if not vitals:
            raise ParseError(detail="Need to pass at least one vital")

        selected_columns = []
        aliases = {}
        for vital in vitals:
            if vital not in self.VITALS:
                raise ParseError(detail=f"{vital} is not a valid vital")

            threshold_columns = [
                f"count_at_least({vital}, {threshold})"
                for threshold in self.VITALS[vital]["thresholds"]
            ]
            # Aliases are kept in threshold order for the calculation below.
            aliases[vital] = [get_function_alias(column) for column in threshold_columns]
            selected_columns.extend(threshold_columns)
            selected_columns.append(f"p75({vital})")

    with self.handle_query_errors():
        events_results = discover.query(
            selected_columns=selected_columns,
            query=request.GET.get("query"),
            params=params,
            # Results should only ever have 1 result
            limit=1,
            referrer="api.events.vitals",
            auto_fields=True,
            auto_aggregations=False,
            use_aggregate_conditions=False,
            use_snql=features.has(
                "organizations:performance-use-snql", organization, actor=request.user
            ),
        )

    results = {}
    rows = events_results["data"]
    if len(rows) == 1:
        event_data = rows[0]
        for vital in vitals:
            vital_aliases = aliases[vital]
            summary = {}
            results[vital] = summary
            total = 0

            # Walk the thresholds from last to first: each count is
            # "at least threshold", so subtracting the running total leaves
            # only the events that fall into that group.
            for position in reversed(range(len(vital_aliases))):
                count = event_data[vital_aliases[position]]
                group_count = count - total if count is not None else 0
                summary[self.LABELS[position]] = group_count
                total += group_count

            summary["total"] = total
            summary["p75"] = event_data.get(get_function_alias(f"p75({vital})"))

    return Response(results)
def histogram_query(
    fields,
    user_query,
    params,
    num_buckets,
    precision=0,
    min_value=None,
    max_value=None,
    data_filter=None,
    referrer=None,
    group_by=None,
    extra_conditions=None,
    normalize_results=True,
):
    """
    Generate histograms for numeric columns.

    A multihistogram (more than one field) is only possible when all fields
    are array columns, i.e. columns whose values are nested arrays such as
    measurements and span op breakdowns. The bins of the resulting histograms
    are aligned.

    :param [str] fields: The fields to generate histograms for.
    :param str user_query: Filter query string to create conditions from.
    :param {str: str} params: Filtering parameters with start, end, project_id, environment
    :param int num_buckets: The number of buckets the histogram should contain.
    :param int precision: The number of decimal places to preserve, default 0.
    :param float min_value: The minimum value allowed to be in the histogram.
        If left unspecified, it is queried using `user_query` and `params`.
    :param float max_value: The maximum value allowed to be in the histogram.
        If left unspecified, it is queried using `user_query` and `params`.
    :param str data_filter: Indicate the filter strategy to be applied to the data.
    :param str referrer: Referrer passed on to the raw query.
    :param [str] group_by: Experimental. Additional grouping to serve multifacet histograms.
    :param [str] extra_conditions: Additional conditions for the histogram
        query that aren't received from params.
    :param bool normalize_results: Whether to normalize the results by column into bins.
    """
    multiplier = int(10**precision)
    if max_value is not None:
        # A caller supplied max_value is meant to be exclusive while a queried
        # one is inclusive, so nudge the supplied value down by a fraction of
        # the smallest representable step.
        max_value = max_value - 0.1 / multiplier

    min_value, max_value = find_histogram_min_max(
        fields, min_value, max_value, user_query, params, data_filter
    )

    key_column = None
    array_column = None
    conditions = []
    if len(fields) > 1:
        array_column = check_multihistogram_fields(fields)
        if array_column == "measurements":
            key_column, name_resolver = "array_join(measurements_key)", get_measurement_name
        elif array_column == "span_op_breakdowns":
            key_column, name_resolver = (
                "array_join(span_op_breakdowns_key)",
                get_span_op_breakdown_name,
            )
        else:
            raise InvalidSearchQuery(
                "multihistogram expected either all measurements or all breakdowns"
            )

        # Restrict the joined array keys to the requested fields.
        requested_names = [name_resolver(field) for field in fields]
        conditions.append([get_function_alias(key_column), "IN", requested_names])

    if extra_conditions:
        conditions.append(extra_conditions)

    histogram_params = find_histogram_params(num_buckets, min_value, max_value, multiplier)
    histogram_column = get_histogram_column(fields, key_column, histogram_params, array_column)
    histogram_alias = get_function_alias(histogram_column)

    # Without both bounds there is nothing to query: return an empty histogram.
    if min_value is None or max_value is None:
        return normalize_histogram_results(
            fields, key_column, histogram_params, {"data": []}, array_column
        )

    # Bound the bins to get the desired range of results. Both bounds are
    # known to be set at this point.
    lowest_bin = histogram_params.start_offset
    highest_bin = histogram_params.start_offset + histogram_params.bucket_size * num_buckets
    conditions.append([histogram_alias, ">=", lowest_bin])
    conditions.append([histogram_alias, "<=", highest_bin])

    key_columns = [] if key_column is None else [key_column]
    prepared_query = prepare_discover_query(
        selected_columns=key_columns + [histogram_column, "count()"],
        conditions=conditions,
        query=user_query,
        params=params,
        orderby=[histogram_alias],
        functions_acl=["array_join", "histogram"],
    )

    snuba_filter = prepared_query.filter
    if group_by:
        snuba_filter.groupby += group_by

    raw_result = raw_query(
        start=snuba_filter.start,
        end=snuba_filter.end,
        groupby=snuba_filter.groupby,
        conditions=snuba_filter.conditions,
        aggregations=snuba_filter.aggregations,
        selected_columns=snuba_filter.selected_columns,
        filter_keys=snuba_filter.filter_keys,
        having=snuba_filter.having,
        orderby=snuba_filter.orderby,
        dataset=Dataset.Discover,
        # At most one row per field and bucket.
        limit=len(fields) * num_buckets,
        referrer=referrer,
    )

    results = transform_results(
        raw_result,
        prepared_query.fields["functions"],
        prepared_query.columns,
        snuba_filter,
    )

    if normalize_results:
        return normalize_histogram_results(
            fields, key_column, histogram_params, results, array_column
        )
    return results
def prepare_discover_query(
    selected_columns,
    query,
    params,
    orderby=None,
    auto_fields=False,
    auto_aggregations=False,
    use_aggregate_conditions=False,
    conditions=None,
    functions_acl=None,
):
    """Build the filter, column translations and resolved fields for a discover query.

    :param selected_columns: Columns to resolve via ``resolve_field_list``.
    :param query: Filter query string handed to ``get_filter``.
    :param params: Filtering parameters handed to ``get_filter``.
    :param orderby: A single orderby or a list/tuple of them; each entry is
        converted with ``get_function_alias``.
    :param auto_fields: Forwarded to ``resolve_field_list``.
    :param auto_aggregations: Forwarded to ``resolve_field_list``; must be
        falsy unless ``use_aggregate_conditions`` is set.
    :param use_aggregate_conditions: When falsy, the having clauses produced
        from ``query`` are discarded.
    :param conditions: Extra conditions appended to the filter's conditions.
    :param functions_acl: Forwarded to ``resolve_field_list``.
    :returns: ``PreparedQuery(snuba_filter, translated_columns, resolved_fields)``.
    :raises InvalidSearchQuery: when a having clause references an aggregate
        that is not among the filter's aggregations.
    """
    with sentry_sdk.start_span(op="discover.discover", description="query.filter_transform") as span:
        span.set_data("query", query)

        snuba_filter = get_filter(query, params)
        if not use_aggregate_conditions:
            assert (
                not auto_aggregations
            ), "Auto aggregations cannot be used without enabling aggregate conditions"
            # Aggregate conditions are disabled: drop whatever the query produced.
            snuba_filter.having = []

    with sentry_sdk.start_span(op="discover.discover", description="query.field_translations"):
        if orderby is not None:
            # Normalize a single orderby into a list before aliasing each entry.
            orderby = list(orderby) if isinstance(orderby, (list, tuple)) else [orderby]
            snuba_filter.orderby = [get_function_alias(o) for o in orderby]

        resolved_fields = resolve_field_list(
            selected_columns,
            snuba_filter,
            auto_fields=auto_fields,
            auto_aggregations=auto_aggregations,
            functions_acl=functions_acl,
        )

        snuba_filter.update_with(resolved_fields)

        # Resolve the public aliases into the discover dataset names.
        snuba_filter, translated_columns = resolve_discover_aliases(snuba_filter)

        # Make sure that any aggregate conditions are also in the selected columns
        for having_clause in snuba_filter.having:
            # The first element of the having can be an alias, or a nested array of functions. Loop through to make sure
            # any referenced functions are in the aggregations.
            error_extra = ", and could not be automatically added" if auto_aggregations else ""
            if isinstance(having_clause[0], (list, tuple)):
                # Functions are of the form [fn, [args]]
                # Work list of argument lists still to inspect; nested
                # AND/OR functions push their own arguments onto it.
                args_to_check = [[having_clause[0]]]
                conditions_not_in_aggregations = []
                while len(args_to_check) > 0:
                    args = args_to_check.pop()
                    for arg in args:
                        if arg[0] in [SNUBA_AND, SNUBA_OR]:
                            args_to_check.extend(arg[1])
                        # Only need to look into arg[1] if it is a list
                        elif isinstance(arg[1], (list, tuple)):
                            alias = arg[1][0]
                            # The alias is the last element of an aggregation clause.
                            found = any(
                                alias == agg_clause[-1] for agg_clause in snuba_filter.aggregations
                            )
                            if not found:
                                conditions_not_in_aggregations.append(alias)

                if len(conditions_not_in_aggregations) > 0:
                    raise InvalidSearchQuery(
                        "Aggregate(s) {} used in a condition but are not in the selected columns{}.".format(
                            ", ".join(conditions_not_in_aggregations),
                            error_extra,
                        )
                    )
            else:
                # Plain alias: it must match the alias of one of the aggregations.
                found = any(
                    having_clause[0] == agg_clause[-1] for agg_clause in snuba_filter.aggregations
                )
                if not found:
                    raise InvalidSearchQuery(
                        "Aggregate {} used in a condition but is not a selected column{}.".format(
                            having_clause[0],
                            error_extra,
                        )
                    )

        if conditions is not None:
            snuba_filter.conditions.extend(conditions)

    return PreparedQuery(snuba_filter, translated_columns, resolved_fields)
def find_histogram_min_max(fields, min_value, max_value, user_query, params, data_filter=None):
    """
    Determine the min/max bounds for the histograms of the given fields.

    A bound that is already specified is used as is and not queried for. When
    both are specified no query is issued at all.

    :param [str] fields: The fields the histograms are generated for.
    :param float min_value: The minimum value allowed to be in the histogram.
        If left unspecified, it is queried using `user_query` and `params`.
    :param float max_value: The maximum value allowed to be in the histogram.
        If left unspecified, it is queried using `user_query` and `params`.
    :param str user_query: Filter query string to create conditions from.
    :param {str: str} params: Filtering parameters with start, end, project_id, environment
    :param str data_filter: Indicate the filter strategy to be applied to the data.
    :return: A ``(min_value, max_value)`` tuple; ``(None, None)`` when the
        query does not return exactly one row.
    """
    if min_value is not None and max_value is not None:
        return min_value, max_value

    exclude_outliers = data_filter == "exclude_outliers"

    # Only query for the bounds that are still unknown.
    min_columns = [f"min({field})" for field in fields] if min_value is None else []
    max_columns = [f"max({field})" for field in fields] if max_value is None else []
    quartiles = []
    if exclude_outliers:
        for field in fields:
            quartiles.extend([f"percentile({field}, 0.25)", f"percentile({field}, 0.75)"])

    results = query(
        selected_columns=min_columns + max_columns + quartiles,
        query=user_query,
        params=params,
        limit=1,
        referrer="api.organization-events-histogram-min-max",
    )

    data = results.get("data")

    # There should be exactly 1 row in the results. If something went wrong,
    # force the min/max to None to coerce an empty histogram.
    if data is None or len(data) != 1:
        return None, None

    row = data[0]

    if min_value is None:
        queried_mins = [row[get_function_alias(column)] for column in min_columns]
        queried_mins = [value for value in queried_mins if value is not None]
        min_value = min(queried_mins) if queried_mins else None
        if max_value is not None and min_value is not None:
            # max_value was provided by the user and min_value was queried.
            # A user provided max_value is an upper bound, so all data above
            # it is ignored: clamp min_value so it never exceeds max_value.
            min_value = min([max_value, min_value])

    if max_value is None:
        queried_maxes = [row[get_function_alias(column)] for column in max_columns]
        queried_maxes = [value for value in queried_maxes if value is not None]
        max_value = max(queried_maxes) if queried_maxes else None

        fences = []
        if exclude_outliers:
            for field in fields:
                first_quartile = row[get_function_alias(f"percentile({field}, 0.25)")]
                third_quartile = row[get_function_alias(f"percentile({field}, 0.75)")]

                quartiles_usable = not (
                    first_quartile is None
                    or third_quartile is None
                    or math.isnan(first_quartile)
                    or math.isnan(third_quartile)
                )
                if quartiles_usable:
                    interquartile_range = abs(third_quartile - first_quartile)
                    # Upper outer fence: Q3 + 3 * IQR.
                    fences.append(third_quartile + 3 * interquartile_range)

        max_fence_value = max(fences) if fences else None

        # The effective maximum is the smaller of the fence and the queried max.
        candidates = [
            value for value in (max_fence_value, max_value) if value is not None
        ]
        max_value = min(candidates) if candidates else None
        if max_value is not None and min_value is not None:
            # min_value may be either queried or provided by the user, max_value
            # was queried. min_value is a lower bound, so all data below it is
            # ignored: raise max_value so it never falls below min_value.
            max_value = max([max_value, min_value])

    return min_value, max_value
def validate(self, data):
    """Validate a widget query, recording query errors instead of raising them.

    On creation (no ``id`` in ``data``) every key in ``self.required_for_create``
    must be present. Equations are resolved first and raise on failure.
    Condition and field errors are stored on ``data`` under
    ``"issue_query_error"`` / ``"discover_query_error"``.

    :returns: ``data``, possibly extended with the error keys above.
    :raises serializers.ValidationError: when required keys are missing or the
        equations cannot be resolved.
    """
    is_create = not data.get("id")
    if is_create:
        missing = self.required_for_create - set(data.keys())
        if missing:
            raise serializers.ValidationError(
                {
                    "fields": "fields are required during creation.",
                    "conditions": "conditions are required during creation.",
                }
            )

    # Validate the query that would be created when run.
    conditions = self._get_attr(data, "conditions", "")
    fields = self._get_attr(data, "fields", []).copy()
    orderby = self._get_attr(data, "orderby", "")
    equations, fields = categorize_columns(fields)
    is_table = is_table_display_type(self.context.get("displayType"))

    resolved_equations = []
    if equations is not None:
        not_table = not is_table
        try:
            resolved_equations, _, _ = resolve_equation_list(
                equations,
                fields,
                auto_add=not_table,
                aggregates_only=not_table,
            )
        except (InvalidSearchQuery, ArithmeticError) as err:
            raise serializers.ValidationError({"fields": f"Invalid fields: {err}"})

    try:
        parse_search_query(conditions)
    except InvalidSearchQuery as err:
        # It is not known here whether the widget owning this query is an
        # Issue widget or a Discover widget. Hand the error to the Widget
        # serializer, which decides based on the widget type whether to raise.
        data["issue_query_error"] = {"conditions": [f"Invalid conditions: {err}"]}

    try:
        # eps/epm need either an interval argument or a start/end from which
        # the interval can be computed. The actual values do not matter for
        # validation, so a fixed one day window is used.
        params = {
            "start": datetime.now() - timedelta(days=1),
            "end": datetime.now(),
            "project_id": [p.id for p in self.context.get("projects")],
            "organization_id": self.context.get("organization").id,
        }
        snuba_filter = get_filter(conditions, params=params)
    except InvalidSearchQuery as err:
        data["discover_query_error"] = {"conditions": [f"Invalid conditions: {err}"]}
        return data

    if orderby:
        snuba_filter.orderby = get_function_alias(orderby)

    try:
        resolve_field_list(fields, snuba_filter, resolved_equations=resolved_equations)
    except InvalidSearchQuery as err:
        # Same as for the conditions: the Widget serializer decides, based on
        # the widget type, whether this error is raised.
        data["discover_query_error"] = {"fields": f"Invalid fields: {err}"}

    return data
def get_event_stats_data(
    self,
    request,
    organization,
    get_event_stats,
    top_events=0,
    query_column="count()",
    params=None,
    query=None,
    allow_partial_buckets=False,
):
    """Run an events-stats query and serialize its result.

    :param request: The request; ``yAxis`` (repeated) and ``query`` are read
        from ``request.GET`` and ``request.user`` is given to the serializer.
    :param organization: The organization the query runs for.
    :param get_event_stats: Callable invoked as
        ``get_event_stats(query_columns, query, params, rollup)`` to fetch the data.
    :param top_events: When greater than 0 and the result is a dict, each
        entry of the result is serialized separately.
    :param query_column: Axis used when the request has no ``yAxis``.
    :param params: Query params; derived from the request when None.
    :param query: Filter query; read from the request when None.
    :param allow_partial_buckets: Forwarded to the serializer.
    :returns: The serialized result, or ``{"data": []}`` when the request
        resolves to no projects.
    """
    with self.handle_query_errors():
        with sentry_sdk.start_span(
            op="discover.endpoint", description="base.stats_query_creation"
        ):
            columns = request.GET.getlist("yAxis", [query_column])
            if query is None:
                query = request.GET.get("query")
            if params is None:
                try:
                    # events-stats is still used by events v1 which doesn't require global views
                    params = self.get_snuba_params(
                        request, organization, check_global_views=False
                    )
                except NoProjects:
                    return {"data": []}

            rollup = get_rollup_from_request(
                request,
                params,
                default_interval=None,
                error=InvalidSearchQuery(
                    "Your interval and date range would create too many results. "
                    "Use a larger interval, or a smaller date range."
                ),
                top_events=top_events,
            )
            # Backwards compatibility for incidents which uses the old
            # column aliases as it straddles both versions of events/discover.
            # We will need these aliases until discover2 flags are enabled for all
            # users.
            # We need these rollup columns to generate correct events-stats results
            column_map = {
                "user_count": "count_unique(user)",
                "event_count": "count()",
                "epm()": "epm(%d)" % rollup,
                "eps()": "eps(%d)" % rollup,
                "tpm()": "tpm(%d)" % rollup,
                "tps()": "tps(%d)" % rollup,
            }

            # Requested axes translated to the columns actually queried;
            # axes without a mapping are used as given.
            query_columns = [column_map.get(column, column) for column in columns]
        with sentry_sdk.start_span(op="discover.endpoint", description="base.stats_query"):
            result = get_event_stats(query_columns, query, params, rollup)

    serializer = SnubaTSResultSerializer(organization, None, request.user)

    with sentry_sdk.start_span(op="discover.endpoint", description="base.stats_serialization"):
        # When the request is for top_events, result can be a SnubaTSResult in the event that
        # there were no top events found. In this case, result contains a zerofilled series
        # that acts as a placeholder.
        if top_events > 0 and isinstance(result, dict):
            # One serialized series (or set of series) per top event key.
            results = {}
            for key, event_result in result.items():
                if len(query_columns) > 1:
                    results[key] = self.serialize_multiple_axis(
                        serializer, event_result, columns, query_columns, allow_partial_buckets
                    )
                else:
                    # Need to get function alias if count is a field, but not the axis
                    results[key] = serializer.serialize(
                        event_result,
                        column=get_function_alias(query_columns[0]),
                        allow_partial_buckets=allow_partial_buckets,
                    )
            return results
        elif len(query_columns) > 1:
            return self.serialize_multiple_axis(
                serializer, result, columns, query_columns, allow_partial_buckets
            )
        else:
            return serializer.serialize(result, allow_partial_buckets=allow_partial_buckets)