Example #1
def timeseries_query(selected_columns, query, params, rollup, reference_event=None, referrer=None):
    """
    High-level API for doing arbitrary user timeseries queries against events.

    This function operates on the public event schema. Virtual
    fields and aggregate functions are supported for both selected
    columns and conditions.

    This function is intended only for timeseries-based
    results and thus requires the `rollup` parameter.

    Returns a SnubaTSResult object that has been zerofilled in
    case of gaps.

    selected_columns (Sequence[str]) List of public aliases to fetch.
    query (str) Filter query string to create conditions from.
    params (Dict[str, str]) Filtering parameters with start, end, project_id, environment
    rollup (int) The bucket width in seconds
    reference_event (ReferenceEvent) A reference event object. Used to generate additional
                    conditions based on the provided reference.
    referrer (str|None) A referrer string to help locate the origin of this query.
    """
    snuba_filter = get_filter(query, params)
    snuba_args = {
        "start": snuba_filter.start,
        "end": snuba_filter.end,
        "conditions": snuba_filter.conditions,
        "filter_keys": snuba_filter.filter_keys,
    }
    # Both start and end are required to build a bounded, zerofilled timeseries.
    if not snuba_args["start"] or not snuba_args["end"]:
        raise InvalidSearchQuery("Cannot get timeseries result without a start and end.")

    snuba_args.update(resolve_field_list(selected_columns, snuba_args, auto_fields=False))
    if reference_event:
        ref_conditions = create_reference_event_conditions(reference_event)
        if ref_conditions:
            snuba_args["conditions"].extend(ref_conditions)

    # Resolve the public aliases into the discover dataset names.
    snuba_args, _ = resolve_discover_aliases(snuba_args)
    if not snuba_args["aggregations"]:
        raise InvalidSearchQuery("Cannot get timeseries result with no aggregation.")

    # Rename the alias of the first aggregation to "count" so the result stays
    # compatible with what the rest of the timeseries endpoint expects.
    if len(snuba_args["aggregations"]) == 1:
        snuba_args["aggregations"][0][2] = "count"

    result = raw_query(
        aggregations=snuba_args.get("aggregations"),
        conditions=snuba_args.get("conditions"),
        filter_keys=snuba_args.get("filter_keys"),
        start=snuba_args.get("start"),
        end=snuba_args.get("end"),
        rollup=rollup,
        orderby="time",
        groupby=["time"],
        dataset=Dataset.Discover,
        limit=10000,
        referrer=referrer,
    )
    result = zerofill(result["data"], snuba_args["start"], snuba_args["end"], rollup, "time")

    return SnubaTSResult({"data": result}, snuba_filter.start, snuba_filter.end, rollup)
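
A minimal usage sketch (hypothetical call site; the project id, dates, and referrer are placeholders, and a configured Sentry/Snuba environment is assumed):

from datetime import datetime, timedelta

# Hourly error counts for one project over the last 24 hours.
end = datetime.utcnow()
params = {
    "start": end - timedelta(hours=24),
    "end": end,
    "project_id": [1],  # placeholder project id
}
result = timeseries_query(
    selected_columns=["count()"],
    query="event.type:error",
    params=params,
    rollup=3600,  # one-hour buckets, in seconds
    referrer="api.example.timeseries",  # placeholder referrer
)
# Per the constructor above, result.data["data"] is the zerofilled series.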
Example #2
def find_histogram_buckets(field, params, conditions):
    match = is_function(field)
    if not match:
        raise InvalidSearchQuery(
            u"received {}, expected histogram function".format(field))

    columns = [
        c.strip() for c in match.group("columns").split(",")
        if len(c.strip()) > 0
    ]

    if len(columns) != 2:
        raise InvalidSearchQuery(
            u"histogram(...) expects 2 column arguments, received {:g} arguments"
            .format(len(columns)))

    column = columns[0]
    # TODO evanh: This can be expanded to more fields at a later date, for now keep this limited.
    if column != "transaction.duration":
        raise InvalidSearchQuery(
            "histogram(...) can only be used with the transaction.duration column"
        )

    try:
        num_buckets = int(columns[1])
        if num_buckets < 1 or num_buckets > 500:
            raise ValueError("bucket count out of range")
    except ValueError:
        raise InvalidSearchQuery(
            u"histogram(...) requires a bucket value between 1 and 500, not {}"
            .format(columns[1]))

    max_alias = u"max_{}".format(column)
    min_alias = u"min_{}".format(column)

    conditions = deepcopy(conditions) if conditions else []
    found = False
    for cond in conditions:
        if len(cond) == 3 and (cond[0], cond[1],
                               cond[2]) == ("event.type", "=", "transaction"):
            found = True
            break
    if not found:
        conditions.append(["event.type", "=", "transaction"])
    snuba_filter = eventstore.Filter(conditions=conditions)
    translated_args, _ = resolve_discover_aliases(snuba_filter)

    results = raw_query(
        filter_keys={"project_id": params.get("project_id")},
        start=params.get("start"),
        end=params.get("end"),
        dataset=Dataset.Discover,
        conditions=translated_args.conditions,
        aggregations=[["max", "duration", max_alias],
                      ["min", "duration", min_alias]],
    )
    if len(results["data"]) != 1:
        # If there are no transactions there is no max duration, so return a single empty bucket
        return "histogram({}, 1, 1, 0)".format(column)

    bucket_min = results["data"][0][min_alias]
    bucket_max = results["data"][0][max_alias]
    if bucket_max == 0:
        raise InvalidSearchQuery(
            u"Cannot calculate histogram for {}".format(field))
    bucket_size = ceil((bucket_max - bucket_min) / float(num_buckets))
    if bucket_size == 0.0:
        bucket_size = 1.0

    # Determine the first bucket that will show up in our results so that we can
    # zerofill correctly.
    offset = int(floor(bucket_min / bucket_size) * bucket_size)

    return "histogram({}, {:g}, {:.0f}, {:.0f})".format(
        column, num_buckets, bucket_size, offset)
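
The bucket math at the end is easy to check in isolation. A self-contained sketch of the same arithmetic with sample values (plain Python, not Sentry code):

from math import ceil, floor

# Sample values: durations span 120..4950 ms, 10 buckets requested.
bucket_min, bucket_max, num_buckets = 120, 4950, 10

bucket_size = ceil((bucket_max - bucket_min) / float(num_buckets))
if bucket_size == 0.0:
    bucket_size = 1.0  # guard against a zero-width range

# First bucket boundary at or below bucket_min, so zerofilling starts cleanly.
offset = int(floor(bucket_min / bucket_size) * bucket_size)

print(bucket_size, offset)  # 483 0 -> "histogram(transaction.duration, 10, 483, 0)"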
Example #3
def query(
    selected_columns,
    query,
    params,
    orderby=None,
    offset=None,
    limit=50,
    referrer=None,
    auto_fields=False,
    auto_aggregations=False,
    use_aggregate_conditions=False,
    conditions=None,
    functions_acl=None,
):
    """
    High-level API for doing arbitrary user queries against events.

    This function operates on the Discover public event schema.
    Virtual fields and aggregate functions are supported for both
    selected columns and conditions.

    The resulting list will have all internal field names mapped
    back into their public schema names.

    selected_columns (Sequence[str]) List of public aliases to fetch.
    query (str) Filter query string to create conditions from.
    params (Dict[str, str]) Filtering parameters with start, end, project_id, environment
    orderby (None|str|Sequence[str]) The field to order results by.
    offset (None|int) The record offset to read.
    limit (int) The number of records to fetch.
    referrer (str|None) A referrer string to help locate the origin of this query.
    auto_fields (bool) Set to true to have project + eventid fields automatically added.
    auto_aggregations (bool) Whether aggregates should be added automatically if they're used
                    in conditions, and there's at least one aggregate already.
    use_aggregate_conditions (bool) Set to true if aggregates conditions should be used at all.
    conditions (Sequence[any]) List of conditions that are passed directly to snuba without
                    any additional processing.
    """
    if not selected_columns:
        raise InvalidSearchQuery("No columns selected")
    else:
        # We mutate this list throughout this function, so work on a copy.
        selected_columns = selected_columns[:]

    with sentry_sdk.start_span(op="discover.discover",
                               description="query.filter_transform") as span:
        span.set_data("query", query)

        snuba_filter = get_filter(query, params)
        if not use_aggregate_conditions:
            assert (
                not auto_aggregations
            ), "Auto aggregations cannot be used without enabling aggregate conditions"
            snuba_filter.having = []

    # We need to run a separate query to properly bucket the values for the histogram.
    # Do that here, and format the bucket number into the columns before passing them
    # through to event search.
    function_translations = {}
    for idx, col in enumerate(selected_columns):
        if col.startswith("histogram("):
            with sentry_sdk.start_span(
                    op="discover.discover",
                    description="query.histogram_calculation") as span:
                span.set_data("histogram", col)
                histogram_column = find_histogram_buckets(
                    col, params, snuba_filter.conditions)
                selected_columns[idx] = histogram_column
                snuba_name = get_function_alias(histogram_column)
                sentry_name = get_function_alias(col)
                function_translations[snuba_name] = sentry_name
                # Since we're completely renaming the histogram function, we also
                # need to check whether we are ordering by the histogram values,
                # and update that ordering to use the new alias.
                if orderby is not None:
                    orderby = list(orderby) if isinstance(
                        orderby, (list, tuple)) else [orderby]
                    for i, ordering in enumerate(orderby):
                        if sentry_name == ordering.lstrip("-"):
                            orderby[i] = "{}{}".format(
                                "-" if ordering.startswith("-") else "",
                                snuba_name)

            break

    with sentry_sdk.start_span(op="discover.discover",
                               description="query.field_translations"):
        if orderby is not None:
            orderby = list(orderby) if isinstance(orderby,
                                                  (list,
                                                   tuple)) else [orderby]
            snuba_filter.orderby = [get_function_alias(o) for o in orderby]

        resolved_fields = resolve_field_list(
            selected_columns,
            snuba_filter,
            auto_fields=auto_fields,
            auto_aggregations=auto_aggregations,
            functions_acl=functions_acl,
        )

        snuba_filter.update_with(resolved_fields)

        # Resolve the public aliases into the discover dataset names.
        snuba_filter, translated_columns = resolve_discover_aliases(
            snuba_filter, function_translations)

        # Make sure that any aggregate conditions are also in the selected columns
        for having_clause in snuba_filter.having:
            # The first element of the having can be an alias, or a nested array of functions. Loop through to make sure
            # any referenced functions are in the aggregations.
            error_extra = u", and could not be automatically added" if auto_aggregations else u""
            if isinstance(having_clause[0], (list, tuple)):
                # Functions are of the form [fn, [args]]
                args_to_check = [[having_clause[0]]]
                conditions_not_in_aggregations = []
                while len(args_to_check) > 0:
                    args = args_to_check.pop()
                    for arg in args:
                        if arg[0] in [SNUBA_AND, SNUBA_OR]:
                            args_to_check.extend(arg[1])
                        # Only need to iterate on arg[1] if it's a list
                        elif isinstance(arg[1], (list, tuple)):
                            alias = arg[1][0]
                            found = any(
                                alias == agg_clause[-1]
                                for agg_clause in snuba_filter.aggregations)
                            if not found:
                                conditions_not_in_aggregations.append(alias)

                if len(conditions_not_in_aggregations) > 0:
                    raise InvalidSearchQuery(
                        u"Aggregate(s) {} used in a condition but are not in the selected columns{}."
                        .format(
                            ", ".join(conditions_not_in_aggregations),
                            error_extra,
                        ))
            else:
                found = any(having_clause[0] == agg_clause[-1]
                            for agg_clause in snuba_filter.aggregations)
                if not found:
                    raise InvalidSearchQuery(
                        u"Aggregate {} used in a condition but is not a selected column{}."
                        .format(
                            having_clause[0],
                            error_extra,
                        ))

        if conditions is not None:
            snuba_filter.conditions.extend(conditions)

    with sentry_sdk.start_span(op="discover.discover",
                               description="query.snuba_query"):
        result = raw_query(
            start=snuba_filter.start,
            end=snuba_filter.end,
            groupby=snuba_filter.groupby,
            conditions=snuba_filter.conditions,
            aggregations=snuba_filter.aggregations,
            selected_columns=snuba_filter.selected_columns,
            filter_keys=snuba_filter.filter_keys,
            having=snuba_filter.having,
            orderby=snuba_filter.orderby,
            dataset=Dataset.Discover,
            limit=limit,
            offset=offset,
            referrer=referrer,
        )

    with sentry_sdk.start_span(op="discover.discover",
                               description="query.transform_results") as span:
        span.set_data("result_count", len(result.get("data", [])))
        return transform_results(result, resolved_fields["functions"],
                                 translated_columns, snuba_filter,
                                 selected_columns)
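
A minimal usage sketch for this entry point (hypothetical call site; the project id, dates, and referrer are placeholders, and a configured Sentry/Snuba environment is assumed):

from datetime import datetime, timedelta

end = datetime.utcnow()
params = {
    "start": end - timedelta(days=7),
    "end": end,
    "project_id": [1],  # placeholder project id
}
# Top transactions by volume over the last week.
results = query(
    selected_columns=["transaction", "count()"],
    query="event.type:transaction",
    params=params,
    orderby=["-count"],
    limit=10,
    referrer="api.example.top-transactions",  # placeholder referrer
)
# results["data"] holds rows keyed by the public aliases,
# e.g. {"transaction": "/checkout", "count": 1234}.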
Example #4
def query(
    selected_columns,
    query,
    params,
    orderby=None,
    offset=None,
    limit=50,
    reference_event=None,
    referrer=None,
    auto_fields=False,
):
    """
    High-level API for doing arbitrary user queries against events.

    This function operates on the Discover public event schema.
    Virtual fields and aggregate functions are supported for both
    selected columns and conditions.

    The resulting list will have all internal field names mapped
    back into their public schema names.

    selected_columns (Sequence[str]) List of public aliases to fetch.
    query (str) Filter query string to create conditions from.
    params (Dict[str, str]) Filtering parameters with start, end, project_id, environment
    orderby (None|str|Sequence[str]) The field to order results by.
    offset (None|int) The record offset to read.
    limit (int) The number of records to fetch.
    reference_event (ReferenceEvent) A reference event object. Used to generate additional
                    conditions based on the provided reference.
    referrer (str|None) A referrer string to help locate the origin of this query.
    auto_fields (bool) Set to true to have project + eventid fields automatically added.
    """
    snuba_filter = get_filter(query, params)

    # TODO(mark) Refactor the need for this translation shim once all of
    # discover is using this module. Remember to update all the functions
    # in this module.
    snuba_args = {
        "start": snuba_filter.start,
        "end": snuba_filter.end,
        "conditions": snuba_filter.conditions,
        "filter_keys": snuba_filter.filter_keys,
        "orderby": orderby,
    }
    if not selected_columns:
        raise InvalidSearchQuery("No fields provided")
    snuba_args.update(resolve_field_list(selected_columns, snuba_args, auto_fields=auto_fields))

    if reference_event:
        ref_conditions = create_reference_event_conditions(reference_event)
        if ref_conditions:
            snuba_args["conditions"].extend(ref_conditions)

    # Resolve the public aliases into the discover dataset names.
    snuba_args, translated_columns = resolve_discover_aliases(snuba_args)

    result = raw_query(
        start=snuba_args.get("start"),
        end=snuba_args.get("end"),
        groupby=snuba_args.get("groupby"),
        conditions=snuba_args.get("conditions"),
        aggregations=snuba_args.get("aggregations"),
        selected_columns=snuba_args.get("selected_columns"),
        filter_keys=snuba_args.get("filter_keys"),
        orderby=snuba_args.get("orderby"),
        dataset=Dataset.Discover,
        limit=limit,
        offset=offset,
        referrer=referrer,
    )

    return transform_results(result, translated_columns, snuba_args)
Example #5
def convert_status_value(value, projects, user, environments):
    try:
        return parse_status_value(value)
    except ValueError:
        raise InvalidSearchQuery("invalid status value of '{}'".format(value))
Example #6
    def visit_boolean_operator(self, node, children):
        raise InvalidSearchQuery(
            'Boolean statements containing "OR" or "AND" are not supported in this search'
        )
Example #7
def query(
    selected_columns,
    query,
    params,
    orderby=None,
    offset=None,
    limit=50,
    reference_event=None,
    referrer=None,
    auto_fields=False,
    use_aggregate_conditions=False,
    conditions=None,
):
    """
    High-level API for doing arbitrary user queries against events.

    This function operates on the Discover public event schema.
    Virtual fields and aggregate functions are supported for both
    selected columns and conditions.

    The resulting list will have all internal field names mapped
    back into their public schema names.

    selected_columns (Sequence[str]) List of public aliases to fetch.
    query (str) Filter query string to create conditions from.
    params (Dict[str, str]) Filtering parameters with start, end, project_id, environment
    orderby (None|str|Sequence[str]) The field to order results by.
    offset (None|int) The record offset to read.
    limit (int) The number of records to fetch.
    reference_event (ReferenceEvent) A reference event object. Used to generate additional
                    conditions based on the provided reference.
    referrer (str|None) A referrer string to help locate the origin of this query.
    auto_fields (bool) Set to true to have project + eventid fields automatically added.
    conditions (Sequence[any]) List of conditions that are passed directly to snuba without
                    any additional processing.
    """
    if not selected_columns:
        raise InvalidSearchQuery("No columns selected")

    with sentry_sdk.start_span(op="discover.discover",
                               description="query.filter_transform") as span:
        span.set_data("query", query)
        # TODO(evanh): These can be removed once we migrate the frontend / saved queries
        # to use the new function values
        selected_columns, function_translations = transform_deprecated_functions_in_columns(
            selected_columns)
        query = transform_deprecated_functions_in_query(query)

        snuba_filter = get_filter(query, params)
        if not use_aggregate_conditions:
            snuba_filter.having = []

    # We need to run a separate query to properly bucket the values for the histogram.
    # Do that here, and format the bucket number into the columns before passing them
    # through to event search.
    for idx, col in enumerate(selected_columns):
        if col.startswith("histogram("):
            with sentry_sdk.start_span(
                    op="discover.discover",
                    description="query.histogram_calculation") as span:
                span.set_data("histogram", col)
                histogram_column = find_histogram_buckets(
                    col, params, snuba_filter.conditions)
                selected_columns[idx] = histogram_column
                function_translations[get_function_alias(
                    histogram_column)] = get_function_alias(col)

            break

    with sentry_sdk.start_span(op="discover.discover",
                               description="query.field_translations"):
        # Check to see if we are ordering by any functions and convert the orderby to be the correct alias.
        if orderby:
            orderby = orderby if isinstance(orderby,
                                            (list, tuple)) else [orderby]
            new_orderby = []
            for ordering in orderby:
                is_reversed = ordering.startswith("-")
                ordering = ordering.lstrip("-")
                for snuba_name, sentry_name in six.iteritems(
                        function_translations):
                    if sentry_name == ordering:
                        ordering = snuba_name
                        break

                ordering = "{}{}".format("-" if is_reversed else "", ordering)
                new_orderby.append(ordering)

            snuba_filter.orderby = new_orderby

        snuba_filter.update_with(
            resolve_field_list(selected_columns,
                               snuba_filter,
                               auto_fields=auto_fields))

        if reference_event:
            ref_conditions = create_reference_event_conditions(reference_event)
            if ref_conditions:
                snuba_filter.conditions.extend(ref_conditions)

        # Resolve the public aliases into the discover dataset names.
        snuba_filter, translated_columns = resolve_discover_aliases(
            snuba_filter, function_translations)

        # Make sure that any aggregate conditions are also in the selected columns
        for having_clause in snuba_filter.having:
            found = any(having_clause[0] == agg_clause[-1]
                        for agg_clause in snuba_filter.aggregations)
            if not found:
                raise InvalidSearchQuery(
                    u"Aggregate {} used in a condition but is not a selected column."
                    .format(having_clause[0]))

        if conditions is not None:
            snuba_filter.conditions.extend(conditions)

    with sentry_sdk.start_span(op="discover.discover",
                               description="query.snuba_query"):
        result = raw_query(
            start=snuba_filter.start,
            end=snuba_filter.end,
            groupby=snuba_filter.groupby,
            conditions=snuba_filter.conditions,
            aggregations=snuba_filter.aggregations,
            selected_columns=snuba_filter.selected_columns,
            filter_keys=snuba_filter.filter_keys,
            having=snuba_filter.having,
            orderby=snuba_filter.orderby,
            dataset=Dataset.Discover,
            limit=limit,
            offset=offset,
            referrer=referrer,
        )

    with sentry_sdk.start_span(op="discover.discover",
                               description="query.transform_results") as span:
        span.set_data("result_count", len(result.get("data", [])))
        return transform_results(result, translated_columns, snuba_filter,
                                 selected_columns)
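
The orderby remapping above is plain dictionary bookkeeping and can be checked in isolation. A self-contained sketch of the same reverse lookup (plain Python, not Sentry code; the alias names are made up):

# Map public orderby names to their snuba aliases, preserving a leading "-".
function_translations = {
    "histogram_transaction_duration_10_483_0": "histogram_transaction_duration_10",
}

def translate_orderby(orderby, translations):
    orderby = orderby if isinstance(orderby, (list, tuple)) else [orderby]
    new_orderby = []
    for ordering in orderby:
        is_reversed = ordering.startswith("-")
        bare = ordering.lstrip("-")
        for snuba_name, sentry_name in translations.items():
            if sentry_name == bare:
                bare = snuba_name
                break
        new_orderby.append("{}{}".format("-" if is_reversed else "", bare))
    return new_orderby

print(translate_orderby("-histogram_transaction_duration_10", function_translations))
# ['-histogram_transaction_duration_10_483_0']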
Example #8
def find_histogram_buckets(field, params, conditions):
    match = is_function(field)
    if not match:
        raise InvalidSearchQuery(
            u"received {}, expected histogram function".format(field))

    columns = [
        c.strip() for c in match.group("columns").split(",")
        if len(c.strip()) > 0
    ]

    if len(columns) != 2:
        raise InvalidSearchQuery(
            u"histogram(...) expects 2 column arguments, received {:g} arguments"
            .format(len(columns)))

    column = columns[0]
    # TODO evanh: This can be expanded to more fields at a later date, for now keep this limited.
    if column != "transaction.duration":
        raise InvalidSearchQuery(
            "histogram(...) can only be used with the transaction.duration column"
        )

    try:
        num_buckets = int(columns[1])
        if num_buckets < 1 or num_buckets > 500:
            raise ValueError("bucket count out of range")
    except ValueError:
        raise InvalidSearchQuery(
            u"histogram(...) requires a bucket value between 1 and 500, not {}"
            .format(columns[1]))

    alias = u"max_{}".format(column)

    conditions = deepcopy(conditions) if conditions else []
    found = False
    for cond in conditions:
        if len(cond) == 3 and (cond[0], cond[1], cond[2]) == ("event.type", "=", "transaction"):
            found = True
            break
    if not found:
        conditions.append(["event.type", "=", "transaction"])
    translated_args, _ = resolve_discover_aliases({"conditions": conditions})

    results = raw_query(
        filter_keys={"project_id": params.get("project_id")},
        start=params.get("start"),
        end=params.get("end"),
        dataset=Dataset.Discover,
        conditions=translated_args["conditions"],
        aggregations=[["max", "duration", alias]],
    )
    if len(results["data"]) != 1:
        # If there are no transactions there is no max duration, so return a single empty bucket
        return "histogram({}, 1, 1)".format(column)

    bucket_max = results["data"][0][alias]
    if bucket_max == 0:
        raise InvalidSearchQuery(
            u"Cannot calculate histogram for {}".format(field))

    bucket_number = ceil(bucket_max / float(num_buckets))

    return "histogram({}, {:g}, {:g})".format(column, num_buckets,
                                              bucket_number)
Example #9
    def query(
        self,
        projects,
        retention_window_start,
        group_queryset,
        environments,
        sort_by,
        limit,
        cursor,
        count_hits,
        paginator_options,
        search_filters,
        date_from,
        date_to,
        max_hits=None,
    ):

        now = timezone.now()
        end = None
        end_params = [
            _f for _f in
            [date_to, get_search_filter(search_filters, "date", "<")] if _f
        ]
        if end_params:
            end = min(end_params)

        if not end:
            end = now + ALLOWED_FUTURE_DELTA

            metrics.incr("snuba.search.postgres_only")

            # This search is for some time window that ends with "now",
            # so if the requested sort is `date` (`last_seen`) and there
            # are no other Snuba-based search predicates, we can simply
            # return the results from Postgres.
            if (cursor is None and sort_by == "date" and
                    # This handles tags and date parameters for search filters.
                    not [
                        sf for sf in search_filters if sf.key.name not in
                        self.postgres_only_fields.union(["date"])
                    ]):
                group_queryset = group_queryset.order_by("-last_seen")
                paginator = DateTimePaginator(group_queryset, "-last_seen",
                                              **paginator_options)
                # When it's a simple Django-only search, we count hits as normal.
                return paginator.get_result(limit,
                                            cursor,
                                            count_hits=count_hits,
                                            max_hits=max_hits)

        # TODO: Presumably we only want to search back to the project's max
        # retention date, which may be closer than 90 days in the past, but
        # apparently `retention_window_start` can be None(?), so we need a
        # fallback.
        retention_date = max([
            _f for _f in [retention_window_start, now - timedelta(days=90)]
            if _f
        ])
        start_params = [
            date_from, retention_date,
            get_search_filter(search_filters, "date", ">")
        ]
        start = max([_f for _f in start_params if _f])
        end = max([retention_date, end])

        if start == retention_date and end == retention_date:
            # Both `start` and `end` must have been trimmed to `retention_date`,
            # so this entire search was against a time range that is outside of
            # retention. We'll return empty results to maintain backwards compatibility
            # with Django search (for now).
            return self.empty_result

        if start >= end:
            # TODO: This maintains backwards compatibility with Django search, but
            # in the future we should find a way to notify the user that their search
            # is invalid.
            return self.empty_result

        # This search is specific to Inbox. If we're using inbox sort and only querying
        # postgres then we can use this sort method. Otherwise if we need to go to Snuba,
        # fail.
        if (sort_by == "inbox"
                and get_search_filter(search_filters, "for_review", "=")
                # This handles tags and date parameters for search filters.
                and not [
                    sf for sf in search_filters if sf.key.name not in
                    self.postgres_only_fields.union(["date"])
                ]):
            # We just filter on `GroupInbox.date_added` here, and don't filter by date
            # on the group. This keeps the query simpler and faster in some edge cases,
            # and date_added is a good enough proxy when we're using this sort.
            group_queryset = group_queryset.filter(
                groupinbox__date_added__gte=start,
                groupinbox__date_added__lte=end,
            )
            group_queryset = group_queryset.extra(select={
                "inbox_date":
                "sentry_groupinbox.date_added"
            }, ).order_by("-inbox_date")
            paginator = DateTimePaginator(group_queryset, "-inbox_date",
                                          **paginator_options)
            return paginator.get_result(limit,
                                        cursor,
                                        count_hits=count_hits,
                                        max_hits=max_hits)

        if sort_by == "inbox":
            raise InvalidSearchQuery(
                f"Sort key '{sort_by}' only supported for inbox search")

        # Here we check if all the django filters reduce the set of groups down
        # to something that we can send down to Snuba in a `group_id IN (...)`
        # clause.
        max_candidates = options.get("snuba.search.max-pre-snuba-candidates")

        with sentry_sdk.start_span(op="snuba_group_query") as span:
            group_ids = list(
                group_queryset.values_list("id",
                                           flat=True)[:max_candidates + 1])
            span.set_data("Max Candidates", max_candidates)
            span.set_data("Result Size", len(group_ids))
        metrics.timing("snuba.search.num_candidates", len(group_ids))

        too_many_candidates = False
        if not group_ids:
            # no matches could possibly be found from this point on
            metrics.incr("snuba.search.no_candidates", skip_internal=False)
            return self.empty_result
        elif len(group_ids) > max_candidates:
            # If the pre-filter query didn't include anything to significantly
            # filter down the number of results (from 'first_release', 'query',
            # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
            # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
            # might have surpassed the `max_candidates`. In this case,
            # we *don't* want to pass candidates down to Snuba, and instead we
            # want Snuba to do all the filtering/sorting it can and *then* apply
            # this queryset to the results from Snuba, which we call
            # post-filtering.
            metrics.incr("snuba.search.too_many_candidates",
                         skip_internal=False)
            too_many_candidates = True
            group_ids = []

        sort_field = self.sort_strategies[sort_by]
        chunk_growth = options.get("snuba.search.chunk-growth-rate")
        max_chunk_size = options.get("snuba.search.max-chunk-size")
        chunk_limit = limit
        offset = 0
        num_chunks = 0
        hits = self.calculate_hits(
            group_ids,
            too_many_candidates,
            sort_field,
            projects,
            retention_window_start,
            group_queryset,
            environments,
            sort_by,
            limit,
            cursor,
            count_hits,
            paginator_options,
            search_filters,
            start,
            end,
        )
        if count_hits and hits == 0:
            return self.empty_result

        paginator_results = self.empty_result
        result_groups = []
        result_group_ids = set()

        max_time = options.get("snuba.search.max-total-chunk-time-seconds")
        time_start = time.time()

        # Do smaller searches in chunks until we have enough results
        # to answer the query (or hit the end of possible results). We do
        # this because a common case for search is to return 100 groups
        # sorted by `last_seen`, and we want to avoid returning all of
        # a project's groups and then post-sorting them all in Postgres
        # when typically the first N results will do.
        while (time.time() - time_start) < max_time:
            num_chunks += 1

            # grow the chunk size on each iteration to account for huge projects
            # and weird queries, up to a max size
            chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
            # but if we have group_ids always query for at least that many items
            chunk_limit = max(chunk_limit, len(group_ids))

            # {group_id: group_score, ...}
            snuba_groups, total = self.snuba_search(
                start=start,
                end=end,
                project_ids=[p.id for p in projects],
                environment_ids=environments
                and [environment.id for environment in environments],
                sort_field=sort_field,
                cursor=cursor,
                group_ids=group_ids,
                limit=chunk_limit,
                offset=offset,
                search_filters=search_filters,
            )
            metrics.timing("snuba.search.num_snuba_results", len(snuba_groups))
            count = len(snuba_groups)
            more_results = count >= limit and (offset + limit) < total
            offset += len(snuba_groups)

            if not snuba_groups:
                break

            if group_ids:
                # pre-filtered candidates were passed down to Snuba, so we're
                # finished with filtering and these are the only results. Note
                # that because we set the chunk size to at least the size of
                # the group_ids, we know we got all of them (ie there are
                # no more chunks after the first)
                result_groups = snuba_groups
                if count_hits and hits is None:
                    hits = len(snuba_groups)
            else:
                # pre-filtered candidates were *not* passed down to Snuba,
                # so we need to do post-filtering to verify Sentry DB predicates
                filtered_group_ids = group_queryset.filter(
                    id__in=[gid
                            for gid, _ in snuba_groups]).values_list("id",
                                                                     flat=True)

                group_to_score = dict(snuba_groups)
                for group_id in filtered_group_ids:
                    if group_id in result_group_ids:
                        # because we're doing multiple Snuba queries, which
                        # happen outside of a transaction, there is a small possibility
                        # of groups moving around in the sort scoring underneath us,
                        # so we at least want to protect against duplicates
                        continue

                    group_score = group_to_score[group_id]
                    result_group_ids.add(group_id)
                    result_groups.append((group_id, group_score))

            # break the query loop for one of three reasons:
            # * we started with Postgres candidates and so only do one Snuba query max
            # * the paginator is returning enough results to satisfy the query (>= the limit)
            # * there are no more groups in Snuba to post-filter
            # TODO do we actually have to rebuild this SequencePaginator every time
            # or can we just make it after we've broken out of the loop?
            paginator_results = SequencePaginator(
                [(score, id) for (id, score) in result_groups],
                reverse=True,
                **paginator_options).get_result(limit,
                                                cursor,
                                                known_hits=hits,
                                                max_hits=max_hits)

            if group_ids or len(
                    paginator_results.results) >= limit or not more_results:
                break

        # HACK: We're using the SequencePaginator to mask the complexities of going
        # back and forth between two databases. This causes a problem with pagination
        # because we're 'lying' to the SequencePaginator (it thinks it has the entire
        # result set in memory when it does not). For this reason we need to make some
        # best guesses as to whether the `prev` and `next` cursors have more results.

        if len(paginator_results.results) == limit and more_results:
            # Because we are going back and forth between DBs there is a small
            # chance that we will hand the SequencePaginator exactly `limit`
            # items. In this case the paginator will assume there are no more
            # results, so we need to override the `next` cursor's results.
            paginator_results.next.has_results = True

        if cursor is not None and (not cursor.is_prev
                                   or len(paginator_results.results) > 0):
            # If the user passed a cursor, and it isn't already a 0 result `is_prev`
            # cursor, then it's worth allowing them to go back a page to check for
            # more results.
            paginator_results.prev.has_results = True

        metrics.timing("snuba.search.num_chunks", num_chunks)

        groups = Group.objects.in_bulk(paginator_results.results)
        paginator_results.results = [
            groups[k] for k in paginator_results.results if k in groups
        ]

        return paginator_results
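
The chunk-sizing policy in the search loop is easy to see on its own. A self-contained sketch of how the chunk limit grows between Snuba queries (sample option values, not Sentry code):

# Sample option values: grow each chunk by 1.5x, capped at 10000 rows.
chunk_growth, max_chunk_size = 1.5, 10000

chunk_limit = 100  # initial limit, e.g. the requested page size
for num_chunks in range(1, 6):
    chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
    print(num_chunks, chunk_limit)
# 1 150 / 2 225 / 3 337 / 4 505 / 5 757 -- widening scans for large projects.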
Example #10
    def get_event_stats_data(
        self,
        request,
        organization,
        get_event_stats,
        top_events=0,
        query_column="count()",
        params=None,
        query=None,
        allow_partial_buckets=False,
    ):
        with self.handle_query_errors():
            with sentry_sdk.start_span(
                op="discover.endpoint", description="base.stats_query_creation"
            ):
                columns = request.GET.getlist("yAxis", [query_column])
                if query is None:
                    query = request.GET.get("query")
                if params is None:
                    try:
                        # events-stats is still used by events v1 which doesn't require global views
                        params = self.get_snuba_params(
                            request, organization, check_global_views=False
                        )
                    except NoProjects:
                        return {"data": []}

                rollup = get_rollup_from_request(
                    request,
                    params,
                    default_interval=None,
                    error=InvalidSearchQuery(
                        "Your interval and date range would create too many results. "
                        "Use a larger interval, or a smaller date range."
                    ),
                    top_events=top_events,
                )
                # Backwards compatibility for incidents which uses the old
                # column aliases as it straddles both versions of events/discover.
                # We will need these aliases until discover2 flags are enabled for all
                # users.
                # We need these rollup columns to generate correct events-stats results
                column_map = {
                    "user_count": "count_unique(user)",
                    "event_count": "count()",
                    "epm()": "epm(%d)" % rollup,
                    "eps()": "eps(%d)" % rollup,
                    "tpm()": "tpm(%d)" % rollup,
                    "tps()": "tps(%d)" % rollup,
                }
                query_columns = [column_map.get(column, column) for column in columns]
            with sentry_sdk.start_span(op="discover.endpoint", description="base.stats_query"):
                result = get_event_stats(query_columns, query, params, rollup)

        serializer = SnubaTSResultSerializer(organization, None, request.user)

        with sentry_sdk.start_span(op="discover.endpoint", description="base.stats_serialization"):
            # When the request is for top_events, result can be a SnubaTSResult in the event that
            # there were no top events found. In this case, result contains a zerofilled series
            # that acts as a placeholder.
            if top_events > 0 and isinstance(result, dict):
                results = {}
                for key, event_result in result.items():
                    if len(query_columns) > 1:
                        results[key] = self.serialize_multiple_axis(
                            serializer, event_result, columns, query_columns, allow_partial_buckets
                        )
                    else:
                        # Need to get function alias if count is a field, but not the axis
                        results[key] = serializer.serialize(
                            event_result,
                            column=get_function_alias(query_columns[0]),
                            allow_partial_buckets=allow_partial_buckets,
                        )
                return results
            elif len(query_columns) > 1:
                return self.serialize_multiple_axis(
                    serializer, result, columns, query_columns, allow_partial_buckets
                )
            else:
                return serializer.serialize(result, allow_partial_buckets=allow_partial_buckets)
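
The rollup-dependent alias rewrite above is a plain lookup with fallback. A self-contained sketch with sample values (plain Python, not Sentry code):

rollup = 3600  # bucket width in seconds
column_map = {
    "user_count": "count_unique(user)",
    "event_count": "count()",
    "epm()": "epm(%d)" % rollup,
}
columns = ["event_count", "epm()", "p95()"]
query_columns = [column_map.get(column, column) for column in columns]
print(query_columns)  # ['count()', 'epm(3600)', 'p95()'] -- unknown columns pass through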
Example #11
def histogram_query(
    fields,
    user_query,
    params,
    num_buckets,
    precision=0,
    min_value=None,
    max_value=None,
    data_filter=None,
    referrer=None,
):
    """
    API for generating histograms for numeric columns.

    A multihistogram is possible only if the columns are all measurements.
    The resulting histograms will have their bins aligned.

    :param [str] fields: The list of fields to generate histograms for.
    :param str user_query: Filter query string to create conditions from.
    :param {str: str} params: Filtering parameters with start, end, project_id, environment
    :param int num_buckets: The number of buckets the histogram should contain.
    :param int precision: The number of decimal places to preserve, default 0.
    :param float min_value: The minimum value allowed to be in the histogram.
        If left unspecified, it is queried using `user_query` and `params`.
    :param float max_value: The maximum value allowed to be in the histogram.
        If left unspecified, it is queried using `user_query` and `params`.
    :param str data_filter: Indicate the filter strategy to be applied to the data.
    """

    multiplier = int(10 ** precision)
    if max_value is not None:
        # We want the specified max_value to be exclusive, and the queried max_value
        # to be inclusive. So we adjust the specified max_value using the multiplier.
        max_value -= 0.1 / multiplier
    min_value, max_value = find_histogram_min_max(
        fields, min_value, max_value, user_query, params, data_filter
    )

    key_column = None
    conditions = []
    if len(fields) > 1:
        key_column = "array_join(measurements_key)"
        key_alias = get_function_alias(key_column)
        measurements = []
        for f in fields:
            measurement = get_measurement_name(f)
            if measurement is None:
                raise InvalidSearchQuery(f"multihistogram expected all measurements, received: {f}")
            measurements.append(measurement)
        conditions.append([key_alias, "IN", measurements])

    histogram_params = find_histogram_params(num_buckets, min_value, max_value, multiplier)
    histogram_column = get_histogram_column(fields, key_column, histogram_params)
    histogram_alias = get_function_alias(histogram_column)

    if min_value is None or max_value is None:
        return normalize_histogram_results(fields, key_column, histogram_params, {"data": []})

    # Both bounds are known past this point; bound the bins to get the desired
    # range of results.
    min_bin = histogram_params.start_offset
    conditions.append([histogram_alias, ">=", min_bin])
    max_bin = histogram_params.start_offset + histogram_params.bucket_size * num_buckets
    conditions.append([histogram_alias, "<=", max_bin])

    columns = [] if key_column is None else [key_column]
    results = query(
        selected_columns=columns + [histogram_column, "count()"],
        conditions=conditions,
        query=user_query,
        params=params,
        orderby=[histogram_alias],
        limit=len(fields) * num_buckets,
        referrer=referrer,
        functions_acl=["array_join", "histogram"],
    )

    return normalize_histogram_results(fields, key_column, histogram_params, results)
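
A minimal usage sketch (hypothetical call site; the project id, dates, and referrer are placeholders, and a configured Sentry/Snuba environment is assumed):

from datetime import datetime, timedelta

end = datetime.utcnow()
params = {
    "start": end - timedelta(days=1),
    "end": end,
    "project_id": [1],  # placeholder project id
}
# 50 aligned buckets across two measurement columns (a multihistogram).
results = histogram_query(
    fields=["measurements.lcp", "measurements.fcp"],
    user_query="event.type:transaction",
    params=params,
    num_buckets=50,
    referrer="api.example.histogram",  # placeholder referrer
)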