Example #1
File: backend.py  Project: mnoble/sentry
    def _query(self, project, retention_window_start, group_queryset, tags, environment,
               sort_by, limit, cursor, count_hits, paginator_options, **parameters):

        # TODO: Product decision: we currently search Group.message to handle
        # the `query` parameter, because that's what we've always done. We could
        # do that search against every event in Snuba instead, but results may
        # differ.

        now = timezone.now()
        end = parameters.get('date_to') or (now + ALLOWED_FUTURE_DELTA)
        # TODO: Presumably we want to search back to the project's full retention,
        #       which may be more than 90 days in the past, but apparently
        #       `retention_window_start` can be None(?), so we need a fallback.
        start = max(
            filter(None, [
                retention_window_start,
                parameters.get('date_from'),
                now - timedelta(days=90)
            ])
        )
        assert start < end

        # TODO: It's possible `first_release` could be handled by Snuba.
        if environment is not None:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.extra(
                        where=[
                            '{} = {}'.format(
                                ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                                ds.get_sql_column(Release, 'id'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'organization'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'version'),
                            ),
                        ],
                        params=[project.organization_id, version],
                        tables=[Release._meta.db_table],
                    ),
                ),
            }).build(
                group_queryset.extra(
                    where=[
                        u'{} = {}'.format(
                            ds.get_sql_column(Group, 'id'),
                            ds.get_sql_column(GroupEnvironment, 'group_id'),
                        ),
                        u'{} = %s'.format(
                            ds.get_sql_column(GroupEnvironment, 'environment_id'),
                        ),
                    ],
                    params=[environment.id],
                    tables=[GroupEnvironment._meta.db_table],
                ),
                parameters,
            )
        else:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.filter(
                        first_release__organization_id=project.organization_id,
                        first_release__version=version,
                    ),
                ),
            }).build(
                group_queryset,
                parameters,
            )

        # pre-filter query
        candidate_hashes = dict(
            GroupHash.objects.filter(
                group__in=group_queryset
            ).values_list(
                'hash', 'group_id'
            )[:MAX_PRE_SNUBA_CANDIDATES + 1]
        )
        metrics.timing('snuba.search.num_candidates', len(candidate_hashes))

        if not candidate_hashes:
            # no matches could possibly be found from this point on
            metrics.incr('snuba.search.no_candidates')
            return Paginator(Group.objects.none()).get_result()
        elif len(candidate_hashes) > MAX_PRE_SNUBA_CANDIDATES:
            # If the pre-filter query didn't include anything to significantly
            # filter down the number of results (from 'first_release', 'query',
            # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
            # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
            # might have surpassed the MAX_PRE_SNUBA_CANDIDATES. In this case,
            # we *don't* want to pass candidates down to Snuba, and instead we
            # want Snuba to do all the filtering/sorting it can and *then* apply
            # this queryset to the results from Snuba, which we call
            # post-filtering.
            metrics.incr('snuba.search.too_many_candidates')
            candidate_hashes = None

        sort, extra_aggregations, score_fn = sort_strategies[sort_by]

        # {group_id: group_score, ...}
        snuba_groups = snuba_search(
            project_id=project.id,
            environment_id=environment and environment.id,
            tags=tags,
            start=start,
            end=end,
            sort=sort,
            extra_aggregations=extra_aggregations,
            score_fn=score_fn,
            candidate_hashes=candidate_hashes,
            **parameters
        )
        metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))

        if candidate_hashes:
            # pre-filtered candidates were passed down to Snuba,
            # so we're finished with filtering
            result_groups = snuba_groups.items()
        else:
            # pre-filtered candidates were *not* passed down to Snuba,
            # so we need to do post-filtering to verify Sentry DB predicates
            result_groups = []
            i = 0
            for i, chunk in enumerate(chunked(snuba_groups.items(), MAX_POST_SNUBA_CHUNK), 1):
                filtered_group_ids = group_queryset.filter(
                    id__in=[gid for gid, _ in chunk]
                ).values_list('id', flat=True)

                result_groups.extend(
                    (group_id, snuba_groups[group_id])
                    for group_id in filtered_group_ids
                )

            metrics.timing('snuba.search.num_post_filters', i)

        paginator_results = SequencePaginator(
            [(score, id) for (id, score) in result_groups],
            reverse=True,
            **paginator_options
        ).get_result(limit, cursor, count_hits=count_hits)

        groups = Group.objects.in_bulk(paginator_results.results)
        paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

        return paginator_results
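
The `chunked` helper used in the post-filter loop above is not shown in this excerpt. A minimal sketch of the assumed behavior (yield the iterable in lists of at most `size` items), so that `chunked(snuba_groups.items(), MAX_POST_SNUBA_CHUNK)` feeds the `id__in` filter in manageable batches:

    def chunked(iterable, size):
        # assumed behavior: yield successive lists of at most `size` items
        chunk = []
        for item in iterable:
            chunk.append(item)
            if len(chunk) == size:
                yield chunk
                chunk = []
        if chunk:
            yield chunk
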
Example #2
    def _query(self, projects, retention_window_start, group_queryset, tags, environments,
               sort_by, limit, cursor, count_hits, paginator_options, **parameters):

        # TODO: Product decision: we currently search Group.message to handle
        # the `query` parameter, because that's what we've always done. We could
        # do that search against every event in Snuba instead, but results may
        # differ.

        # TODO: It's possible `first_release` could be handled by Snuba.
        if environments is not None:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.extra(
                        where=[
                            '{} = {}'.format(
                                ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                                ds.get_sql_column(Release, 'id'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'organization'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'version'),
                            ),
                        ],
                        params=[projects[0].organization_id, version],
                        tables=[Release._meta.db_table],
                    ),
                ),
            }).build(
                group_queryset.extra(
                    where=[
                        u'{} = {}'.format(
                            ds.get_sql_column(Group, 'id'),
                            ds.get_sql_column(GroupEnvironment, 'group_id'),
                        ),
                        u'{} IN ({})'.format(
                            ds.get_sql_column(GroupEnvironment, 'environment_id'),
                            ', '.join(['%s' for e in environments])
                        ),
                    ],
                    params=[environment.id for environment in environments],
                    tables=[GroupEnvironment._meta.db_table],
                ),
                parameters,
            )
        else:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.filter(
                        first_release__organization_id=projects[0].organization_id,
                        first_release__version=version,
                    ),
                ),
            }).build(
                group_queryset,
                parameters,
            )

        now = timezone.now()
        end = parameters.get('date_to')
        if not end:
            end = now + ALLOWED_FUTURE_DELTA

            # This search is for some time window that ends with "now",
            # so if the requested sort is `date` (`last_seen`) and there
            # are no other Snuba-based search predicates, we can simply
            # return the results from Postgres.
            if cursor is None \
                    and sort_by == 'date' \
                    and not tags \
                    and not environments \
                    and not any(param in parameters for param in [
                        'age_from', 'age_to', 'last_seen_from',
                        'last_seen_to', 'times_seen', 'times_seen_lower',
                        'times_seen_upper'
                    ]):
                group_queryset = group_queryset.order_by('-last_seen')
                paginator = DateTimePaginator(group_queryset, '-last_seen', **paginator_options)
                # When it's a simple django-only search, we count_hits like normal
                return paginator.get_result(limit, cursor, count_hits=count_hits)

        # TODO: Presumably we only want to search back to the project's max
        # retention date, which may be closer than 90 days in the past, but
        # apparently `retention_window_start` can be None(?), so we need a
        # fallback.
        retention_date = max(
            filter(None, [
                retention_window_start,
                now - timedelta(days=90)
            ])
        )

        start = max(
            filter(None, [
                retention_date,
                parameters.get('date_from'),
            ])
        )

        end = max([
            retention_date,
            end
        ])

        if start == retention_date and end == retention_date:
            # Both `start` and `end` must have been trimmed to `retention_date`,
            # so this entire search was against a time range that is outside of
            # retention. We'll return empty results to maintain backwards compatibility
            # with Django search (for now).
            return EMPTY_RESULT

        if start >= end:
            # TODO: This maintains backwards compatibility with Django search, but
            # in the future we should find a way to notify the user that their search
            # is invalid.
            return EMPTY_RESULT

        # Here we check if all the django filters reduce the set of groups down
        # to something that we can send down to Snuba in a `group_id IN (...)`
        # clause.
        max_candidates = options.get('snuba.search.max-pre-snuba-candidates')
        candidate_ids = list(
            group_queryset.values_list('id', flat=True)[:max_candidates + 1]
        )
        metrics.timing('snuba.search.num_candidates', len(candidate_ids))
        if not candidate_ids:
            # no matches could possibly be found from this point on
            metrics.incr('snuba.search.no_candidates', skip_internal=False)
            return EMPTY_RESULT
        elif len(candidate_ids) > max_candidates:
            # If the pre-filter query didn't include anything to significantly
            # filter down the number of results (from 'first_release', 'query',
            # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
            # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
            # might have surpassed the `max_candidates`. In this case,
            # we *don't* want to pass candidates down to Snuba, and instead we
            # want Snuba to do all the filtering/sorting it can and *then* apply
            # this queryset to the results from Snuba, which we call
            # post-filtering.
            metrics.incr('snuba.search.too_many_candidates', skip_internal=False)
            candidate_ids = None

        sort_field = sort_strategies[sort_by]
        chunk_growth = options.get('snuba.search.chunk-growth-rate')
        max_chunk_size = options.get('snuba.search.max-chunk-size')
        chunk_limit = limit
        offset = 0
        num_chunks = 0
        hits = None

        paginator_results = EMPTY_RESULT
        result_groups = []
        result_group_ids = set()

        max_time = options.get('snuba.search.max-total-chunk-time-seconds')
        time_start = time.time()

        if count_hits and candidate_ids is None:
            # If we have no candidates, get a random sample of groups matching
            # the snuba side of the query, and see how many of those pass the
            # post-filter in postgres. This should give us an estimate of the
            # total number of snuba matches that will be overall matches, which
            # we can use to get an estimate for X-Hits. Note no cursor, so we
            # are always estimating the total hits.

            # The number of samples required to achieve a certain error bound
            # with a certain confidence interval can be calculated from a
            # rearrangement of the normal approximation (Wald) confidence
            # interval formula:
            #
            # https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval
            #
            # Effectively if we want the estimate to be within +/- 10% of the
            # real value with 95% confidence, we would need (1.96^2 * p*(1-p))
            # / 0.1^2 samples. With a starting assumption of p=0.5 (this
            # requires the most samples) we would need 96 samples to achieve
            # +/-10% @ 95% confidence.
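            #
            # As a quick check of that arithmetic (illustrative only):
            #   n = (z**2 * p * (1 - p)) / e**2
            #     = (1.96**2 * 0.5 * 0.5) / 0.1**2
            #     = 0.9604 / 0.01
            #     = 96.04, i.e. about 96 samples.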

            sample_size = options.get('snuba.search.hits-sample-size')
            snuba_groups, snuba_total = snuba_search(
                start=start,
                end=end,
                project_ids=[p.id for p in projects],
                environment_ids=environments and [environment.id for environment in environments],
                tags=tags,
                sort_field=sort_field,
                limit=sample_size,
                offset=0,
                get_sample=True,
                **parameters
            )
            snuba_count = len(snuba_groups)
            if snuba_count == 0:
                return EMPTY_RESULT
            else:
                filtered_count = group_queryset.filter(
                    id__in=[gid for gid, _ in snuba_groups]
                ).count()

                hit_ratio = filtered_count / float(snuba_count)
                hits = int(hit_ratio * snuba_total)
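                # Illustrative (hypothetical) numbers: if 96 sampled groups come
                # back from Snuba, 72 of them survive the Postgres filter, and
                # snuba_total is 4000, then hit_ratio = 72 / 96.0 = 0.75 and
                # hits = int(0.75 * 4000) = 3000.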

        # Do smaller searches in chunks until we have enough results
        # to answer the query (or hit the end of possible results). We do
        # this because a common case for search is to return 100 groups
        # sorted by `last_seen`, and we want to avoid returning all of
        # a project's groups and then post-sorting them all in Postgres
        # when typically the first N results will do.
        while (time.time() - time_start) < max_time:
            num_chunks += 1

            # grow the chunk size on each iteration to account for huge projects
            # and weird queries, up to a max size
            chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
            # but if we have candidate_ids always query for at least that many items
            chunk_limit = max(chunk_limit, len(candidate_ids) if candidate_ids else 0)
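            # (For instance, assuming limit=100, chunk_growth=1.5 and
            # max_chunk_size=10000, all hypothetical option values, successive
            # chunks would request 150, 225, 337, ... rows until capped.)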

            # {group_id: group_score, ...}
            snuba_groups, total = snuba_search(
                start=start,
                end=end,
                project_ids=[p.id for p in projects],
                environment_ids=environments and [environment.id for environment in environments],
                tags=tags,
                sort_field=sort_field,
                cursor=cursor,
                candidate_ids=candidate_ids,
                limit=chunk_limit,
                offset=offset,
                **parameters
            )
            metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))
            count = len(snuba_groups)
            more_results = count >= limit and (offset + limit) < total
            offset += len(snuba_groups)

            if not snuba_groups:
                break

            if candidate_ids:
                # pre-filtered candidates were passed down to Snuba, so we're
                # finished with filtering and these are the only results. Note
                # that because we set the chunk size to at least the size of
                # the candidate_ids, we know we got all of them (ie there are
                # no more chunks after the first)
                result_groups = snuba_groups
                if count_hits:
                    hits = len(snuba_groups)
            else:
                # pre-filtered candidates were *not* passed down to Snuba,
                # so we need to do post-filtering to verify Sentry DB predicates
                filtered_group_ids = group_queryset.filter(
                    id__in=[gid for gid, _ in snuba_groups]
                ).values_list('id', flat=True)

                group_to_score = dict(snuba_groups)
                for group_id in filtered_group_ids:
                    if group_id in result_group_ids:
                        # because we're doing multiple Snuba queries, which
                        # happen outside of a transaction, there is a small possibility
                        # of groups moving around in the sort scoring underneath us,
                        # so we at least want to protect against duplicates
                        continue

                    group_score = group_to_score[group_id]
                    result_group_ids.add(group_id)
                    result_groups.append((group_id, group_score))

                if count_hits:
                    if not more_results:
                        # We know we have got all possible groups from snuba and filtered
                        # them all down, so we have all hits.
                        # TODO this probably doesn't work because we could be on page N
                        # and not be including hits from previous pages.
                        hits = len(result_groups)
                    else:
                        # We also could have underestimated hits from our sample and have
                        # already seen more hits than the estimate, so make sure hits is
                        # at least as big as what we have seen.
                        hits = max(hits, len(result_groups))

            # TODO do we actually have to rebuild this SequencePaginator every time
            # or can we just make it after we've broken out of the loop?
            paginator_results = SequencePaginator(
                [(score, id) for (id, score) in result_groups],
                reverse=True,
                **paginator_options
            ).get_result(limit, cursor, known_hits=hits)

            # break the query loop for one of three reasons:
            # * we started with Postgres candidates and so only do one Snuba query max
            # * the paginator is returning enough results to satisfy the query (>= the limit)
            # * there are no more groups in Snuba to post-filter
            if candidate_ids \
                    or len(paginator_results.results) >= limit \
                    or not more_results:
                break

        # HACK: We're using the SequencePaginator to mask the complexities of going
        # back and forth between two databases. This causes a problem with pagination
        # because we're 'lying' to the SequencePaginator (it thinks it has the entire
        # result set in memory when it does not). For this reason we need to make some
        # best guesses as to whether the `prev` and `next` cursors have more results.

        if len(paginator_results.results) == limit and more_results:
            # Because we are going back and forth between DBs there is a small
            # chance that we will hand the SequencePaginator exactly `limit`
            # items. In this case the paginator will assume there are no more
            # results, so we need to override the `next` cursor's results.
            paginator_results.next.has_results = True

        if cursor is not None and (not cursor.is_prev or len(paginator_results.results) > 0):
            # If the user passed a cursor, and it isn't already a 0 result `is_prev`
            # cursor, then it's worth allowing them to go back a page to check for
            # more results.
            paginator_results.prev.has_results = True

        metrics.timing('snuba.search.num_chunks', num_chunks)

        groups = Group.objects.in_bulk(paginator_results.results)
        paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

        return paginator_results
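
`EMPTY_RESULT` is defined outside this excerpt; judging by the spots where examples #1 and #4 return `Paginator(Group.objects.none()).get_result()` instead, it is presumably a module-level constant along these lines:

    # presumed definition, mirroring the equivalent inline expression in examples #1 and #4
    EMPTY_RESULT = Paginator(Group.objects.none()).get_result()
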
Example #3
    def _query(self, project, retention_window_start, group_queryset, tags, environment,
               sort_by, limit, cursor, count_hits, paginator_options, **parameters):

        # TODO: Product decision: we currently search Group.message to handle
        # the `query` parameter, because that's what we've always done. We could
        # do that search against every event in Snuba instead, but results may
        # differ.

        now = timezone.now()
        end = parameters.get('date_to') or (now + ALLOWED_FUTURE_DELTA)
        # TODO: Presumably we want to search back to the project's full retention,
        #       which may be more than 90 days in the past, but apparently
        #       `retention_window_start` can be None(?), so we need a fallback.
        start = max(
            filter(None, [
                retention_window_start,
                parameters.get('date_from'),
                now - timedelta(days=90)
            ])
        )
        assert start < end

        # TODO: It's possible `first_release` could be handled by Snuba.
        if environment is not None:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.extra(
                        where=[
                            '{} = {}'.format(
                                ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                                ds.get_sql_column(Release, 'id'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'organization'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'version'),
                            ),
                        ],
                        params=[project.organization_id, version],
                        tables=[Release._meta.db_table],
                    ),
                ),
            }).build(
                group_queryset.extra(
                    where=[
                        '{} = {}'.format(
                            ds.get_sql_column(Group, 'id'),
                            ds.get_sql_column(GroupEnvironment, 'group_id'),
                        ),
                        '{} = %s'.format(
                            ds.get_sql_column(GroupEnvironment, 'environment_id'),
                        ),
                    ],
                    params=[environment.id],
                    tables=[GroupEnvironment._meta.db_table],
                ),
                parameters,
            )
        else:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.filter(
                        first_release__organization_id=project.organization_id,
                        first_release__version=version,
                    ),
                ),
            }).build(
                group_queryset,
                parameters,
            )

        # TODO: If the query didn't include anything to significantly filter
        # down the number of groups at this point ('first_release', 'query',
        # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
        # 'subscribed_by', 'active_at_from', or 'active_at_to') then this
        # queryset might return a *huge* number of groups. In this case, we
        # probably *don't* want to pass candidates down to Snuba, and rather we
        # want Snuba to do all the filtering/sorting it can and *then* apply
        # this queryset to the results from Snuba.
        #
        # However, if this did filter down the number of groups significantly,
        # then passing in candidates is, of course, valuable.
        #
        # Should we decide which way to handle it based on the number of
        # group_ids, the number of hashes? Or should we just always start the
        # query with Snuba? Something else?
        candidate_group_ids = list(group_queryset.values_list('id', flat=True))

        sort, extra_aggregations, calculate_cursor_for_group = sort_strategies[sort_by]

        group_data = do_search(
            project_id=project.id,
            environment_id=environment and environment.id,
            tags=tags,
            start=start,
            end=end,
            sort=sort,
            extra_aggregations=extra_aggregations,
            candidates=candidate_group_ids,
            **parameters
        )

        group_to_score = {}
        for group_id, data in group_data.items():
            group_to_score[group_id] = calculate_cursor_for_group(data)

        paginator_results = SequencePaginator(
            [(score, id) for (id, score) in group_to_score.items()],
            reverse=True,
            **paginator_options
        ).get_result(limit, cursor, count_hits=count_hits)

        groups = Group.objects.in_bulk(paginator_results.results)
        paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

        return paginator_results
Example #4
    def _query(self, project, retention_window_start, group_queryset, tags, environment,
               sort_by, limit, cursor, count_hits, paginator_options, **parameters):

        # TODO: Product decision: we currently search Group.message to handle
        # the `query` parameter, because that's what we've always done. We could
        # do that search against every event in Snuba instead, but results may
        # differ.

        now = timezone.now()
        end = parameters.get('date_to') or (now + ALLOWED_FUTURE_DELTA)
        # TODO: Presumably we want to search back to the project's full retention,
        #       which may be higher than 90 days in the past, but apparently
        #       `retention_window_start` can be None(?), so we need a fallback.
        start = max(
            filter(None, [
                retention_window_start,
                parameters.get('date_from'),
                now - timedelta(days=90)
            ])
        )
        assert start < end

        # TODO: It's possible `first_release` could be handled by Snuba.
        if environment is not None:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.extra(
                        where=[
                            '{} = {}'.format(
                                ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                                ds.get_sql_column(Release, 'id'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'organization'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'version'),
                            ),
                        ],
                        params=[project.organization_id, version],
                        tables=[Release._meta.db_table],
                    ),
                ),
            }).build(
                group_queryset.extra(
                    where=[
                        u'{} = {}'.format(
                            ds.get_sql_column(Group, 'id'),
                            ds.get_sql_column(GroupEnvironment, 'group_id'),
                        ),
                        u'{} = %s'.format(
                            ds.get_sql_column(GroupEnvironment, 'environment_id'),
                        ),
                    ],
                    params=[environment.id],
                    tables=[GroupEnvironment._meta.db_table],
                ),
                parameters,
            )
        else:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.filter(
                        first_release__organization_id=project.organization_id,
                        first_release__version=version,
                    ),
                ),
            }).build(
                group_queryset,
                parameters,
            )

        # maximum number of Group IDs to send down to Snuba,
        # if more Group ID candidates are found, a "bare" Snuba
        # search is performed and the result groups are then
        # post-filtered via queries to the Sentry DB
        max_pre_snuba_candidates = options.get('snuba.search.max-pre-snuba-candidates')

        # pre-filter query
        candidate_ids = None
        if max_pre_snuba_candidates and limit <= max_pre_snuba_candidates:
            candidate_ids = list(
                group_queryset.values_list('id', flat=True)[:max_pre_snuba_candidates + 1]
            )
            metrics.timing('snuba.search.num_candidates', len(candidate_ids))

            if not candidate_ids:
                # no matches could possibly be found from this point on
                metrics.incr('snuba.search.no_candidates')
                return Paginator(Group.objects.none()).get_result()
            elif len(candidate_ids) > max_pre_snuba_candidates:
                # If the pre-filter query didn't include anything to significantly
                # filter down the number of results (from 'first_release', 'query',
                # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
                # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
                # might have surpassed the `max_pre_snuba_candidates`. In this case,
                # we *don't* want to pass candidates down to Snuba, and instead we
                # want Snuba to do all the filtering/sorting it can and *then* apply
                # this queryset to the results from Snuba, which we call
                # post-filtering.
                metrics.incr('snuba.search.too_many_candidates')
                candidate_ids = None

        sort, extra_aggregations, score_fn = sort_strategies[sort_by]

        chunk_growth = options.get('snuba.search.chunk-growth-rate')
        max_chunk_size = options.get('snuba.search.max-chunk-size')
        chunk_limit = limit
        offset = 0
        num_chunks = 0

        paginator_results = Paginator(Group.objects.none()).get_result()
        result_groups = []
        result_group_ids = set()
        min_score = float('inf')
        max_score = -1

        max_time = options.get('snuba.search.max-total-chunk-time-seconds')
        time_start = time.time()

        # Do smaller searches in chunks until we have enough results
        # to answer the query (or hit the end of possible results). We do
        # this because a common case for search is to return 100 groups
        # sorted by `last_seen`, and we want to avoid returning all of
        # a project's groups and then post-sorting them all in Postgres
        # when typically the first N results will do.
        while (time.time() - time_start) < max_time:
            num_chunks += 1

            # grow the chunk size on each iteration to account for huge projects
            # and weird queries, up to a max size
            chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
            # but if we have candidate_ids always query for at least that many items
            chunk_limit = max(chunk_limit, len(candidate_ids) if candidate_ids else 0)

            # {group_id: group_score, ...}
            snuba_groups, more_results = snuba_search(
                project_id=project.id,
                environment_id=environment and environment.id,
                tags=tags,
                start=start,
                end=end,
                sort=sort,
                extra_aggregations=extra_aggregations,
                score_fn=score_fn,
                candidate_ids=candidate_ids,
                limit=chunk_limit,
                offset=offset,
                **parameters
            )
            metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))
            offset += len(snuba_groups)

            if not snuba_groups:
                break

            if candidate_ids:
                # pre-filtered candidates were passed down to Snuba,
                # so we're finished with filtering and these are the
                # only results
                result_groups = snuba_groups
            else:
                # pre-filtered candidates were *not* passed down to Snuba,
                # so we need to do post-filtering to verify Sentry DB predicates
                filtered_group_ids = group_queryset.filter(
                    id__in=[gid for gid, _ in snuba_groups]
                ).values_list('id', flat=True)

                group_to_score = dict(snuba_groups)
                for group_id in filtered_group_ids:
                    if group_id in result_group_ids:
                        # because we're doing multiple Snuba queries, which
                        # happen outside of a transaction, there is a small possibility
                        # of groups moving around in the sort scoring underneath us,
                        # so we at least want to protect against duplicates
                        continue

                    group_score = group_to_score[group_id]
                    result_group_ids.add(group_id)
                    result_groups.append((group_id, group_score))

                    # used for cursor logic
                    min_score = min(min_score, group_score)
                    max_score = max(max_score, group_score)

            # HACK: If a cursor is being used and there may be more results available
            # in Snuba, we need to detect whether the cursor's value will be
            # found in the result groups. If it isn't in the results yet we need to
            # continue querying before we hand off to the paginator to decide whether
            # enough results are found or not, otherwise the paginator will happily
            # return `limit` worth of results that don't take the cursor into account
            # at all, since it can't know there are more results available.
            # TODO: If chunked search works in practice we should probably extend the
            # paginator to throw something if the cursor value is never found, or do
            # something other than partially leak internal paginator logic up to here.
            # Or make separate Paginator implementation just for Snuba search?
            if cursor is not None \
                    and not candidate_ids \
                    and more_results:
                if cursor.is_prev and min_score < cursor.value:
                    continue
                elif not cursor.is_prev and max_score > cursor.value:
                    continue

            paginator_results = SequencePaginator(
                [(score, id) for (id, score) in result_groups],
                reverse=True,
                **paginator_options
            ).get_result(limit, cursor, count_hits=False)

            # break the query loop for one of three reasons:
            # * we started with Postgres candidates and so only do one Snuba query max
            # * the paginator is returning enough results to satisfy the query (>= the limit)
            # * there are no more groups in Snuba to post-filter
            if candidate_ids \
                    or len(paginator_results.results) >= limit \
                    or not more_results:
                break

        metrics.timing('snuba.search.num_chunks', num_chunks)

        groups = Group.objects.in_bulk(paginator_results.results)
        paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

        return paginator_results
Example #5
    def _query(self, projects, retention_window_start, group_queryset, tags, environments,
               sort_by, limit, cursor, count_hits, paginator_options, **parameters):

        # TODO: Product decision: we currently search Group.message to handle
        # the `query` parameter, because that's what we've always done. We could
        # do that search against every event in Snuba instead, but results may
        # differ.

        # TODO: It's possible `first_release` could be handled by Snuba.
        if environments is not None:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.extra(
                        where=[
                            '{} = {}'.format(
                                ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                                ds.get_sql_column(Release, 'id'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'organization'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'version'),
                            ),
                        ],
                        params=[projects[0].organization_id, version],
                        tables=[Release._meta.db_table],
                    ),
                ),
            }).build(
                group_queryset.extra(
                    where=[
                        u'{} = {}'.format(
                            ds.get_sql_column(Group, 'id'),
                            ds.get_sql_column(GroupEnvironment, 'group_id'),
                        ),
                        u'{} IN ({})'.format(
                            ds.get_sql_column(GroupEnvironment, 'environment_id'),
                            ', '.join(['%s' for e in environments])
                        ),
                    ],
                    params=[environment.id for environment in environments],
                    tables=[GroupEnvironment._meta.db_table],
                ),
                parameters,
            )
        else:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.filter(
                        first_release__organization_id=projects[0].organization_id,
                        first_release__version=version,
                    ),
                ),
            }).build(
                group_queryset,
                parameters,
            )

        now = timezone.now()
        end = parameters.get('date_to')
        if not end:
            end = now + ALLOWED_FUTURE_DELTA

            # This search is for some time window that ends with "now",
            # so if the requested sort is `date` (`last_seen`) and there
            # are no other Snuba-based search predicates, we can simply
            # return the results from Postgres.
            if cursor is None \
                    and sort_by == 'date' \
                    and not tags \
                    and not environments \
                    and not any(param in parameters for param in [
                        'age_from', 'age_to', 'last_seen_from',
                        'last_seen_to', 'times_seen', 'times_seen_lower',
                        'times_seen_upper'
                    ]):
                group_queryset = group_queryset.order_by('-last_seen')
                paginator = DateTimePaginator(group_queryset, '-last_seen', **paginator_options)
                return paginator.get_result(limit, cursor, count_hits=False)

        # TODO: Presumably we only want to search back to the project's max
        # retention date, which may be closer than 90 days in the past, but
        # apparently `retention_window_start` can be None(?), so we need a
        # fallback.
        retention_date = max(
            filter(None, [
                retention_window_start,
                now - timedelta(days=90)
            ])
        )

        start = max(
            filter(None, [
                retention_date,
                parameters.get('date_from'),
            ])
        )

        end = max([
            retention_date,
            end
        ])

        if start == retention_date and end == retention_date:
            # Both `start` and `end` must have been trimmed to `retention_date`,
            # so this entire search was against a time range that is outside of
            # retention. We'll return empty results to maintain backwards compatibility
            # with Django search (for now).
            return EMPTY_RESULT

        if start >= end:
            # TODO: This maintains backwards compatibility with Django search, but
            # in the future we should find a way to notify the user that their search
            # is invalid.
            return EMPTY_RESULT

        # num_candidates is the number of Group IDs to send down to Snuba, if
        # more Group ID candidates are found, a "bare" Snuba search is performed
        # and the result groups are then post-filtered via queries to the Sentry DB
        optimizer_enabled = options.get('snuba.search.pre-snuba-candidates-optimizer')
        if optimizer_enabled:
            missed_projects = []
            keys = [self._get_project_count_cache_key(p.id) for p in projects]

            counts_by_projects = {
                self._get_project_id_from_key(key): count for key, count in cache.get_many(keys).items()
            }

            missed_projects = {p.id for p in projects} - set(counts_by_projects.keys())

            if missed_projects:
                missing_counts = snuba.query(
                    start=max(
                        filter(None, [
                            retention_window_start,
                            now - timedelta(days=90)
                        ])
                    ),
                    end=now,
                    groupby=['project_id'],
                    filter_keys={
                        'project_id': list(missed_projects),
                    },
                    aggregations=[['uniq', 'group_id', 'group_count']],
                    referrer='search',
                )

                cache.set_many({
                    self._get_project_count_cache_key(project_id): count
                    for project_id, count in missing_counts.items()
                }, options.get('snuba.search.project-group-count-cache-time'))

                counts_by_projects.update(missing_counts)

            min_candidates = options.get('snuba.search.min-pre-snuba-candidates')
            max_candidates = options.get('snuba.search.max-pre-snuba-candidates')
            candidates_percentage = options.get('snuba.search.pre-snuba-candidates-percentage')

            num_candidates = max(
                min_candidates,
                min(
                    max_candidates,
                    sum(counts_by_projects.values()) * candidates_percentage
                )
            )
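            # Illustrative (hypothetical) values: with min_candidates=500,
            # max_candidates=10000, candidates_percentage=0.1 and a combined
            # group count of 20000 across the projects, this yields
            # max(500, min(10000, 20000 * 0.1)) = 2000 candidates.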
        else:
            num_candidates = options.get('snuba.search.min-pre-snuba-candidates')

        # pre-filter query
        candidate_ids = None
        if num_candidates and limit <= num_candidates:
            candidate_ids = list(
                group_queryset.values_list('id', flat=True)[:num_candidates + 1]
            )
            metrics.timing('snuba.search.num_candidates', len(candidate_ids))

            if not candidate_ids:
                # no matches could possibly be found from this point on
                metrics.incr('snuba.search.no_candidates', skip_internal=False)
                return EMPTY_RESULT
            elif len(candidate_ids) > num_candidates:
                # If the pre-filter query didn't include anything to significantly
                # filter down the number of results (from 'first_release', 'query',
                # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
                # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
                # might have surpassed the `num_candidates`. In this case,
                # we *don't* want to pass candidates down to Snuba, and instead we
                # want Snuba to do all the filtering/sorting it can and *then* apply
                # this queryset to the results from Snuba, which we call
                # post-filtering.
                metrics.incr('snuba.search.too_many_candidates', skip_internal=False)
                candidate_ids = None

        sort_field = sort_strategies[sort_by]
        chunk_growth = options.get('snuba.search.chunk-growth-rate')
        max_chunk_size = options.get('snuba.search.max-chunk-size')
        chunk_limit = limit
        offset = 0
        num_chunks = 0

        paginator_results = EMPTY_RESULT
        result_groups = []
        result_group_ids = set()

        max_time = options.get('snuba.search.max-total-chunk-time-seconds')
        time_start = time.time()

        # Do smaller searches in chunks until we have enough results
        # to answer the query (or hit the end of possible results). We do
        # this because a common case for search is to return 100 groups
        # sorted by `last_seen`, and we want to avoid returning all of
        # a project's groups and then post-sorting them all in Postgres
        # when typically the first N results will do.
        while (time.time() - time_start) < max_time:
            num_chunks += 1

            # grow the chunk size on each iteration to account for huge projects
            # and weird queries, up to a max size
            chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
            # but if we have candidate_ids always query for at least that many items
            chunk_limit = max(chunk_limit, len(candidate_ids) if candidate_ids else 0)

            # {group_id: group_score, ...}
            snuba_groups, more_results = snuba_search(
                start=start,
                end=end,
                project_ids=[p.id for p in projects],
                environment_ids=environments and [environment.id for environment in environments],
                tags=tags,
                sort_field=sort_field,
                cursor=cursor,
                candidate_ids=candidate_ids,
                limit=chunk_limit,
                offset=offset,
                **parameters
            )
            metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))
            offset += len(snuba_groups)

            if not snuba_groups:
                break

            if candidate_ids:
                # pre-filtered candidates were passed down to Snuba,
                # so we're finished with filtering and these are the
                # only results
                result_groups = snuba_groups
            else:
                # pre-filtered candidates were *not* passed down to Snuba,
                # so we need to do post-filtering to verify Sentry DB predicates
                filtered_group_ids = group_queryset.filter(
                    id__in=[gid for gid, _ in snuba_groups]
                ).values_list('id', flat=True)

                group_to_score = dict(snuba_groups)
                for group_id in filtered_group_ids:
                    if group_id in result_group_ids:
                        # because we're doing multiple Snuba queries, which
                        # happen outside of a transaction, there is a small possibility
                        # of groups moving around in the sort scoring underneath us,
                        # so we at least want to protect against duplicates
                        continue

                    group_score = group_to_score[group_id]
                    result_group_ids.add(group_id)
                    result_groups.append((group_id, group_score))

            paginator_results = SequencePaginator(
                [(score, id) for (id, score) in result_groups],
                reverse=True,
                **paginator_options
            ).get_result(limit, cursor, count_hits=False)

            # break the query loop for one of three reasons:
            # * we started with Postgres candidates and so only do one Snuba query max
            # * the paginator is returning enough results to satisfy the query (>= the limit)
            # * there are no more groups in Snuba to post-filter
            if candidate_ids \
                    or len(paginator_results.results) >= limit \
                    or not more_results:
                break

        # HACK: We're using the SequencePaginator to mask the complexities of going
        # back and forth between two databases. This causes a problem with pagination
        # because we're 'lying' to the SequencePaginator (it thinks it has the entire
        # result set in memory when it does not). For this reason we need to make some
        # best guesses as to whether the `prev` and `next` cursors have more results.
        if len(paginator_results.results) == limit and more_results:
            # Because we are going back and forth between DBs there is a small
            # chance that we will hand the SequencePaginator exactly `limit`
            # items. In this case the paginator will assume there are no more
            # results, so we need to override the `next` cursor's results.
            paginator_results.next.has_results = True

        if cursor is not None and (not cursor.is_prev or len(paginator_results.results) > 0):
            # If the user passed a cursor, and it isn't already a 0 result `is_prev`
            # cursor, then it's worth allowing them to go back a page to check for
            # more results.
            paginator_results.prev.has_results = True

        metrics.timing('snuba.search.num_chunks', num_chunks)

        groups = Group.objects.in_bulk(paginator_results.results)
        paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

        return paginator_results
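
The project-count cache helpers referenced above (`_get_project_count_cache_key` and `_get_project_id_from_key`) are also outside this excerpt. A minimal sketch, assuming a simple prefix-plus-project-id key format (hypothetical; the real helpers may differ):

    def _get_project_count_cache_key(self, project_id):
        # hypothetical key format; the actual implementation may differ
        return 'snuba.search:project.group_count:%s' % project_id

    def _get_project_id_from_key(self, key):
        # inverse of the key format assumed above
        return int(key.split(':')[-1])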