Example #1
    def test_prev_descending_with_new(self):
        joined = timezone.now()

        res1 = self.create_user('*****@*****.**', date_joined=joined)
        res2 = self.create_user('*****@*****.**', date_joined=joined + timedelta(seconds=1))

        queryset = User.objects.all()

        paginator = DateTimePaginator(queryset, '-date_joined')
        result1 = paginator.get_result(limit=10, cursor=None)
        assert len(result1) == 2, result1
        assert result1[0] == res2
        assert result1[1] == res1

        res3 = self.create_user('*****@*****.**', date_joined=joined + timedelta(seconds=2))
        res4 = self.create_user('*****@*****.**', date_joined=joined + timedelta(seconds=3))

        result2 = paginator.get_result(limit=10, cursor=result1.prev)
        assert len(result2) == 2, result2
        assert result2[0] == res4
        assert result2[1] == res3

        result3 = paginator.get_result(limit=10, cursor=result2.prev)
        assert len(result3) == 0, result3

        result4 = paginator.get_result(limit=10, cursor=result1.next)
        assert len(result4) == 0, result4
    def test_simple(self):
        res1 = self.create_user('*****@*****.**')
        res2 = self.create_user('*****@*****.**')
        res3 = self.create_user('*****@*****.**')

        queryset = User.objects.all()

        paginator = DateTimePaginator(queryset, 'date_joined')
        result1 = paginator.get_result(limit=1, cursor=None)
        assert len(result1) == 1, result1
        assert result1[0] == res1
        assert result1.next
        assert not result1.prev

        result2 = paginator.get_result(limit=2, cursor=result1.next)
        assert len(result2) == 2, result2
        assert result2[0] == res2
        assert result2[1] == res3
        assert not result2.next
        assert result2.prev

        # this is not yet correct
        result3 = paginator.get_result(limit=2, cursor=result2.prev)
        assert len(result3) == 1, list(result3)
        assert result3[0] == res1
        assert result3.next
        assert not result3.prev
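
A minimal sketch of the paging pattern these tests exercise, assuming the DateTimePaginator / get_result API used above; the helper name iterate_pages is made up for illustration.

from sentry.api.paginator import DateTimePaginator

def iterate_pages(queryset, order_by='date_joined', page_size=100):
    # Walk forward through every page by following the `next` cursor until it
    # reports no further results (cursors are falsy when exhausted, which is
    # what the assertions above rely on). Results behave like a sequence here,
    # as the len()/indexing calls in the tests show.
    paginator = DateTimePaginator(queryset, order_by)
    cursor = None
    while True:
        result = paginator.get_result(limit=page_size, cursor=cursor)
        for obj in result:
            yield obj
        if not result.next:
            break
        cursor = result.next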
Example #3
    def test_simple(self):
        res1 = self.create_user('*****@*****.**')
        res2 = self.create_user('*****@*****.**')
        res3 = self.create_user('*****@*****.**')

        queryset = User.objects.all()

        paginator = DateTimePaginator(queryset, 'date_joined')
        result1 = paginator.get_result(limit=1, cursor=None)
        assert len(result1) == 1, result1
        assert result1[0] == res1
        assert result1.next
        assert not result1.prev

        result2 = paginator.get_result(limit=2, cursor=result1.next)
        assert len(result2) == 2, result2
        assert result2[0] == res2
        assert result2[1] == res3
        assert not result2.next
        assert result2.prev

        # this is not yet correct
        result3 = paginator.get_result(limit=2, cursor=result2.prev)
        assert len(result3) == 1, list(result3)
        assert result3[0] == res1
        assert result3.next
        assert not result3.prev
    def test_descending(self):
        joined = timezone.now()

        res1 = self.create_user('*****@*****.**', date_joined=joined)
        res2 = self.create_user('*****@*****.**',
                                date_joined=joined + timedelta(seconds=1))
        res3 = self.create_user('*****@*****.**',
                                date_joined=joined + timedelta(seconds=2))

        queryset = User.objects.all()

        paginator = DateTimePaginator(queryset, '-date_joined')
        result1 = paginator.get_result(limit=1, cursor=None)
        assert len(result1) == 1, result1
        assert result1[0] == res3
        assert result1.next
        assert not result1.prev

        result2 = paginator.get_result(limit=2, cursor=result1.next)
        assert len(result2) == 2, result2
        assert result2[0] == res2
        assert result2[1] == res1
        assert not result2.next
        assert result2.prev

        result3 = paginator.get_result(limit=2, cursor=result2.prev)
        assert len(result3) == 1, result3
        assert result3[0] == res3
        assert result3.next
        assert not result3.prev
Example #5
    def test_descending(self):
        joined = timezone.now()

        res1 = self.create_user('*****@*****.**', date_joined=joined)
        res2 = self.create_user('*****@*****.**', date_joined=joined + timedelta(seconds=1))
        res3 = self.create_user('*****@*****.**', date_joined=joined + timedelta(seconds=2))

        queryset = User.objects.all()

        paginator = DateTimePaginator(queryset, '-date_joined')
        result1 = paginator.get_result(limit=1, cursor=None)
        assert len(result1) == 1, result1
        assert result1[0] == res3
        assert result1.next
        assert not result1.prev

        result2 = paginator.get_result(limit=2, cursor=result1.next)
        assert len(result2) == 2, result2
        assert result2[0] == res2
        assert result2[1] == res1
        assert not result2.next
        assert result2.prev

        result3 = paginator.get_result(limit=2, cursor=result2.prev)
        assert len(result3) == 1, result3
        assert result3[0] == res3
        assert result3.next
        assert not result3.prev
    def test_same_row_updated(self):
        joined = timezone.now()
        res1 = self.create_user('*****@*****.**', date_joined=joined)
        queryset = User.objects.all()

        paginator = DateTimePaginator(queryset, '-date_joined')
        result1 = paginator.get_result(limit=3, cursor=None)
        assert len(result1) == 1, result1
        assert result1[0] == res1

        # Prev page should return no results
        result2 = paginator.get_result(limit=3, cursor=result1.prev)
        assert len(result2) == 0, result2

        # If the same row has an updated join date then it should
        # show up on the prev page
        res1.update(date_joined=joined + timedelta(seconds=1))
        result3 = paginator.get_result(limit=3, cursor=result1.prev)
        assert len(result3) == 1, result3
        assert result3[0] == res1

        # Make sure updates work as expected with extra rows
        res1.update(date_joined=res1.date_joined + timedelta(seconds=1))
        res2 = self.create_user(
            '*****@*****.**',
            date_joined=res1.date_joined + timedelta(seconds=1),
        )
        res3 = self.create_user(
            '*****@*****.**',
            date_joined=res1.date_joined + timedelta(seconds=2),
        )
        res4 = self.create_user(
            '*****@*****.**',
            date_joined=res1.date_joined + timedelta(seconds=3),
        )
        result4 = paginator.get_result(limit=1, cursor=result3.prev)
        assert len(result4) == 1, result4
        assert result4[0] == res1

        result5 = paginator.get_result(limit=3, cursor=result3.prev)
        assert len(result5) == 3, result5
        assert result5[0] == res3
        assert result5[1] == res2
        assert result5[2] == res1

        result6 = paginator.get_result(limit=3, cursor=result5.prev)
        assert len(result6) == 1, result6
        assert result6[0] == res4

        res4.update(date_joined=res4.date_joined + timedelta(seconds=1))
        result7 = paginator.get_result(limit=3, cursor=result6.prev)
        assert len(result7) == 1, result7
        assert result7[0] == res4
    def test_rounding_offset(self):
        joined = timezone.now()

        res1 = self.create_user('*****@*****.**', date_joined=joined)
        res2 = self.create_user('*****@*****.**',
                                date_joined=joined + timedelta(microseconds=1))
        res3 = self.create_user('*****@*****.**',
                                date_joined=joined + timedelta(microseconds=2))
        res4 = self.create_user('*****@*****.**',
                                date_joined=joined + timedelta(microseconds=3))

        queryset = User.objects.all()

        paginator = DateTimePaginator(queryset, 'date_joined')
        result1 = paginator.get_result(limit=3, cursor=None)
        assert len(result1) == 3, result1
        assert result1[0] == res1
        assert result1[1] == res2
        assert result1[2] == res3

        result2 = paginator.get_result(limit=10, cursor=result1.next)
        assert len(result2) == 1, result2
        assert result2[0] == res4

        result3 = paginator.get_result(limit=2, cursor=result2.prev)
        assert len(result3) == 2, result3
        assert result3[0] == res2
        assert result3[1] == res3

        result4 = paginator.get_result(limit=1, cursor=result3.prev)
        assert len(result4) == 1, result4
        assert result4[0] == res1

        result5 = paginator.get_result(limit=10, cursor=result4.prev)
        assert len(result5) == 0, list(result5)
Example #8
    def test_same_row_updated(self):
        joined = timezone.now()
        res1 = self.create_user('*****@*****.**', date_joined=joined)
        queryset = User.objects.all()

        paginator = DateTimePaginator(queryset, '-date_joined')
        result1 = paginator.get_result(limit=3, cursor=None)
        assert len(result1) == 1, result1
        assert result1[0] == res1

        # Prev page should return no results
        result2 = paginator.get_result(limit=3, cursor=result1.prev)
        assert len(result2) == 0, result2

        # If the same row has an updated join date then it should
        # show up on the prev page
        res1.update(date_joined=joined + timedelta(seconds=1))
        result3 = paginator.get_result(limit=3, cursor=result1.prev)
        assert len(result3) == 1, result3
        assert result3[0] == res1

        # Make sure updates work as expected with extra rows
        res1.update(date_joined=res1.date_joined + timedelta(seconds=1))
        res2 = self.create_user(
            '*****@*****.**',
            date_joined=res1.date_joined + timedelta(seconds=1),
        )
        res3 = self.create_user(
            '*****@*****.**',
            date_joined=res1.date_joined + timedelta(seconds=2),
        )
        res4 = self.create_user(
            '*****@*****.**',
            date_joined=res1.date_joined + timedelta(seconds=3),
        )
        result4 = paginator.get_result(limit=1, cursor=result3.prev)
        assert len(result4) == 1, result4
        assert result4[0] == res1

        result5 = paginator.get_result(limit=3, cursor=result3.prev)
        assert len(result5) == 3, result5
        assert result5[0] == res3
        assert result5[1] == res2
        assert result5[2] == res1

        result6 = paginator.get_result(limit=3, cursor=result5.prev)
        assert len(result6) == 1, result6
        assert result6[0] == res4

        res4.update(date_joined=res4.date_joined + timedelta(seconds=1))
        result7 = paginator.get_result(limit=3, cursor=result6.prev)
        assert len(result7) == 1, result7
        assert result7[0] == res4
    def test_prev_descending_with_new(self):
        joined = timezone.now()

        res1 = self.create_user('*****@*****.**', date_joined=joined)
        res2 = self.create_user('*****@*****.**',
                                date_joined=joined + timedelta(seconds=1))

        queryset = User.objects.all()

        paginator = DateTimePaginator(queryset, '-date_joined')
        result1 = paginator.get_result(limit=10, cursor=None)
        assert len(result1) == 2, result1
        assert result1[0] == res2
        assert result1[1] == res1

        res3 = self.create_user('*****@*****.**',
                                date_joined=joined + timedelta(seconds=2))
        res4 = self.create_user('*****@*****.**',
                                date_joined=joined + timedelta(seconds=3))

        result2 = paginator.get_result(limit=10, cursor=result1.prev)
        assert len(result2) == 2, result2
        assert result2[0] == res4
        assert result2[1] == res3

        result3 = paginator.get_result(limit=10, cursor=result2.prev)
        assert len(result3) == 0, result3

        result4 = paginator.get_result(limit=10, cursor=result1.next)
        assert len(result4) == 0, result4
Example #10
    def test_rounding_offset(self):
        joined = timezone.now()

        res1 = self.create_user('*****@*****.**', date_joined=joined)
        res2 = self.create_user('*****@*****.**', date_joined=joined + timedelta(microseconds=1))
        res3 = self.create_user('*****@*****.**', date_joined=joined + timedelta(microseconds=2))
        res4 = self.create_user('*****@*****.**', date_joined=joined + timedelta(microseconds=3))

        queryset = User.objects.all()

        paginator = DateTimePaginator(queryset, 'date_joined')
        result1 = paginator.get_result(limit=3, cursor=None)
        assert len(result1) == 3, result1
        assert result1[0] == res1
        assert result1[1] == res2
        assert result1[2] == res3

        result2 = paginator.get_result(limit=10, cursor=result1.next)
        assert len(result2) == 1, result2
        assert result2[0] == res4

        result3 = paginator.get_result(limit=2, cursor=result2.prev)
        assert len(result3) == 2, result3
        assert result3[0] == res2
        assert result3[1] == res3

        result4 = paginator.get_result(limit=1, cursor=result3.prev)
        assert len(result4) == 1, result4
        assert result4[0] == res1

        result5 = paginator.get_result(limit=10, cursor=result4.prev)
        assert len(result5) == 0, list(result5)
Example #11
    def test_ascending(self):
        joined = timezone.now()

        # The DateTime pager only has accuracy up to 1000th of a second.
        # Everything can't be added within less than 10 microseconds of each
        # other. This is handled by the pager (see test_rounding_offset), but
        # this case shouldn't rely on it.
        res1 = self.create_user('*****@*****.**', date_joined=joined)
        res2 = self.create_user('*****@*****.**', date_joined=joined + timedelta(seconds=1))
        res3 = self.create_user('*****@*****.**', date_joined=joined + timedelta(seconds=2))
        res4 = self.create_user('*****@*****.**', date_joined=joined + timedelta(seconds=3))

        queryset = User.objects.all()

        paginator = DateTimePaginator(queryset, 'date_joined')
        result1 = paginator.get_result(limit=2, cursor=None)
        assert len(result1) == 2, result1
        assert result1[0] == res1
        assert result1[1] == res2
        assert result1.next
        assert not result1.prev

        result2 = paginator.get_result(limit=2, cursor=result1.next)
        assert len(result2) == 2, result2
        assert result2[0] == res3
        assert result2[1] == res4
        assert not result2.next
        assert result2.prev

        result3 = paginator.get_result(limit=1, cursor=result2.prev)
        assert len(result3) == 1, result3
        assert result3[0] == res2
        assert result3.next
        assert result3.prev

        result4 = paginator.get_result(limit=1, cursor=result3.prev)
        assert len(result4) == 1, result4
        assert result4[0] == res1
        assert result4.next
        assert not result4.prev
    def get_tag_value_paginator(self, project_id, environment_id, key, query=None,
                                order_by='-last_seen'):
        from sentry.api.paginator import DateTimePaginator

        queryset = models.TagValue.objects.filter(
            project_id=project_id,
            key=key,
        )

        if query:
            queryset = queryset.filter(value__contains=query)

        return DateTimePaginator(
            queryset=queryset,
            order_by=order_by,
            on_results=lambda results: map(transformers[models.TagValue], results)
        )
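
A hypothetical call site for the helper above; backend, project and environment are placeholder objects, not part of the snippet itself.

# Hypothetical usage; `backend`, `project` and `environment` are placeholders.
paginator = backend.get_tag_value_paginator(
    project_id=project.id,
    environment_id=environment.id,
    key='browser',
    query='Chrome',  # optional substring filter on the tag value
)
# Newest values first, because of the '-last_seen' default ordering.
result = paginator.get_result(limit=100, cursor=None)
next_page = paginator.get_result(limit=100, cursor=result.next)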
Example #14
    def _query(self, project, retention_window_start, group_queryset, tags, environment,
               sort_by, limit, cursor, count_hits, paginator_options, **parameters):

        # TODO: Product decision: we currently search Group.message to handle
        # the `query` parameter, because that's what we've always done. We could
        # do that search against every event in Snuba instead, but results may
        # differ.

        # TODO: It's possible `first_release` could be handled by Snuba.
        if environment is not None:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.extra(
                        where=[
                            '{} = {}'.format(
                                ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                                ds.get_sql_column(Release, 'id'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'organization'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'version'),
                            ),
                        ],
                        params=[project.organization_id, version],
                        tables=[Release._meta.db_table],
                    ),
                ),
            }).build(
                group_queryset.extra(
                    where=[
                        u'{} = {}'.format(
                            ds.get_sql_column(Group, 'id'),
                            ds.get_sql_column(GroupEnvironment, 'group_id'),
                        ),
                        u'{} = %s'.format(
                            ds.get_sql_column(GroupEnvironment, 'environment_id'),
                        ),
                    ],
                    params=[environment.id],
                    tables=[GroupEnvironment._meta.db_table],
                ),
                parameters,
            )
        else:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.filter(
                        first_release__organization_id=project.organization_id,
                        first_release__version=version,
                    ),
                ),
            }).build(
                group_queryset,
                parameters,
            )

        now = timezone.now()
        # TODO: Presumably we want to search back to the project's full retention,
        #       which may be more than 90 days in the past, but apparently
        #       `retention_window_start` can be None(?), so we need a fallback.
        start = max(
            filter(None, [
                retention_window_start,
                parameters.get('date_from'),
                now - timedelta(days=90)
            ])
        )

        end = parameters.get('date_to')
        if not end:
            end = now + ALLOWED_FUTURE_DELTA

            # This search is for some time window that ends with "now",
            # so if the requested sort is `date` (`last_seen`) and there
            # are no other Snuba-based search predicates, we can simply
            # return the results from Postgres.
            if sort_by == 'date' \
                    and not tags \
                    and not environment \
                    and not any(param in parameters for param in [
                        'age_from', 'age_to', 'last_seen_from',
                        'last_seen_to', 'times_seen', 'times_seen_lower',
                        'times_seen_upper'
                    ]):
                group_queryset = group_queryset.order_by('-last_seen')
                paginator = DateTimePaginator(group_queryset, '-last_seen', **paginator_options)
                return paginator.get_result(limit, cursor, count_hits=count_hits)

        assert start < end

        # maximum number of Group IDs to send down to Snuba;
        # if more Group ID candidates are found, a "bare" Snuba
        # search is performed and the result groups are then
        # post-filtered via queries to the Sentry DB
        max_pre_snuba_candidates = options.get('snuba.search.max-pre-snuba-candidates')

        # pre-filter query
        candidate_ids = None
        if max_pre_snuba_candidates and limit <= max_pre_snuba_candidates:
            candidate_ids = list(
                group_queryset.values_list('id', flat=True)[:max_pre_snuba_candidates + 1]
            )
            metrics.timing('snuba.search.num_candidates', len(candidate_ids))

            if not candidate_ids:
                # no matches could possibly be found from this point on
                metrics.incr('snuba.search.no_candidates')
                return Paginator(Group.objects.none()).get_result()
            elif len(candidate_ids) > max_pre_snuba_candidates:
                # If the pre-filter query didn't include anything to significantly
                # filter down the number of results (from 'first_release', 'query',
                # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
                # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
                # might have surpassed the `max_pre_snuba_candidates`. In this case,
                # we *don't* want to pass candidates down to Snuba, and instead we
                # want Snuba to do all the filtering/sorting it can and *then* apply
                # this queryset to the results from Snuba, which we call
                # post-filtering.
                metrics.incr('snuba.search.too_many_candidates')
                candidate_ids = None

        sort, extra_aggregations, score_fn = sort_strategies[sort_by]

        chunk_growth = options.get('snuba.search.chunk-growth-rate')
        max_chunk_size = options.get('snuba.search.max-chunk-size')
        chunk_limit = limit
        offset = 0
        num_chunks = 0

        paginator_results = Paginator(Group.objects.none()).get_result()
        result_groups = []
        result_group_ids = set()
        min_score = float('inf')
        max_score = -1

        max_time = options.get('snuba.search.max-total-chunk-time-seconds')
        time_start = time.time()

        # Do smaller searches in chunks until we have enough results
        # to answer the query (or hit the end of possible results). We do
        # this because a common case for search is to return 100 groups
        # sorted by `last_seen`, and we want to avoid returning all of
        # a project's groups and then post-sorting them all in Postgres
        # when typically the first N results will do.
        while (time.time() - time_start) < max_time:
            num_chunks += 1

            # grow the chunk size on each iteration to account for huge projects
            # and weird queries, up to a max size
            chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
            # but if we have candidate_ids always query for at least that many items
            chunk_limit = max(chunk_limit, len(candidate_ids) if candidate_ids else 0)

            # {group_id: group_score, ...}
            snuba_groups, more_results = snuba_search(
                start=start,
                end=end,
                project_id=project.id,
                environment_id=environment and environment.id,
                tags=tags,
                sort=sort,
                extra_aggregations=extra_aggregations,
                score_fn=score_fn,
                candidate_ids=candidate_ids,
                limit=chunk_limit,
                offset=offset,
                **parameters
            )
            metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))
            offset += len(snuba_groups)

            if not snuba_groups:
                break

            if candidate_ids:
                # pre-filtered candidates were passed down to Snuba,
                # so we're finished with filtering and these are the
                # only results
                result_groups = snuba_groups
            else:
                # pre-filtered candidates were *not* passed down to Snuba,
                # so we need to do post-filtering to verify Sentry DB predicates
                filtered_group_ids = group_queryset.filter(
                    id__in=[gid for gid, _ in snuba_groups]
                ).values_list('id', flat=True)

                group_to_score = dict(snuba_groups)
                for group_id in filtered_group_ids:
                    if group_id in result_group_ids:
                        # because we're doing multiple Snuba queries, which
                        # happen outside of a transaction, there is a small possibility
                        # of groups moving around in the sort scoring underneath us,
                        # so we at least want to protect against duplicates
                        continue

                    group_score = group_to_score[group_id]
                    result_group_ids.add(group_id)
                    result_groups.append((group_id, group_score))

                    # used for cursor logic
                    min_score = min(min_score, group_score)
                    max_score = max(max_score, group_score)

            # HACK: If a cursor is being used and there may be more results available
            # in Snuba, we need to detect whether the cursor's value will be
            # found in the result groups. If it isn't in the results yet we need to
            # continue querying before we hand off to the paginator to decide whether
            # enough results are found or not, otherwise the paginator will happily
            # return `limit` worth of results that don't take the cursor into account
            # at all, since it can't know there are more results available.
            # TODO: If chunked search works in practice we should probably extend the
            # paginator to throw something if the cursor value is never found, or do
            # something other than partially leak internal paginator logic up to here.
            # Or make separate Paginator implementation just for Snuba search?
            if cursor is not None \
                    and not candidate_ids \
                    and more_results:
                if cursor.is_prev and min_score < cursor.value:
                    continue
                elif not cursor.is_prev and max_score > cursor.value:
                    continue

            paginator_results = SequencePaginator(
                [(score, id) for (id, score) in result_groups],
                reverse=True,
                **paginator_options
            ).get_result(limit, cursor, count_hits=False)

            # break the query loop for one of three reasons:
            # * we started with Postgres candidates and so only do one Snuba query max
            # * the paginator is returning enough results to satisfy the query (>= the limit)
            # * there are no more groups in Snuba to post-filter
            if candidate_ids \
                    or len(paginator_results.results) >= limit \
                    or not more_results:
                break

        metrics.timing('snuba.search.num_chunks', num_chunks)

        groups = Group.objects.in_bulk(paginator_results.results)
        paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

        return paginator_results
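
The chunked loop above widens each successive Snuba query geometrically, capped by a maximum chunk size. A rough sketch of that growth, with assumed values for the two tuning options (the candidate_ids floor is omitted here):

chunk_growth = 1.5      # assumed value for 'snuba.search.chunk-growth-rate'
max_chunk_size = 2000   # assumed value for 'snuba.search.max-chunk-size'
chunk_limit = 100       # starts at the requested `limit`

sizes = []
for _ in range(5):
    chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
    sizes.append(chunk_limit)
print(sizes)  # [150, 225, 337, 505, 757]: later chunks fetch more rows per query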
Example #15
    def _query(self, projects, retention_window_start, group_queryset, tags, environments,
               sort_by, limit, cursor, count_hits, paginator_options, **parameters):

        # TODO: Product decision: we currently search Group.message to handle
        # the `query` parameter, because that's what we've always done. We could
        # do that search against every event in Snuba instead, but results may
        # differ.

        # TODO: It's possible `first_release` could be handled by Snuba.
        if environments is not None:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.extra(
                        where=[
                            '{} = {}'.format(
                                ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                                ds.get_sql_column(Release, 'id'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'organization'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'version'),
                            ),
                        ],
                        params=[projects[0].organization_id, version],
                        tables=[Release._meta.db_table],
                    ),
                ),
            }).build(
                group_queryset.extra(
                    where=[
                        u'{} = {}'.format(
                            ds.get_sql_column(Group, 'id'),
                            ds.get_sql_column(GroupEnvironment, 'group_id'),
                        ),
                        u'{} IN ({})'.format(
                            ds.get_sql_column(GroupEnvironment, 'environment_id'),
                            ', '.join(['%s' for e in environments])
                        ),
                    ],
                    params=[environment.id for environment in environments],
                    tables=[GroupEnvironment._meta.db_table],
                ),
                parameters,
            )
        else:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.filter(
                        first_release__organization_id=projects[0].organization_id,
                        first_release__version=version,
                    ),
                ),
            }).build(
                group_queryset,
                parameters,
            )

        now = timezone.now()
        end = parameters.get('date_to')
        if not end:
            end = now + ALLOWED_FUTURE_DELTA

            # This search is for some time window that ends with "now",
            # so if the requested sort is `date` (`last_seen`) and there
            # are no other Snuba-based search predicates, we can simply
            # return the results from Postgres.
            if cursor is None \
                    and sort_by == 'date' \
                    and not tags \
                    and not environments \
                    and not any(param in parameters for param in [
                        'age_from', 'age_to', 'last_seen_from',
                        'last_seen_to', 'times_seen', 'times_seen_lower',
                        'times_seen_upper'
                    ]):
                group_queryset = group_queryset.order_by('-last_seen')
                paginator = DateTimePaginator(group_queryset, '-last_seen', **paginator_options)
                return paginator.get_result(limit, cursor, count_hits=False)

        # TODO: Presumably we only want to search back to the project's max
        # retention date, which may be closer than 90 days in the past, but
        # apparently `retention_window_start` can be None(?), so we need a
        # fallback.
        retention_date = max(
            filter(None, [
                retention_window_start,
                now - timedelta(days=90)
            ])
        )

        start = max(
            filter(None, [
                retention_date,
                parameters.get('date_from'),
            ])
        )

        end = max([
            retention_date,
            end
        ])

        if start == retention_date and end == retention_date:
            # Both `start` and `end` must have been trimmed to `retention_date`,
            # so this entire search was against a time range that is outside of
            # retention. We'll return empty results to maintain backwards compatibility
            # with Django search (for now).
            return EMPTY_RESULT

        if start >= end:
            # TODO: This maintains backwards compatibility with Django search, but
            # in the future we should find a way to notify the user that their search
            # is invalid.
            return EMPTY_RESULT

        # num_candidates is the number of Group IDs to send down to Snuba; if
        # more Group ID candidates are found, a "bare" Snuba search is performed
        # and the result groups are then post-filtered via queries to the Sentry DB
        optimizer_enabled = options.get('snuba.search.pre-snuba-candidates-optimizer')
        if optimizer_enabled:
            missed_projects = []
            keys = [self._get_project_count_cache_key(p.id) for p in projects]

            counts_by_projects = {
                self._get_project_id_from_key(key): count for key, count in cache.get_many(keys).items()
            }

            missed_projects = {p.id for p in projects} - set(counts_by_projects.keys())

            if missed_projects:
                missing_counts = snuba.query(
                    start=max(
                        filter(None, [
                            retention_window_start,
                            now - timedelta(days=90)
                        ])
                    ),
                    end=now,
                    groupby=['project_id'],
                    filter_keys={
                        'project_id': list(missed_projects),
                    },
                    aggregations=[['uniq', 'group_id', 'group_count']],
                    referrer='search',
                )

                cache.set_many({
                    self._get_project_count_cache_key(project_id): count
                    for project_id, count in missing_counts.items()
                }, options.get('snuba.search.project-group-count-cache-time'))

                counts_by_projects.update(missing_counts)

            min_candidates = options.get('snuba.search.min-pre-snuba-candidates')
            max_candidates = options.get('snuba.search.max-pre-snuba-candidates')
            candidates_percentage = options.get('snuba.search.pre-snuba-candidates-percentage')

            num_candidates = max(
                min_candidates,
                min(
                    max_candidates,
                    sum(counts_by_projects.values()) * candidates_percentage
                )
            )
        else:
            num_candidates = options.get('snuba.search.min-pre-snuba-candidates')

        # pre-filter query
        candidate_ids = None
        if num_candidates and limit <= num_candidates:
            candidate_ids = list(
                group_queryset.values_list('id', flat=True)[:num_candidates + 1]
            )
            metrics.timing('snuba.search.num_candidates', len(candidate_ids))

            if not candidate_ids:
                # no matches could possibly be found from this point on
                metrics.incr('snuba.search.no_candidates', skip_internal=False)
                return EMPTY_RESULT
            elif len(candidate_ids) > num_candidates:
                # If the pre-filter query didn't include anything to significantly
                # filter down the number of results (from 'first_release', 'query',
                # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
                # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
                # might have surpassed the `num_candidates`. In this case,
                # we *don't* want to pass candidates down to Snuba, and instead we
                # want Snuba to do all the filtering/sorting it can and *then* apply
                # this queryset to the results from Snuba, which we call
                # post-filtering.
                metrics.incr('snuba.search.too_many_candidates', skip_internal=False)
                candidate_ids = None

        sort_field = sort_strategies[sort_by]
        chunk_growth = options.get('snuba.search.chunk-growth-rate')
        max_chunk_size = options.get('snuba.search.max-chunk-size')
        chunk_limit = limit
        offset = 0
        num_chunks = 0

        paginator_results = EMPTY_RESULT
        result_groups = []
        result_group_ids = set()

        max_time = options.get('snuba.search.max-total-chunk-time-seconds')
        time_start = time.time()

        # Do smaller searches in chunks until we have enough results
        # to answer the query (or hit the end of possible results). We do
        # this because a common case for search is to return 100 groups
        # sorted by `last_seen`, and we want to avoid returning all of
        # a project's groups and then post-sorting them all in Postgres
        # when typically the first N results will do.
        while (time.time() - time_start) < max_time:
            num_chunks += 1

            # grow the chunk size on each iteration to account for huge projects
            # and weird queries, up to a max size
            chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
            # but if we have candidate_ids always query for at least that many items
            chunk_limit = max(chunk_limit, len(candidate_ids) if candidate_ids else 0)

            # {group_id: group_score, ...}
            snuba_groups, more_results = snuba_search(
                start=start,
                end=end,
                project_ids=[p.id for p in projects],
                environment_ids=environments and [environment.id for environment in environments],
                tags=tags,
                sort_field=sort_field,
                cursor=cursor,
                candidate_ids=candidate_ids,
                limit=chunk_limit,
                offset=offset,
                **parameters
            )
            metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))
            offset += len(snuba_groups)

            if not snuba_groups:
                break

            if candidate_ids:
                # pre-filtered candidates were passed down to Snuba,
                # so we're finished with filtering and these are the
                # only results
                result_groups = snuba_groups
            else:
                # pre-filtered candidates were *not* passed down to Snuba,
                # so we need to do post-filtering to verify Sentry DB predicates
                filtered_group_ids = group_queryset.filter(
                    id__in=[gid for gid, _ in snuba_groups]
                ).values_list('id', flat=True)

                group_to_score = dict(snuba_groups)
                for group_id in filtered_group_ids:
                    if group_id in result_group_ids:
                        # because we're doing multiple Snuba queries, which
                        # happen outside of a transaction, there is a small possibility
                        # of groups moving around in the sort scoring underneath us,
                        # so we at least want to protect against duplicates
                        continue

                    group_score = group_to_score[group_id]
                    result_group_ids.add(group_id)
                    result_groups.append((group_id, group_score))

            paginator_results = SequencePaginator(
                [(score, id) for (id, score) in result_groups],
                reverse=True,
                **paginator_options
            ).get_result(limit, cursor, count_hits=False)

            # break the query loop for one of three reasons:
            # * we started with Postgres candidates and so only do one Snuba query max
            # * the paginator is returning enough results to satisfy the query (>= the limit)
            # * there are no more groups in Snuba to post-filter
            if candidate_ids \
                    or len(paginator_results.results) >= limit \
                    or not more_results:
                break

        # HACK: We're using the SequencePaginator to mask the complexities of going
        # back and forth between two databases. This causes a problem with pagination
        # because we're 'lying' to the SequencePaginator (it thinks it has the entire
        # result set in memory when it does not). For this reason we need to make some
        # best guesses as to whether the `prev` and `next` cursors have more results.
        if len(paginator_results.results) == limit and more_results:
            # Because we are going back and forth between DBs there is a small
            # chance that we will hand the SequencePaginator exactly `limit`
            # items. In this case the paginator will assume there are no more
            # results, so we need to override the `next` cursor's results.
            paginator_results.next.has_results = True

        if cursor is not None and (not cursor.is_prev or len(paginator_results.results) > 0):
            # If the user passed a cursor, and it isn't already a 0 result `is_prev`
            # cursor, then it's worth allowing them to go back a page to check for
            # more results.
            paginator_results.prev.has_results = True

        metrics.timing('snuba.search.num_chunks', num_chunks)

        groups = Group.objects.in_bulk(paginator_results.results)
        paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

        return paginator_results
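
A small worked example of the candidate-count clamp computed in the optimizer branch above, using assumed option values:

min_candidates = 500          # assumed 'snuba.search.min-pre-snuba-candidates'
max_candidates = 5000         # assumed 'snuba.search.max-pre-snuba-candidates'
candidates_percentage = 0.1   # assumed 'snuba.search.pre-snuba-candidates-percentage'

total_groups = 20000          # sum of the per-project group counts
num_candidates = max(
    min_candidates,
    min(max_candidates, total_groups * candidates_percentage),
)
# 20000 * 0.1 = 2000.0, which lies inside [500, 5000], so roughly 2000
# candidate IDs would be pre-fetched from Postgres.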
Example #16
    def query(
        self,
        projects,
        retention_window_start,
        group_queryset,
        environments,
        sort_by,
        limit,
        cursor,
        count_hits,
        paginator_options,
        search_filters,
        date_from,
        date_to,
    ):

        now = timezone.now()
        end = None
        end_params = filter(
            None,
            [date_to, get_search_filter(search_filters, "date", "<")])
        if end_params:
            end = min(end_params)

        if not end:
            end = now + ALLOWED_FUTURE_DELTA

            # This search is for some time window that ends with "now",
            # so if the requested sort is `date` (`last_seen`) and there
            # are no other Snuba-based search predicates, we can simply
            # return the results from Postgres.
            if (cursor is None and sort_by == "date" and not environments and
                    # This handles tags and date parameters for search filters.
                    not [
                        sf for sf in search_filters
                        if sf.key.name not in issue_only_fields.union(["date"])
                    ]):
                group_queryset = group_queryset.order_by("-last_seen")
                paginator = DateTimePaginator(group_queryset, "-last_seen",
                                              **paginator_options)
                # When it's a simple django-only search, we count_hits like normal
                return paginator.get_result(limit,
                                            cursor,
                                            count_hits=count_hits)

        # TODO: Presumably we only want to search back to the project's max
        # retention date, which may be closer than 90 days in the past, but
        # apparently `retention_window_start` can be None(?), so we need a
        # fallback.
        retention_date = max(
            filter(None, [retention_window_start, now - timedelta(days=90)]))

        # TODO: We should try and consolidate all this logic together a little
        # better, maybe outside the backend. Should be easier once we're on
        # just the new search filters
        start_params = [
            date_from, retention_date,
            get_search_filter(search_filters, "date", ">")
        ]
        start = max(filter(None, start_params))

        end = max([retention_date, end])

        if start == retention_date and end == retention_date:
            # Both `start` and `end` must have been trimmed to `retention_date`,
            # so this entire search was against a time range that is outside of
            # retention. We'll return empty results to maintain backwards compatibility
            # with Django search (for now).
            return EMPTY_RESULT

        if start >= end:
            # TODO: This maintains backwards compatibility with Django search, but
            # in the future we should find a way to notify the user that their search
            # is invalid.
            return EMPTY_RESULT

        # Here we check if all the django filters reduce the set of groups down
        # to something that we can send down to Snuba in a `group_id IN (...)`
        # clause.
        max_candidates = options.get("snuba.search.max-pre-snuba-candidates")
        too_many_candidates = False
        candidate_ids = list(
            group_queryset.values_list("id", flat=True)[:max_candidates + 1])
        metrics.timing("snuba.search.num_candidates", len(candidate_ids))
        if not candidate_ids:
            # no matches could possibly be found from this point on
            metrics.incr("snuba.search.no_candidates", skip_internal=False)
            return EMPTY_RESULT
        elif len(candidate_ids) > max_candidates:
            # If the pre-filter query didn't include anything to significantly
            # filter down the number of results (from 'first_release', 'query',
            # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
            # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
            # might have surpassed the `max_candidates`. In this case,
            # we *don't* want to pass candidates down to Snuba, and instead we
            # want Snuba to do all the filtering/sorting it can and *then* apply
            # this queryset to the results from Snuba, which we call
            # post-filtering.
            metrics.incr("snuba.search.too_many_candidates",
                         skip_internal=False)
            too_many_candidates = True
            candidate_ids = []

        sort_field = sort_strategies[sort_by]
        chunk_growth = options.get("snuba.search.chunk-growth-rate")
        max_chunk_size = options.get("snuba.search.max-chunk-size")
        chunk_limit = limit
        offset = 0
        num_chunks = 0
        hits = None

        paginator_results = EMPTY_RESULT
        result_groups = []
        result_group_ids = set()

        max_time = options.get("snuba.search.max-total-chunk-time-seconds")
        time_start = time.time()

        if count_hits and (too_many_candidates or cursor is not None):
            # If we had too many candidates to reasonably pass down to snuba,
            # or if we have a cursor that bisects the overall result set (such
            # that our query only sees results on one side of the cursor) then
            # we need an alternative way to figure out the total hits that this
            # query has.

            # To do this, we get a sample of groups matching the snuba side of
            # the query, and see how many of those pass the post-filter in
            # postgres. This should give us an estimate of the total number of
            # snuba matches that will be overall matches, which we can use to
            # get an estimate for X-Hits.

            # The sampling is not simple random sampling. It will return *all*
            # matching groups if there are less than N groups matching the
            # query, or it will return a random, deterministic subset of N of
            # the groups if there are more than N overall matches. This means
            # that the "estimate" is actually an accurate result when there are
            # less than N matching groups.

            # The number of samples required to achieve a certain error bound
            # with a certain confidence interval can be calculated from a
            # rearrangement of the normal approximation (Wald) confidence
            # interval formula:
            #
            # https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval
            #
            # Effectively if we want the estimate to be within +/- 10% of the
            # real value with 95% confidence, we would need (1.96^2 * p*(1-p))
            # / 0.1^2 samples. With a starting assumption of p=0.5 (this
            # requires the most samples) we would need 96 samples to achieve
            # +/-10% @ 95% confidence.
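            #
            # A quick check of that arithmetic (assumed z=1.96, p=0.5, error=0.1):
            #   n = (1.96 ** 2 * 0.5 * 0.5) / 0.1 ** 2 = 0.9604 / 0.01 ~= 96 samples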

            sample_size = options.get("snuba.search.hits-sample-size")
            snuba_groups, snuba_total = snuba_search(
                start=start,
                end=end,
                project_ids=[p.id for p in projects],
                environment_ids=environments
                and [environment.id for environment in environments],
                sort_field=sort_field,
                limit=sample_size,
                offset=0,
                get_sample=True,
                search_filters=search_filters,
            )
            snuba_count = len(snuba_groups)
            if snuba_count == 0:
                return EMPTY_RESULT
            else:
                filtered_count = group_queryset.filter(
                    id__in=[gid for gid, _ in snuba_groups]).count()

                hit_ratio = filtered_count / float(snuba_count)
                hits = int(hit_ratio * snuba_total)
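                # For illustration (assumed numbers): if 80 of the 96 sampled
                # groups survive the Postgres filter and Snuba reported 12000
                # total matches, hits ~= int(80 / 96.0 * 12000) = 10000.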

        # Do smaller searches in chunks until we have enough results
        # to answer the query (or hit the end of possible results). We do
        # this because a common case for search is to return 100 groups
        # sorted by `last_seen`, and we want to avoid returning all of
        # a project's groups and then post-sorting them all in Postgres
        # when typically the first N results will do.
        while (time.time() - time_start) < max_time:
            num_chunks += 1

            # grow the chunk size on each iteration to account for huge projects
            # and weird queries, up to a max size
            chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
            # but if we have candidate_ids always query for at least that many items
            chunk_limit = max(chunk_limit, len(candidate_ids))

            # {group_id: group_score, ...}
            snuba_groups, total = snuba_search(
                start=start,
                end=end,
                project_ids=[p.id for p in projects],
                environment_ids=environments
                and [environment.id for environment in environments],
                sort_field=sort_field,
                cursor=cursor,
                candidate_ids=candidate_ids,
                limit=chunk_limit,
                offset=offset,
                search_filters=search_filters,
            )
            metrics.timing("snuba.search.num_snuba_results", len(snuba_groups))
            count = len(snuba_groups)
            more_results = count >= limit and (offset + limit) < total
            offset += len(snuba_groups)

            if not snuba_groups:
                break

            if candidate_ids:
                # pre-filtered candidates were passed down to Snuba, so we're
                # finished with filtering and these are the only results. Note
                # that because we set the chunk size to at least the size of
                # the candidate_ids, we know we got all of them (ie there are
                # no more chunks after the first)
                result_groups = snuba_groups
                if count_hits and hits is None:
                    hits = len(snuba_groups)
            else:
                # pre-filtered candidates were *not* passed down to Snuba,
                # so we need to do post-filtering to verify Sentry DB predicates
                filtered_group_ids = group_queryset.filter(
                    id__in=[gid for gid, _ in snuba_groups]
                ).values_list("id", flat=True)

                group_to_score = dict(snuba_groups)
                for group_id in filtered_group_ids:
                    if group_id in result_group_ids:
                        # because we're doing multiple Snuba queries, which
                        # happen outside of a transaction, there is a small possibility
                        # of groups moving around in the sort scoring underneath us,
                        # so we at least want to protect against duplicates
                        continue

                    group_score = group_to_score[group_id]
                    result_group_ids.add(group_id)
                    result_groups.append((group_id, group_score))

            # TODO do we actually have to rebuild this SequencePaginator every time
            # or can we just make it after we've broken out of the loop?
            paginator_results = SequencePaginator(
                [(score, id) for (id, score) in result_groups],
                reverse=True,
                **paginator_options).get_result(limit, cursor, known_hits=hits)

            # break the query loop for one of three reasons:
            # * we started with Postgres candidates and so only do one Snuba query max
            # * the paginator is returning enough results to satisfy the query (>= the limit)
            # * there are no more groups in Snuba to post-filter
            if candidate_ids or len(
                    paginator_results.results) >= limit or not more_results:
                break

        # HACK: We're using the SequencePaginator to mask the complexities of going
        # back and forth between two databases. This causes a problem with pagination
        # because we're 'lying' to the SequencePaginator (it thinks it has the entire
        # result set in memory when it does not). For this reason we need to make some
        # best guesses as to whether the `prev` and `next` cursors have more results.

        if len(paginator_results.results) == limit and more_results:
            # Because we are going back and forth between DBs there is a small
            # chance that we will hand the SequencePaginator exactly `limit`
            # items. In this case the paginator will assume there are no more
            # results, so we need to override the `next` cursor's results.
            paginator_results.next.has_results = True

        if cursor is not None and (not cursor.is_prev
                                   or len(paginator_results.results) > 0):
            # If the user passed a cursor, and it isn't already a 0 result `is_prev`
            # cursor, then it's worth allowing them to go back a page to check for
            # more results.
            paginator_results.prev.has_results = True

        metrics.timing("snuba.search.num_chunks", num_chunks)

        groups = Group.objects.in_bulk(paginator_results.results)
        paginator_results.results = [
            groups[k] for k in paginator_results.results if k in groups
        ]

        return paginator_results
Example #17
0
    def get(self, request, organization, member):
        # There is an activity record created for both sides of the unmerge
        # operation, so we only need to include one of them here to avoid
        # showing the same entry twice.
        base_qs = Activity.objects.exclude(
            type=Activity.UNMERGE_SOURCE).values_list("id", flat=True)

        # To make this query efficient, we have to hammer it into a weird format. This table is
        # extremely large and if we are making a query across many projects, in a lot of cases
        # Postgres decides that the best query plan is to iterate backwards on the datetime index.
        # This means it does something close to a table scan to get the results it wants - this can
        # be the case even for orgs with less than a page of activity rows, and often results in
        # queries that take > 30s, which get killed by stomper.
        # To convince Postgres to use the index on `(project_id, datetime)`, it basically needs to
        # see queries that filter on a single project id and then order by datetime. So we replicate
        # the query for every project and UNION ALL them together to get the candidate set of rows.
        # Then we sort these and return the final result. Convoluted, but it improves the query a
        # lot.
        # To make this work well with pagination, we have to also apply the pagination queries to
        # the subqueries.
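        # Roughly, the SQL this produces has the following shape (illustrative
        # only; names are simplified and the cursor predicate is omitted):
        #
        #   SELECT ... FROM activity
        #   WHERE id IN (
        #       (SELECT id FROM activity WHERE project_id = 1
        #        ORDER BY datetime DESC LIMIT <max_limit>)
        #       UNION ALL
        #       (SELECT id FROM activity WHERE project_id = 2
        #        ORDER BY datetime DESC LIMIT <max_limit>)
        #       -- ... one branch per member project ...
        #   )
        #   ORDER BY datetime DESC LIMIT <page size>;
        #
        # Each branch filters on a single project_id and orders by datetime, which
        # is exactly the access pattern the (project_id, datetime) index serves.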
        cursor = self.get_cursor_from_request(request)
        paginator = DateTimePaginator(base_qs, order_by="-datetime")
        if cursor is not None and cursor.value:
            cursor_value = paginator.value_from_cursor(cursor)
        else:
            cursor_value = 0
        base_qs = paginator.build_queryset(cursor_value, False)

        project_ids = list(
            Project.objects.filter(
                organization=organization,
                teams__in=OrganizationMemberTeam.objects.filter(
                    organizationmember=member).values("team"),
            ).values_list("id", flat=True))

        union_qs = Activity.objects.none()
        if project_ids:
            union_qs = reduce(
                lambda qs1, qs2: qs1.union(qs2, all=True),
                [
                    base_qs.filter(project_id=project)[:paginator.max_limit]
                    for project in project_ids
                ],
            )

        # We do `select_related` here to make the unions less heavy. This way we only join these
        # tables for the rows we actually want.
        queryset = Activity.objects.filter(
            id__in=union_qs[:paginator.max_limit]).select_related(
                "project", "group", "user")

        return self.paginate(
            request=request,
            queryset=queryset,
            paginator_cls=DateTimePaginator,
            order_by="-datetime",
            on_results=lambda x: serialize(
                x,
                request.user,
                OrganizationActivitySerializer(
                    environment_func=self._get_environment_func(
                        request, organization.id)),
            ),
        )
Example #18
0
def inbox_search(
    projects: Sequence[Project],
    environments: Optional[Sequence[Environment]] = None,
    limit: int = 100,
    cursor: Optional[Cursor] = None,
    count_hits: bool = False,
    search_filters: Optional[Sequence[SearchFilter]] = None,
    date_from: Optional[datetime] = None,
    date_to: Optional[datetime] = None,
    max_hits: Optional[int] = None,
) -> CursorResult:
    now: datetime = timezone.now()
    end: Optional[datetime] = None
    end_params: List[datetime] = [
        _f for _f in [date_to,
                      get_search_filter(search_filters, "date", "<")] if _f
    ]
    if end_params:
        end = min(end_params)

    end = end if end else now + ALLOWED_FUTURE_DELTA

    # We only want to search back a week at most, since that's the oldest an inbox
    # row can be.
    earliest_date = now - timedelta(days=7)
    start_params = [
        date_from, earliest_date,
        get_search_filter(search_filters, "date", ">")
    ]
    start = max([_f for _f in start_params if _f])
    end = max([earliest_date, end])

    if start >= end:
        return Paginator(Group.objects.none()).get_result()

    # Make sure search terms are valid
    invalid_search_terms = [
        str(sf) for sf in search_filters
        if sf.key.name not in allowed_inbox_search_terms
    ]
    if invalid_search_terms:
        raise InvalidSearchQuery(
            f"Invalid search terms for 'inbox' search: {invalid_search_terms}")

    # Make sure this is an inbox search
    if not get_search_filter(search_filters, "for_review", "="):
        raise InvalidSearchQuery(
            "Sort key 'inbox' only supported for inbox search")

    if get_search_filter(search_filters, "status",
                         "=") != GroupStatus.UNRESOLVED:
        raise InvalidSearchQuery(
            "Inbox search only works for 'unresolved' status")

    # We just filter on `GroupInbox.date_added` here, and don't filter by date
    # on the group. This keeps the query simpler and faster in some edge cases,
    # and date_added is a good enough proxy when we're using this sort.
    qs = GroupInbox.objects.filter(
        date_added__gte=start,
        date_added__lte=end,
        project__in=projects,
    )

    if environments is not None:
        environment_ids: List[int] = [
            environment.id for environment in environments
        ]
        qs = qs.filter(group_id__in=GroupEnvironment.objects.filter(
            environment_id__in=environment_ids).values_list(
                "group_id", flat=True).distinct())

    owner_search = get_search_filter(search_filters, "assigned_or_suggested",
                                     "=")
    if owner_search:
        qs = qs.filter(
            assigned_or_suggested_filter(owner_search,
                                         projects,
                                         field_filter="group_id"))

    paginator = DateTimePaginator(qs.order_by("date_added"), "-date_added")
    results = paginator.get_result(limit,
                                   cursor,
                                   count_hits=count_hits,
                                   max_hits=max_hits)

    # We want to return groups from the endpoint, but have the cursor be related to the
    # GroupInbox rows. So we paginate on the GroupInbox results queryset, then fetch
    # the group_ids out and use them to get the actual groups.
    group_qs = Group.objects.filter(
        id__in=[r.group_id for r in results.results],
        project__in=projects,
        status=GroupStatus.UNRESOLVED,
    )
    groups: Mapping[int, Group] = {g.id: g for g in group_qs}
    results.results = [
        groups[r.group_id] for r in results.results if r.group_id in groups
    ]
    return results
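A minimal sketch of how a caller might walk every page that `inbox_search` returns,
using only the `CursorResult` attributes exercised above (`results`, `next`,
`has_results`); the `projects` argument and the page size are illustrative assumptions:

def iterate_inbox(projects):
    cursor = None
    while True:
        page = inbox_search(projects, limit=100, cursor=cursor)
        for group in page.results:
            yield group
        # `next` is a Cursor; `has_results` reports whether another page is worth fetching
        if not page.next.has_results:
            break
        cursor = page.next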
Example #19
0
    def _query(self, projects, retention_window_start, group_queryset, tags, environments,
               sort_by, limit, cursor, count_hits, paginator_options, **parameters):

        # TODO: Product decision: we currently search Group.message to handle
        # the `query` parameter, because that's what we've always done. We could
        # do that search against every event in Snuba instead, but results may
        # differ.

        # TODO: It's possible `first_release` could be handled by Snuba.
        if environments is not None:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.extra(
                        where=[
                            '{} = {}'.format(
                                ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                                ds.get_sql_column(Release, 'id'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'organization'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'version'),
                            ),
                        ],
                        params=[projects[0].organization_id, version],
                        tables=[Release._meta.db_table],
                    ),
                ),
            }).build(
                group_queryset.extra(
                    where=[
                        u'{} = {}'.format(
                            ds.get_sql_column(Group, 'id'),
                            ds.get_sql_column(GroupEnvironment, 'group_id'),
                        ),
                        u'{} IN ({})'.format(
                            ds.get_sql_column(GroupEnvironment, 'environment_id'),
                            ', '.join(['%s' for e in environments])
                        ),
                    ],
                    params=[environment.id for environment in environments],
                    tables=[GroupEnvironment._meta.db_table],
                ),
                parameters,
            )
        else:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.filter(
                        first_release__organization_id=projects[0].organization_id,
                        first_release__version=version,
                    ),
                ),
            }).build(
                group_queryset,
                parameters,
            )

        now = timezone.now()
        end = parameters.get('date_to')
        if not end:
            end = now + ALLOWED_FUTURE_DELTA

            # This search is for some time window that ends with "now",
            # so if the requested sort is `date` (`last_seen`) and there
            # are no other Snuba-based search predicates, we can simply
            # return the results from Postgres.
            if cursor is None \
                    and sort_by == 'date' \
                    and not tags \
                    and not environments \
                    and not any(param in parameters for param in [
                        'age_from', 'age_to', 'last_seen_from',
                        'last_seen_to', 'times_seen', 'times_seen_lower',
                        'times_seen_upper'
                    ]):
                group_queryset = group_queryset.order_by('-last_seen')
                paginator = DateTimePaginator(group_queryset, '-last_seen', **paginator_options)
                # When it's a simple Django-only search, we count hits like normal
                return paginator.get_result(limit, cursor, count_hits=count_hits)

        # TODO: Presumably we only want to search back to the project's max
        # retention date, which may be closer than 90 days in the past, but
        # apparently `retention_window_start` can be None(?), so we need a
        # fallback.
        retention_date = max(
            filter(None, [
                retention_window_start,
                now - timedelta(days=90)
            ])
        )

        start = max(
            filter(None, [
                retention_date,
                parameters.get('date_from'),
            ])
        )

        end = max([
            retention_date,
            end
        ])

        if start == retention_date and end == retention_date:
            # Both `start` and `end` must have been trimmed to `retention_date`,
            # so this entire search was against a time range that is outside of
            # retention. We'll return empty results to maintain backwards compatibility
            # with Django search (for now).
            return EMPTY_RESULT

        if start >= end:
        # TODO: This maintains backwards compatibility with Django search, but
            # in the future we should find a way to notify the user that their search
            # is invalid.
            return EMPTY_RESULT

        # Here we check if all the django filters reduce the set of groups down
        # to something that we can send down to Snuba in a `group_id IN (...)`
        # clause.
        max_candidates = options.get('snuba.search.max-pre-snuba-candidates')
        candidate_ids = list(
            group_queryset.values_list('id', flat=True)[:max_candidates + 1]
        )
        metrics.timing('snuba.search.num_candidates', len(candidate_ids))
        if not candidate_ids:
            # no matches could possibly be found from this point on
            metrics.incr('snuba.search.no_candidates', skip_internal=False)
            return EMPTY_RESULT
        elif len(candidate_ids) > max_candidates:
            # If the pre-filter query didn't include anything to significantly
            # filter down the number of results (from 'first_release', 'query',
            # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
            # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
            # might have surpassed the `max_candidates`. In this case,
            # we *don't* want to pass candidates down to Snuba, and instead we
            # want Snuba to do all the filtering/sorting it can and *then* apply
            # this queryset to the results from Snuba, which we call
            # post-filtering.
            metrics.incr('snuba.search.too_many_candidates', skip_internal=False)
            candidate_ids = None

        sort_field = sort_strategies[sort_by]
        chunk_growth = options.get('snuba.search.chunk-growth-rate')
        max_chunk_size = options.get('snuba.search.max-chunk-size')
        chunk_limit = limit
        offset = 0
        num_chunks = 0
        hits = None

        paginator_results = EMPTY_RESULT
        result_groups = []
        result_group_ids = set()

        max_time = options.get('snuba.search.max-total-chunk-time-seconds')
        time_start = time.time()

        if count_hits and candidate_ids is None:
            # If we have no candidates, get a random sample of groups matching
            # the snuba side of the query, and see how many of those pass the
            # post-filter in postgres. This should give us an estimate of the
            # total number of snuba matches that will be overall matches, which
            # we can use to get an estimate for X-Hits. Note no cursor, so we
            # are always estimating the total hits.

            # The number of samples required to achieve a certain error bound
            # with a certain confidence interval can be calculated from a
            # rearrangement of the normal approximation (Wald) confidence
            # interval formula:
            #
            # https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval
            #
            # Effectively if we want the estimate to be within +/- 10% of the
            # real value with 95% confidence, we would need (1.96^2 * p*(1-p))
            # / 0.1^2 samples. With a starting assumption of p=0.5 (this
            # requires the most samples) we would need 96 samples to achieve
            # +/-10% @ 95% confidence.
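            # As a quick numeric check of that claim:
            #
            #   n = (1.96 ** 2) * p * (1 - p) / (0.10 ** 2)
            #     = 3.8416 * 0.25 / 0.01      (with p = 0.5)
            #     ≈ 96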

            sample_size = options.get('snuba.search.hits-sample-size')
            snuba_groups, snuba_total = snuba_search(
                start=start,
                end=end,
                project_ids=[p.id for p in projects],
                environment_ids=environments and [environment.id for environment in environments],
                tags=tags,
                sort_field=sort_field,
                limit=sample_size,
                offset=0,
                get_sample=True,
                **parameters
            )
            snuba_count = len(snuba_groups)
            if snuba_count == 0:
                return EMPTY_RESULT
            else:
                filtered_count = group_queryset.filter(
                    id__in=[gid for gid, _ in snuba_groups]
                ).count()

                hit_ratio = filtered_count / float(snuba_count)
                hits = int(hit_ratio * snuba_total)

        # Do smaller searches in chunks until we have enough results
        # to answer the query (or hit the end of possible results). We do
        # this because a common case for search is to return 100 groups
        # sorted by `last_seen`, and we want to avoid returning all of
        # a project's groups and then post-sorting them all in Postgres
        # when typically the first N results will do.
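        # For example (illustrative numbers only; the real values come from the
        # options read above): with limit=100, chunk_growth=1.5 and
        # max_chunk_size=2000, successive Snuba queries ask for 150, 225, 337, ...
        # rows until the paginator can satisfy `limit` post-filtered results or
        # Snuba has nothing left to return.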
        while (time.time() - time_start) < max_time:
            num_chunks += 1

            # grow the chunk size on each iteration to account for huge projects
            # and weird queries, up to a max size
            chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
            # but if we have candidate_ids always query for at least that many items
            chunk_limit = max(chunk_limit, len(candidate_ids) if candidate_ids else 0)

            # {group_id: group_score, ...}
            snuba_groups, total = snuba_search(
                start=start,
                end=end,
                project_ids=[p.id for p in projects],
                environment_ids=environments and [environment.id for environment in environments],
                tags=tags,
                sort_field=sort_field,
                cursor=cursor,
                candidate_ids=candidate_ids,
                limit=chunk_limit,
                offset=offset,
                **parameters
            )
            metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))
            count = len(snuba_groups)
            more_results = count >= limit and (offset + limit) < total
            offset += len(snuba_groups)

            if not snuba_groups:
                break

            if candidate_ids:
                # pre-filtered candidates were passed down to Snuba, so we're
                # finished with filtering and these are the only results. Note
                # that because we set the chunk size to at least the size of
                # the candidate_ids, we know we got all of them (ie there are
                # no more chunks after the first)
                result_groups = snuba_groups
                if count_hits:
                    hits = len(snuba_groups)
            else:
                # pre-filtered candidates were *not* passed down to Snuba,
                # so we need to do post-filtering to verify Sentry DB predicates
                filtered_group_ids = group_queryset.filter(
                    id__in=[gid for gid, _ in snuba_groups]
                ).values_list('id', flat=True)

                group_to_score = dict(snuba_groups)
                for group_id in filtered_group_ids:
                    if group_id in result_group_ids:
                        # because we're doing multiple Snuba queries, which
                        # happen outside of a transaction, there is a small possibility
                        # of groups moving around in the sort scoring underneath us,
                        # so we at least want to protect against duplicates
                        continue

                    group_score = group_to_score[group_id]
                    result_group_ids.add(group_id)
                    result_groups.append((group_id, group_score))

                if count_hits:
                    if not more_results:
                        # We know we have got all possible groups from snuba and filtered
                        # them all down, so we have all hits.
                        # TODO this probably doesn't work because we could be on page N
                        # and not be including hits from previous pages.
                        hits = len(result_groups)
                    else:
                        # We also could have underestimated hits from our sample and have
                        # already seen more hits than the estimate, so make sure hits is
                        # at least as big as what we have seen.
                        hits = max(hits, len(result_groups))

            # TODO do we actually have to rebuild this SequencePaginator every time
            # or can we just make it after we've broken out of the loop?
            paginator_results = SequencePaginator(
                [(score, id) for (id, score) in result_groups],
                reverse=True,
                **paginator_options
            ).get_result(limit, cursor, known_hits=hits)

            # break the query loop for one of three reasons:
            # * we started with Postgres candidates and so only do one Snuba query max
            # * the paginator is returning enough results to satisfy the query (>= the limit)
            # * there are no more groups in Snuba to post-filter
            if candidate_ids \
                    or len(paginator_results.results) >= limit \
                    or not more_results:
                break

        # HACK: We're using the SequencePaginator to mask the complexities of going
        # back and forth between two databases. This causes a problem with pagination
        # because we're 'lying' to the SequencePaginator (it thinks it has the entire
        # result set in memory when it does not). For this reason we need to make some
        # best guesses as to whether the `prev` and `next` cursors have more results.

        if len(paginator_results.results) == limit and more_results:
            # Because we are going back and forth between DBs there is a small
            # chance that we will hand the SequencePaginator exactly `limit`
            # items. In this case the paginator will assume there are no more
            # results, so we need to override the `next` cursor's results.
            paginator_results.next.has_results = True

        if cursor is not None and (not cursor.is_prev or len(paginator_results.results) > 0):
            # If the user passed a cursor, and it isn't already a 0 result `is_prev`
            # cursor, then it's worth allowing them to go back a page to check for
            # more results.
            paginator_results.prev.has_results = True

        metrics.timing('snuba.search.num_chunks', num_chunks)

        groups = Group.objects.in_bulk(paginator_results.results)
        paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

        return paginator_results
Example #20
0
    def query(
        self,
        projects,
        retention_window_start,
        group_queryset,
        environments,
        sort_by,
        limit,
        cursor,
        count_hits,
        paginator_options,
        search_filters,
        date_from,
        date_to,
        max_hits=None,
    ):

        now = timezone.now()
        end = None
        end_params = [
            _f for _f in
            [date_to, get_search_filter(search_filters, "date", "<")] if _f
        ]
        if end_params:
            end = min(end_params)

        if not end:
            end = now + ALLOWED_FUTURE_DELTA

            metrics.incr("snuba.search.postgres_only")

            # This search is for some time window that ends with "now",
            # so if the requested sort is `date` (`last_seen`) and there
            # are no other Snuba-based search predicates, we can simply
            # return the results from Postgres.
            if (cursor is None and sort_by == "date" and
                    # This handles tags and date parameters for search filters.
                    not [
                        sf for sf in search_filters if sf.key.name not in
                        self.postgres_only_fields.union(["date"])
                    ]):
                group_queryset = group_queryset.order_by("-last_seen")
                paginator = DateTimePaginator(group_queryset, "-last_seen",
                                              **paginator_options)
                # When it's a simple Django-only search, we count hits like normal
                return paginator.get_result(limit,
                                            cursor,
                                            count_hits=count_hits,
                                            max_hits=max_hits)

        # TODO: Presumably we only want to search back to the project's max
        # retention date, which may be closer than 90 days in the past, but
        # apparently `retention_window_start` can be None(?), so we need a
        # fallback.
        retention_date = max(
            _f for _f in [retention_window_start, now - timedelta(days=90)]
            if _f)
        start_params = [
            date_from, retention_date,
            get_search_filter(search_filters, "date", ">")
        ]
        start = max(_f for _f in start_params if _f)
        end = max([retention_date, end])

        if start == retention_date and end == retention_date:
            # Both `start` and `end` must have been trimmed to `retention_date`,
            # so this entire search was against a time range that is outside of
            # retention. We'll return empty results to maintain backwards compatibility
            # with Django search (for now).
            return self.empty_result

        if start >= end:
            # TODO: This maintains backwards compatibility with Django search, but
            # in the future we should find a way to notify the user that their search
            # is invalid.
            return self.empty_result

        # Here we check if all the django filters reduce the set of groups down
        # to something that we can send down to Snuba in a `group_id IN (...)`
        # clause.
        max_candidates = options.get("snuba.search.max-pre-snuba-candidates")

        with sentry_sdk.start_span(op="snuba_group_query") as span:
            group_ids = list(
                group_queryset.values_list("id",
                                           flat=True)[:max_candidates + 1])
            span.set_data("Max Candidates", max_candidates)
            span.set_data("Result Size", len(group_ids))
        metrics.timing("snuba.search.num_candidates", len(group_ids))

        too_many_candidates = False
        if not group_ids:
            # no matches could possibly be found from this point on
            metrics.incr("snuba.search.no_candidates", skip_internal=False)
            return self.empty_result
        elif len(group_ids) > max_candidates:
            # If the pre-filter query didn't include anything to significantly
            # filter down the number of results (from 'first_release', 'status',
            # 'bookmarked_by', 'assigned_to', 'unassigned', or 'subscribed_by')
            # then it might have surpassed the `max_candidates`. In this case,
            # we *don't* want to pass candidates down to Snuba, and instead we
            # want Snuba to do all the filtering/sorting it can and *then* apply
            # this queryset to the results from Snuba, which we call
            # post-filtering.
            metrics.incr("snuba.search.too_many_candidates",
                         skip_internal=False)
            too_many_candidates = True
            group_ids = []

        sort_field = self.sort_strategies[sort_by]
        chunk_growth = options.get("snuba.search.chunk-growth-rate")
        max_chunk_size = options.get("snuba.search.max-chunk-size")
        chunk_limit = limit
        offset = 0
        num_chunks = 0
        hits = self.calculate_hits(
            group_ids,
            too_many_candidates,
            sort_field,
            projects,
            retention_window_start,
            group_queryset,
            environments,
            sort_by,
            limit,
            cursor,
            count_hits,
            paginator_options,
            search_filters,
            start,
            end,
        )
        if count_hits and hits == 0:
            return self.empty_result

        paginator_results = self.empty_result
        result_groups = []
        result_group_ids = set()

        max_time = options.get("snuba.search.max-total-chunk-time-seconds")
        time_start = time.time()

        # Do smaller searches in chunks until we have enough results
        # to answer the query (or hit the end of possible results). We do
        # this because a common case for search is to return 100 groups
        # sorted by `last_seen`, and we want to avoid returning all of
        # a project's groups and then post-sorting them all in Postgres
        # when typically the first N results will do.
        while (time.time() - time_start) < max_time:
            num_chunks += 1

            # grow the chunk size on each iteration to account for huge projects
            # and weird queries, up to a max size
            chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
            # but if we have group_ids always query for at least that many items
            chunk_limit = max(chunk_limit, len(group_ids))

            # {group_id: group_score, ...}
            snuba_groups, total = self.snuba_search(
                start=start,
                end=end,
                project_ids=[p.id for p in projects],
                environment_ids=environments
                and [environment.id for environment in environments],
                organization_id=projects[0].organization_id,
                sort_field=sort_field,
                cursor=cursor,
                group_ids=group_ids,
                limit=chunk_limit,
                offset=offset,
                search_filters=search_filters,
            )
            metrics.timing("snuba.search.num_snuba_results", len(snuba_groups))
            count = len(snuba_groups)
            more_results = count >= limit and (offset + limit) < total
            offset += len(snuba_groups)

            if not snuba_groups:
                break

            if group_ids:
                # pre-filtered candidates were passed down to Snuba, so we're
                # finished with filtering and these are the only results. Note
                # that because we set the chunk size to at least the size of
                # the group_ids, we know we got all of them (ie there are
                # no more chunks after the first)
                result_groups = snuba_groups
                if count_hits and hits is None:
                    hits = len(snuba_groups)
            else:
                # pre-filtered candidates were *not* passed down to Snuba,
                # so we need to do post-filtering to verify Sentry DB predicates
                filtered_group_ids = group_queryset.filter(
                    id__in=[gid
                            for gid, _ in snuba_groups]).values_list("id",
                                                                     flat=True)

                group_to_score = dict(snuba_groups)
                for group_id in filtered_group_ids:
                    if group_id in result_group_ids:
                        # because we're doing multiple Snuba queries, which
                        # happen outside of a transaction, there is a small possibility
                        # of groups moving around in the sort scoring underneath us,
                        # so we at least want to protect against duplicates
                        continue

                    group_score = group_to_score[group_id]
                    result_group_ids.add(group_id)
                    result_groups.append((group_id, group_score))

            # TODO do we actually have to rebuild this SequencePaginator every time
            # or can we just make it after we've broken out of the loop?
            paginator_results = SequencePaginator(
                [(score, id) for (id, score) in result_groups],
                reverse=True,
                **paginator_options).get_result(limit,
                                                cursor,
                                                known_hits=hits,
                                                max_hits=max_hits)

            # break the query loop for one of three reasons:
            # * we started with Postgres candidates and so only do one Snuba query max
            # * the paginator is returning enough results to satisfy the query (>= the limit)
            # * there are no more groups in Snuba to post-filter
            if group_ids or len(
                    paginator_results.results) >= limit or not more_results:
                break

        # HACK: We're using the SequencePaginator to mask the complexities of going
        # back and forth between two databases. This causes a problem with pagination
        # because we're 'lying' to the SequencePaginator (it thinks it has the entire
        # result set in memory when it does not). For this reason we need to make some
        # best guesses as to whether the `prev` and `next` cursors have more results.

        if len(paginator_results.results) == limit and more_results:
            # Because we are going back and forth between DBs there is a small
            # chance that we will hand the SequencePaginator exactly `limit`
            # items. In this case the paginator will assume there are no more
            # results, so we need to override the `next` cursor's results.
            paginator_results.next.has_results = True

        if cursor is not None and (not cursor.is_prev
                                   or len(paginator_results.results) > 0):
            # If the user passed a cursor, and it isn't already a 0 result `is_prev`
            # cursor, then it's worth allowing them to go back a page to check for
            # more results.
            paginator_results.prev.has_results = True

        metrics.timing("snuba.search.num_chunks", num_chunks)

        groups = Group.objects.in_bulk(paginator_results.results)
        paginator_results.results = [
            groups[k] for k in paginator_results.results if k in groups
        ]

        return paginator_results
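A hedged sketch of what a call site for the `query` method above could look like;
`backend`, `my_projects`, and every argument value here are illustrative assumptions
rather than what Sentry's endpoints actually pass:

results = backend.query(
    projects=my_projects,
    retention_window_start=None,
    group_queryset=Group.objects.filter(project__in=my_projects),
    environments=None,
    sort_by="date",
    limit=25,
    cursor=None,
    count_hits=True,
    paginator_options={},
    search_filters=[],
    date_from=None,
    date_to=None,
)
page_of_groups = results.results  # Group instances in the requested sort order
next_cursor = results.next        # pass back in as `cursor` to fetch the following page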
Example #21
0
    def _query(self, projects, retention_window_start, group_queryset, environments,
               sort_by, limit, cursor, count_hits, paginator_options, search_filters,
               date_from, date_to):

        # TODO: It's possible `first_release` could be handled by Snuba.
        if environments is not None:
            environment_ids = [environment.id for environment in environments]
            group_queryset = group_queryset.filter(
                groupenvironment__environment_id__in=environment_ids
            )
            group_queryset = QuerySetBuilder({
                'first_release': QCallbackCondition(
                    lambda version: Q(
                        groupenvironment__first_release__organization_id=projects[0].organization_id,
                        groupenvironment__first_release__version=version,
                        groupenvironment__environment_id__in=environment_ids,
                    )
                ),
                'first_seen': ScalarCondition(
                    'groupenvironment__first_seen',
                    {'groupenvironment__environment_id__in': environment_ids}
                ),
            }).build(group_queryset, search_filters)
        else:
            group_queryset = QuerySetBuilder({
                'first_release': QCallbackCondition(
                    lambda version: Q(
                        first_release__organization_id=projects[0].organization_id,
                        first_release__version=version,
                    ),
                ),
                'first_seen': ScalarCondition('first_seen'),
            }).build(group_queryset, search_filters)

        now = timezone.now()
        end = None
        # Build a concrete list here (not a lazy `filter` object), so that the
        # truthiness check below behaves correctly on Python 3.
        end_params = [
            _f for _f in
            [date_to, get_search_filter(search_filters, 'date', '<')] if _f
        ]
        if end_params:
            end = min(end_params)

        if not end:
            end = now + ALLOWED_FUTURE_DELTA

            # This search is for some time window that ends with "now",
            # so if the requested sort is `date` (`last_seen`) and there
            # are no other Snuba-based search predicates, we can simply
            # return the results from Postgres.
            if (
                cursor is None and
                sort_by == 'date' and
                not environments and
                # This handles tags and date parameters for search filters.
                not [
                    sf for sf in search_filters
                    if sf.key.name not in issue_only_fields.union(['date', 'message'])
                ]
            ):
                group_queryset = group_queryset.order_by('-last_seen')
                paginator = DateTimePaginator(group_queryset, '-last_seen', **paginator_options)
                # When it's a simple Django-only search, we count hits like normal
                return paginator.get_result(limit, cursor, count_hits=count_hits)

        # TODO: Presumably we only want to search back to the project's max
        # retention date, which may be closer than 90 days in the past, but
        # apparently `retention_window_start` can be None(?), so we need a
        # fallback.
        retention_date = max(
            filter(None, [
                retention_window_start,
                now - timedelta(days=90)
            ])
        )

        # TODO: We should try and consolidate all this logic together a little
        # better, maybe outside the backend. Should be easier once we're on
        # just the new search filters
        start_params = [
            date_from,
            retention_date,
            get_search_filter(search_filters, 'date', '>'),
        ]
        start = max(filter(None, start_params))

        end = max([
            retention_date,
            end
        ])

        if start == retention_date and end == retention_date:
            # Both `start` and `end` must have been trimmed to `retention_date`,
            # so this entire search was against a time range that is outside of
            # retention. We'll return empty results to maintain backwards compatibility
            # with Django search (for now).
            return EMPTY_RESULT

        if start >= end:
            # TODO: This maintains backwards compatibility with Django search, but
            # in the future we should find a way to notify the user that their search
            # is invalid.
            return EMPTY_RESULT

        # Here we check if all the django filters reduce the set of groups down
        # to something that we can send down to Snuba in a `group_id IN (...)`
        # clause.
        max_candidates = options.get('snuba.search.max-pre-snuba-candidates')
        too_many_candidates = False
        candidate_ids = list(
            group_queryset.values_list('id', flat=True)[:max_candidates + 1]
        )
        metrics.timing('snuba.search.num_candidates', len(candidate_ids))
        if not candidate_ids:
            # no matches could possibly be found from this point on
            metrics.incr('snuba.search.no_candidates', skip_internal=False)
            return EMPTY_RESULT
        elif len(candidate_ids) > max_candidates:
            # If the pre-filter query didn't include anything to significantly
            # filter down the number of results (from 'first_release', 'query',
            # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
            # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
            # might have surpassed the `max_candidates`. In this case,
            # we *don't* want to pass candidates down to Snuba, and instead we
            # want Snuba to do all the filtering/sorting it can and *then* apply
            # this queryset to the results from Snuba, which we call
            # post-filtering.
            metrics.incr('snuba.search.too_many_candidates', skip_internal=False)
            too_many_candidates = True
            candidate_ids = []

        sort_field = sort_strategies[sort_by]
        chunk_growth = options.get('snuba.search.chunk-growth-rate')
        max_chunk_size = options.get('snuba.search.max-chunk-size')
        chunk_limit = limit
        offset = 0
        num_chunks = 0
        hits = None

        paginator_results = EMPTY_RESULT
        result_groups = []
        result_group_ids = set()

        max_time = options.get('snuba.search.max-total-chunk-time-seconds')
        time_start = time.time()

        if count_hits and (too_many_candidates or cursor is not None):
            # If we had too many candidates to reasonably pass down to snuba,
            # or if we have a cursor that bisects the overall result set (such
            # that our query only sees results on one side of the cursor) then
            # we need an alternative way to figure out the total hits that this
            # query has.

            # To do this, we get a sample of groups matching the snuba side of
            # the query, and see how many of those pass the post-filter in
            # postgres. This should give us an estimate of the total number of
            # snuba matches that will be overall matches, which we can use to
            # get an estimate for X-Hits.

            # The sampling is not simple random sampling. It will return *all*
            # matching groups if there are fewer than N groups matching the
            # query, or it will return a random, deterministic subset of N of
            # the groups if there are more than N overall matches. This means
            # that the "estimate" is actually an accurate result when there are
            # fewer than N matching groups.

            # The number of samples required to achieve a certain error bound
            # with a certain confidence interval can be calculated from a
            # rearrangement of the normal approximation (Wald) confidence
            # interval formula:
            #
            # https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval
            #
            # Effectively if we want the estimate to be within +/- 10% of the
            # real value with 95% confidence, we would need (1.96^2 * p*(1-p))
            # / 0.1^2 samples. With a starting assumption of p=0.5 (this
            # requires the most samples) we would need 96 samples to achieve
            # +/-10% @ 95% confidence.

            sample_size = options.get('snuba.search.hits-sample-size')
            snuba_groups, snuba_total = snuba_search(
                start=start,
                end=end,
                project_ids=[p.id for p in projects],
                environment_ids=environments and [environment.id for environment in environments],
                sort_field=sort_field,
                limit=sample_size,
                offset=0,
                get_sample=True,
                search_filters=search_filters,
            )
            snuba_count = len(snuba_groups)
            if snuba_count == 0:
                return EMPTY_RESULT
            else:
                filtered_count = group_queryset.filter(
                    id__in=[gid for gid, _ in snuba_groups]
                ).count()

                hit_ratio = filtered_count / float(snuba_count)
                hits = int(hit_ratio * snuba_total)

        # Do smaller searches in chunks until we have enough results
        # to answer the query (or hit the end of possible results). We do
        # this because a common case for search is to return 100 groups
        # sorted by `last_seen`, and we want to avoid returning all of
        # a project's groups and then post-sorting them all in Postgres
        # when typically the first N results will do.
        while (time.time() - time_start) < max_time:
            num_chunks += 1

            # grow the chunk size on each iteration to account for huge projects
            # and weird queries, up to a max size
            chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
            # but if we have candidate_ids always query for at least that many items
            chunk_limit = max(chunk_limit, len(candidate_ids))

            # {group_id: group_score, ...}
            snuba_groups, total = snuba_search(
                start=start,
                end=end,
                project_ids=[p.id for p in projects],
                environment_ids=environments and [environment.id for environment in environments],
                sort_field=sort_field,
                cursor=cursor,
                candidate_ids=candidate_ids,
                limit=chunk_limit,
                offset=offset,
                search_filters=search_filters,
            )
            metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))
            count = len(snuba_groups)
            more_results = count >= limit and (offset + limit) < total
            offset += len(snuba_groups)

            if not snuba_groups:
                break

            if candidate_ids:
                # pre-filtered candidates were passed down to Snuba, so we're
                # finished with filtering and these are the only results. Note
                # that because we set the chunk size to at least the size of
                # the candidate_ids, we know we got all of them (ie there are
                # no more chunks after the first)
                result_groups = snuba_groups
                if count_hits and hits is None:
                    hits = len(snuba_groups)
            else:
                # pre-filtered candidates were *not* passed down to Snuba,
                # so we need to do post-filtering to verify Sentry DB predicates
                filtered_group_ids = group_queryset.filter(
                    id__in=[gid for gid, _ in snuba_groups]
                ).values_list('id', flat=True)

                group_to_score = dict(snuba_groups)
                for group_id in filtered_group_ids:
                    if group_id in result_group_ids:
                        # because we're doing multiple Snuba queries, which
                        # happen outside of a transaction, there is a small possibility
                        # of groups moving around in the sort scoring underneath us,
                        # so we at least want to protect against duplicates
                        continue

                    group_score = group_to_score[group_id]
                    result_group_ids.add(group_id)
                    result_groups.append((group_id, group_score))

            # TODO do we actually have to rebuild this SequencePaginator every time
            # or can we just make it after we've broken out of the loop?
            paginator_results = SequencePaginator(
                [(score, id) for (id, score) in result_groups],
                reverse=True,
                **paginator_options
            ).get_result(limit, cursor, known_hits=hits)

            # break the query loop for one of three reasons:
            # * we started with Postgres candidates and so only do one Snuba query max
            # * the paginator is returning enough results to satisfy the query (>= the limit)
            # * there are no more groups in Snuba to post-filter
            if candidate_ids \
                    or len(paginator_results.results) >= limit \
                    or not more_results:
                break

        # HACK: We're using the SequencePaginator to mask the complexities of going
        # back and forth between two databases. This causes a problem with pagination
        # because we're 'lying' to the SequencePaginator (it thinks it has the entire
        # result set in memory when it does not). For this reason we need to make some
        # best guesses as to whether the `prev` and `next` cursors have more results.

        if len(paginator_results.results) == limit and more_results:
            # Because we are going back and forth between DBs there is a small
            # chance that we will hand the SequencePaginator exactly `limit`
            # items. In this case the paginator will assume there are no more
            # results, so we need to override the `next` cursor's results.
            paginator_results.next.has_results = True

        if cursor is not None and (not cursor.is_prev or len(paginator_results.results) > 0):
            # If the user passed a cursor, and it isn't already a 0 result `is_prev`
            # cursor, then it's worth allowing them to go back a page to check for
            # more results.
            paginator_results.prev.has_results = True

        metrics.timing('snuba.search.num_chunks', num_chunks)

        groups = Group.objects.in_bulk(paginator_results.results)
        paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

        return paginator_results