def test_descending(self): joined = timezone.now() res1 = self.create_user('*****@*****.**', date_joined=joined) res2 = self.create_user('*****@*****.**', date_joined=joined + timedelta(seconds=1)) res3 = self.create_user('*****@*****.**', date_joined=joined + timedelta(seconds=2)) queryset = User.objects.all() paginator = DateTimePaginator(queryset, '-date_joined') result1 = paginator.get_result(limit=1, cursor=None) assert len(result1) == 1, result1 assert result1[0] == res3 assert result1.next assert not result1.prev result2 = paginator.get_result(limit=2, cursor=result1.next) assert len(result2) == 2, result2 assert result2[0] == res2 assert result2[1] == res1 assert not result2.next assert result2.prev result3 = paginator.get_result(limit=2, cursor=result2.prev) assert len(result3) == 1, result3 assert result3[0] == res3 assert result3.next assert not result3.prev
def test_roudning_offset(self): joined = timezone.now() res1 = self.create_user('*****@*****.**', date_joined=joined) res2 = self.create_user('*****@*****.**', date_joined=joined + timedelta(microseconds=1)) res3 = self.create_user('*****@*****.**', date_joined=joined + timedelta(microseconds=2)) res4 = self.create_user('*****@*****.**', date_joined=joined + timedelta(microseconds=3)) queryset = User.objects.all() paginator = DateTimePaginator(queryset, 'date_joined') result1 = paginator.get_result(limit=3, cursor=None) assert len(result1) == 3, result1 assert result1[0] == res1 assert result1[1] == res2 assert result1[2] == res3 result2 = paginator.get_result(limit=10, cursor=result1.next) assert len(result2) == 1, result2 assert result2[0] == res4 result3 = paginator.get_result(limit=2, cursor=result2.prev) assert len(result3) == 2, result3 assert result3[0] == res2 assert result3[1] == res3 result4 = paginator.get_result(limit=1, cursor=result3.prev) assert len(result4) == 1, result4 assert result4[0] == res1 result5 = paginator.get_result(limit=10, cursor=result4.prev) assert len(result5) == 0, list(result5)
def test_simple(self): res1 = self.create_user('*****@*****.**') res2 = self.create_user('*****@*****.**') res3 = self.create_user('*****@*****.**') queryset = User.objects.all() paginator = DateTimePaginator(queryset, 'date_joined') result1 = paginator.get_result(limit=1, cursor=None) assert len(result1) == 1, result1 assert result1[0] == res1 assert result1.next assert not result1.prev result2 = paginator.get_result(limit=2, cursor=result1.next) assert len(result2) == 2, result2 assert result2[0] == res2 assert result2[1] == res3 assert not result2.next assert result2.prev # this is not yet correct result3 = paginator.get_result(limit=2, cursor=result2.prev) assert len(result3) == 1, list(result3) assert result3[0] == res1 assert result3.next assert not result3.prev
def test_prev_descending_with_new(self): joined = timezone.now() res1 = self.create_user('*****@*****.**', date_joined=joined) res2 = self.create_user('*****@*****.**', date_joined=joined + timedelta(seconds=1)) queryset = User.objects.all() paginator = DateTimePaginator(queryset, '-date_joined') result1 = paginator.get_result(limit=10, cursor=None) assert len(result1) == 2, result1 assert result1[0] == res2 assert result1[1] == res1 res3 = self.create_user('*****@*****.**', date_joined=joined + timedelta(seconds=2)) res4 = self.create_user('*****@*****.**', date_joined=joined + timedelta(seconds=3)) result2 = paginator.get_result(limit=10, cursor=result1.prev) assert len(result2) == 2, result2 assert result2[0] == res4 assert result2[1] == res3 result3 = paginator.get_result(limit=10, cursor=result2.prev) assert len(result3) == 0, result3 result4 = paginator.get_result(limit=10, cursor=result1.next) assert len(result4) == 0, result4
def test_rounding_offset(self): joined = timezone.now() res1 = self.create_user('*****@*****.**', date_joined=joined) res2 = self.create_user('*****@*****.**', date_joined=joined + timedelta(microseconds=1)) res3 = self.create_user('*****@*****.**', date_joined=joined + timedelta(microseconds=2)) res4 = self.create_user('*****@*****.**', date_joined=joined + timedelta(microseconds=3)) queryset = User.objects.all() paginator = DateTimePaginator(queryset, 'date_joined') result1 = paginator.get_result(limit=3, cursor=None) assert len(result1) == 3, result1 assert result1[0] == res1 assert result1[1] == res2 assert result1[2] == res3 result2 = paginator.get_result(limit=10, cursor=result1.next) assert len(result2) == 1, result2 assert result2[0] == res4 result3 = paginator.get_result(limit=2, cursor=result2.prev) assert len(result3) == 2, result3 assert result3[0] == res2 assert result3[1] == res3 result4 = paginator.get_result(limit=1, cursor=result3.prev) assert len(result4) == 1, result4 assert result4[0] == res1 result5 = paginator.get_result(limit=10, cursor=result4.prev) assert len(result5) == 0, list(result5)
def test_ascending(self): joined = timezone.now() # The DateTime pager only has accuracy up to 1000th of a second. # Everythng can't be added within less than 10 microseconds of each # other. This is handled by the pager (see test_rounding_offset), but # this case shouldn't rely on it. res1 = self.create_user('*****@*****.**', date_joined=joined) res2 = self.create_user('*****@*****.**', date_joined=joined + timedelta(seconds=1)) res3 = self.create_user('*****@*****.**', date_joined=joined + timedelta(seconds=2)) res4 = self.create_user('*****@*****.**', date_joined=joined + timedelta(seconds=3)) queryset = User.objects.all() paginator = DateTimePaginator(queryset, 'date_joined') result1 = paginator.get_result(limit=2, cursor=None) assert len(result1) == 2, result1 assert result1[0] == res1 assert result1[1] == res2 assert result1.next assert not result1.prev result2 = paginator.get_result(limit=2, cursor=result1.next) assert len(result2) == 2, result2 assert result2[0] == res3 assert result2[1] == res4 assert not result2.next assert result2.prev result3 = paginator.get_result(limit=1, cursor=result2.prev) assert len(result3) == 1, result3 assert result3[0] == res2 assert result3.next assert result3.prev result4 = paginator.get_result(limit=1, cursor=result3.prev) assert len(result4) == 1, result4 assert result4[0] == res1 assert result4.next assert not result4.prev
def test_same_row_updated(self): joined = timezone.now() res1 = self.create_user('*****@*****.**', date_joined=joined) queryset = User.objects.all() paginator = DateTimePaginator(queryset, '-date_joined') result1 = paginator.get_result(limit=3, cursor=None) assert len(result1) == 1, result1 assert result1[0] == res1 # Prev page should return no results result2 = paginator.get_result(limit=3, cursor=result1.prev) assert len(result2) == 0, result2 # If the same row has an updated join date then it should # show up on the prev page res1.update(date_joined=joined + timedelta(seconds=1)) result3 = paginator.get_result(limit=3, cursor=result1.prev) assert len(result3) == 1, result3 assert result3[0] == res1 # Make sure updates work as expected with extra rows res1.update(date_joined=res1.date_joined + timedelta(seconds=1)) res2 = self.create_user( '*****@*****.**', date_joined=res1.date_joined + timedelta(seconds=1), ) res3 = self.create_user( '*****@*****.**', date_joined=res1.date_joined + timedelta(seconds=2), ) res4 = self.create_user( '*****@*****.**', date_joined=res1.date_joined + timedelta(seconds=3), ) result4 = paginator.get_result(limit=1, cursor=result3.prev) assert len(result4) == 1, result4 assert result4[0] == res1 result5 = paginator.get_result(limit=3, cursor=result3.prev) assert len(result5) == 3, result5 assert result5[0] == res3 assert result5[1] == res2 assert result5[2] == res1 result6 = paginator.get_result(limit=3, cursor=result5.prev) assert len(result6) == 1, result6 assert result6[0] == res4 res4.update(date_joined=res4.date_joined + timedelta(seconds=1)) result7 = paginator.get_result(limit=3, cursor=result6.prev) assert len(result7) == 1, result7 assert result7[0] == res4
def _query(self, projects, retention_window_start, group_queryset, tags, environments, sort_by, limit, cursor, count_hits, paginator_options, **parameters): # TODO: Product decision: we currently search Group.message to handle # the `query` parameter, because that's what we've always done. We could # do that search against every event in Snuba instead, but results may # differ. # TODO: It's possible `first_release` could be handled by Snuba. if environments is not None: group_queryset = ds.QuerySetBuilder({ 'first_release': ds.CallbackCondition( lambda queryset, version: queryset.extra( where=[ '{} = {}'.format( ds.get_sql_column(GroupEnvironment, 'first_release_id'), ds.get_sql_column(Release, 'id'), ), '{} = %s'.format( ds.get_sql_column(Release, 'organization'), ), '{} = %s'.format( ds.get_sql_column(Release, 'version'), ), ], params=[projects[0].organization_id, version], tables=[Release._meta.db_table], ), ), }).build( group_queryset.extra( where=[ u'{} = {}'.format( ds.get_sql_column(Group, 'id'), ds.get_sql_column(GroupEnvironment, 'group_id'), ), u'{} IN ({})'.format( ds.get_sql_column(GroupEnvironment, 'environment_id'), ', '.join(['%s' for e in environments]) ), ], params=[environment.id for environment in environments], tables=[GroupEnvironment._meta.db_table], ), parameters, ) else: group_queryset = ds.QuerySetBuilder({ 'first_release': ds.CallbackCondition( lambda queryset, version: queryset.filter( first_release__organization_id=projects[0].organization_id, first_release__version=version, ), ), }).build( group_queryset, parameters, ) now = timezone.now() end = parameters.get('date_to') if not end: end = now + ALLOWED_FUTURE_DELTA # This search is for some time window that ends with "now", # so if the requested sort is `date` (`last_seen`) and there # are no other Snuba-based search predicates, we can simply # return the results from Postgres. if cursor is None \ and sort_by == 'date' \ and not tags \ and not environments \ and not any(param in parameters for param in [ 'age_from', 'age_to', 'last_seen_from', 'last_seen_to', 'times_seen', 'times_seen_lower', 'times_seen_upper' ]): group_queryset = group_queryset.order_by('-last_seen') paginator = DateTimePaginator(group_queryset, '-last_seen', **paginator_options) return paginator.get_result(limit, cursor, count_hits=False) # TODO: Presumably we only want to search back to the project's max # retention date, which may be closer than 90 days in the past, but # apparently `retention_window_start` can be None(?), so we need a # fallback. retention_date = max( filter(None, [ retention_window_start, now - timedelta(days=90) ]) ) start = max( filter(None, [ retention_date, parameters.get('date_from'), ]) ) end = max([ retention_date, end ]) if start == retention_date and end == retention_date: # Both `start` and `end` must have been trimmed to `retention_date`, # so this entire search was against a time range that is outside of # retention. We'll return empty results to maintain backwards compatability # with Django search (for now). return EMPTY_RESULT if start >= end: # TODO: This maintains backwards compatability with Django search, but # in the future we should find a way to notify the user that their search # is invalid. return EMPTY_RESULT # num_candidates is the number of Group IDs to send down to Snuba, if # more Group ID candidates are found, a "bare" Snuba search is performed # and the result groups are then post-filtered via queries to the Sentry DB optimizer_enabled = options.get('snuba.search.pre-snuba-candidates-optimizer') if optimizer_enabled: missed_projects = [] keys = [self._get_project_count_cache_key(p.id) for p in projects] counts_by_projects = { self._get_project_id_from_key(key): count for key, count in cache.get_many(keys).items() } missed_projects = {p.id for p in projects} - set(counts_by_projects.keys()) if missed_projects: missing_counts = snuba.query( start=max( filter(None, [ retention_window_start, now - timedelta(days=90) ]) ), end=now, groupby=['project_id'], filter_keys={ 'project_id': list(missed_projects), }, aggregations=[['uniq', 'group_id', 'group_count']], referrer='search', ) cache.set_many({ self._get_project_count_cache_key(project_id): count for project_id, count in missing_counts.items() }, options.get('snuba.search.project-group-count-cache-time')) counts_by_projects.update(missing_counts) min_candidates = options.get('snuba.search.min-pre-snuba-candidates') max_candidates = options.get('snuba.search.max-pre-snuba-candidates') candidates_percentage = options.get('snuba.search.pre-snuba-candidates-percentage') num_candidates = max( min_candidates, min( max_candidates, sum(counts_by_projects.values()) * candidates_percentage ) ) else: num_candidates = options.get('snuba.search.min-pre-snuba-candidates') # pre-filter query candidate_ids = None if num_candidates and limit <= num_candidates: candidate_ids = list( group_queryset.values_list('id', flat=True)[:num_candidates + 1] ) metrics.timing('snuba.search.num_candidates', len(candidate_ids)) if not candidate_ids: # no matches could possibly be found from this point on metrics.incr('snuba.search.no_candidates', skip_internal=False) return EMPTY_RESULT elif len(candidate_ids) > num_candidates: # If the pre-filter query didn't include anything to significantly # filter down the number of results (from 'first_release', 'query', # 'status', 'bookmarked_by', 'assigned_to', 'unassigned', # 'subscribed_by', 'active_at_from', or 'active_at_to') then it # might have surpassed the `num_candidates`. In this case, # we *don't* want to pass candidates down to Snuba, and instead we # want Snuba to do all the filtering/sorting it can and *then* apply # this queryset to the results from Snuba, which we call # post-filtering. metrics.incr('snuba.search.too_many_candidates', skip_internal=False) candidate_ids = None sort_field = sort_strategies[sort_by] chunk_growth = options.get('snuba.search.chunk-growth-rate') max_chunk_size = options.get('snuba.search.max-chunk-size') chunk_limit = limit offset = 0 num_chunks = 0 paginator_results = EMPTY_RESULT result_groups = [] result_group_ids = set() max_time = options.get('snuba.search.max-total-chunk-time-seconds') time_start = time.time() # Do smaller searches in chunks until we have enough results # to answer the query (or hit the end of possible results). We do # this because a common case for search is to return 100 groups # sorted by `last_seen`, and we want to avoid returning all of # a project's groups and then post-sorting them all in Postgres # when typically the first N results will do. while (time.time() - time_start) < max_time: num_chunks += 1 # grow the chunk size on each iteration to account for huge projects # and weird queries, up to a max size chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size) # but if we have candidate_ids always query for at least that many items chunk_limit = max(chunk_limit, len(candidate_ids) if candidate_ids else 0) # {group_id: group_score, ...} snuba_groups, more_results = snuba_search( start=start, end=end, project_ids=[p.id for p in projects], environment_ids=environments and [environment.id for environment in environments], tags=tags, sort_field=sort_field, cursor=cursor, candidate_ids=candidate_ids, limit=chunk_limit, offset=offset, **parameters ) metrics.timing('snuba.search.num_snuba_results', len(snuba_groups)) offset += len(snuba_groups) if not snuba_groups: break if candidate_ids: # pre-filtered candidates were passed down to Snuba, # so we're finished with filtering and these are the # only results result_groups = snuba_groups else: # pre-filtered candidates were *not* passed down to Snuba, # so we need to do post-filtering to verify Sentry DB predicates filtered_group_ids = group_queryset.filter( id__in=[gid for gid, _ in snuba_groups] ).values_list('id', flat=True) group_to_score = dict(snuba_groups) for group_id in filtered_group_ids: if group_id in result_group_ids: # because we're doing multiple Snuba queries, which # happen outside of a transaction, there is a small possibility # of groups moving around in the sort scoring underneath us, # so we at least want to protect against duplicates continue group_score = group_to_score[group_id] result_group_ids.add(group_id) result_groups.append((group_id, group_score)) paginator_results = SequencePaginator( [(score, id) for (id, score) in result_groups], reverse=True, **paginator_options ).get_result(limit, cursor, count_hits=False) # break the query loop for one of three reasons: # * we started with Postgres candidates and so only do one Snuba query max # * the paginator is returning enough results to satisfy the query (>= the limit) # * there are no more groups in Snuba to post-filter if candidate_ids \ or len(paginator_results.results) >= limit \ or not more_results: break # HACK: We're using the SequencePaginator to mask the complexities of going # back and forth between two databases. This causes a problem with pagination # because we're 'lying' to the SequencePaginator (it thinks it has the entire # result set in memory when it does not). For this reason we need to make some # best guesses as to whether the `prev` and `next` cursors have more results. if len(paginator_results.results) == limit and more_results: # Because we are going back and forth between DBs there is a small # chance that we will hand the SequencePaginator exactly `limit` # items. In this case the paginator will assume there are no more # results, so we need to override the `next` cursor's results. paginator_results.next.has_results = True if cursor is not None and (not cursor.is_prev or len(paginator_results.results) > 0): # If the user passed a cursor, and it isn't already a 0 result `is_prev` # cursor, then it's worth allowing them to go back a page to check for # more results. paginator_results.prev.has_results = True metrics.timing('snuba.search.num_chunks', num_chunks) groups = Group.objects.in_bulk(paginator_results.results) paginator_results.results = [groups[k] for k in paginator_results.results if k in groups] return paginator_results
def _query(self, project, retention_window_start, group_queryset, tags, environment, sort_by, limit, cursor, count_hits, paginator_options, **parameters): # TODO: Product decision: we currently search Group.message to handle # the `query` parameter, because that's what we've always done. We could # do that search against every event in Snuba instead, but results may # differ. # TODO: It's possible `first_release` could be handled by Snuba. if environment is not None: group_queryset = ds.QuerySetBuilder({ 'first_release': ds.CallbackCondition( lambda queryset, version: queryset.extra( where=[ '{} = {}'.format( ds.get_sql_column(GroupEnvironment, 'first_release_id'), ds.get_sql_column(Release, 'id'), ), '{} = %s'.format( ds.get_sql_column(Release, 'organization'), ), '{} = %s'.format( ds.get_sql_column(Release, 'version'), ), ], params=[project.organization_id, version], tables=[Release._meta.db_table], ), ), }).build( group_queryset.extra( where=[ u'{} = {}'.format( ds.get_sql_column(Group, 'id'), ds.get_sql_column(GroupEnvironment, 'group_id'), ), u'{} = %s'.format( ds.get_sql_column(GroupEnvironment, 'environment_id'), ), ], params=[environment.id], tables=[GroupEnvironment._meta.db_table], ), parameters, ) else: group_queryset = ds.QuerySetBuilder({ 'first_release': ds.CallbackCondition( lambda queryset, version: queryset.filter( first_release__organization_id=project.organization_id, first_release__version=version, ), ), }).build( group_queryset, parameters, ) now = timezone.now() # TODO: Presumably we want to search back to the project's full retention, # which may be higher than 90 days in the past, but apparently # `retention_window_start` can be None(?), so we need a fallback. start = max( filter(None, [ retention_window_start, parameters.get('date_from'), now - timedelta(days=90) ]) ) end = parameters.get('date_to') if not end: end = now + ALLOWED_FUTURE_DELTA # This search is for some time window that ends with "now", # so if the requested sort is `date` (`last_seen`) and there # are no other Snuba-based search predicates, we can simply # return the results from Postgres. if sort_by == 'date' \ and not tags \ and not environment \ and not any(param in parameters for param in [ 'age_from', 'age_to', 'last_seen_from', 'last_seen_to', 'times_seen', 'times_seen_lower', 'times_seen_upper' ]): group_queryset = group_queryset.order_by('-last_seen') paginator = DateTimePaginator(group_queryset, '-last_seen', **paginator_options) return paginator.get_result(limit, cursor, count_hits=count_hits) assert start < end # maximum number of Group IDs to send down to Snuba, # if more Group ID candidates are found, a "bare" Snuba # search is performed and the result groups are then # post-filtered via queries to the Sentry DB max_pre_snuba_candidates = options.get('snuba.search.max-pre-snuba-candidates') # pre-filter query candidate_ids = None if max_pre_snuba_candidates and limit <= max_pre_snuba_candidates: candidate_ids = list( group_queryset.values_list('id', flat=True)[:max_pre_snuba_candidates + 1] ) metrics.timing('snuba.search.num_candidates', len(candidate_ids)) if not candidate_ids: # no matches could possibly be found from this point on metrics.incr('snuba.search.no_candidates') return Paginator(Group.objects.none()).get_result() elif len(candidate_ids) > max_pre_snuba_candidates: # If the pre-filter query didn't include anything to significantly # filter down the number of results (from 'first_release', 'query', # 'status', 'bookmarked_by', 'assigned_to', 'unassigned', # 'subscribed_by', 'active_at_from', or 'active_at_to') then it # might have surpassed the `max_pre_snuba_candidates`. In this case, # we *don't* want to pass candidates down to Snuba, and instead we # want Snuba to do all the filtering/sorting it can and *then* apply # this queryset to the results from Snuba, which we call # post-filtering. metrics.incr('snuba.search.too_many_candidates') candidate_ids = None sort, extra_aggregations, score_fn = sort_strategies[sort_by] chunk_growth = options.get('snuba.search.chunk-growth-rate') max_chunk_size = options.get('snuba.search.max-chunk-size') chunk_limit = limit offset = 0 num_chunks = 0 paginator_results = Paginator(Group.objects.none()).get_result() result_groups = [] result_group_ids = set() min_score = float('inf') max_score = -1 max_time = options.get('snuba.search.max-total-chunk-time-seconds') time_start = time.time() # Do smaller searches in chunks until we have enough results # to answer the query (or hit the end of possible results). We do # this because a common case for search is to return 100 groups # sorted by `last_seen`, and we want to avoid returning all of # a project's groups and then post-sorting them all in Postgres # when typically the first N results will do. while (time.time() - time_start) < max_time: num_chunks += 1 # grow the chunk size on each iteration to account for huge projects # and weird queries, up to a max size chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size) # but if we have candidate_ids always query for at least that many items chunk_limit = max(chunk_limit, len(candidate_ids) if candidate_ids else 0) # {group_id: group_score, ...} snuba_groups, more_results = snuba_search( start=start, end=end, project_id=project.id, environment_id=environment and environment.id, tags=tags, sort=sort, extra_aggregations=extra_aggregations, score_fn=score_fn, candidate_ids=candidate_ids, limit=chunk_limit, offset=offset, **parameters ) metrics.timing('snuba.search.num_snuba_results', len(snuba_groups)) offset += len(snuba_groups) if not snuba_groups: break if candidate_ids: # pre-filtered candidates were passed down to Snuba, # so we're finished with filtering and these are the # only results result_groups = snuba_groups else: # pre-filtered candidates were *not* passed down to Snuba, # so we need to do post-filtering to verify Sentry DB predicates filtered_group_ids = group_queryset.filter( id__in=[gid for gid, _ in snuba_groups] ).values_list('id', flat=True) group_to_score = dict(snuba_groups) for group_id in filtered_group_ids: if group_id in result_group_ids: # because we're doing multiple Snuba queries, which # happen outside of a transaction, there is a small possibility # of groups moving around in the sort scoring underneath us, # so we at least want to protect against duplicates continue group_score = group_to_score[group_id] result_group_ids.add(group_id) result_groups.append((group_id, group_score)) # used for cursor logic min_score = min(min_score, group_score) max_score = max(max_score, group_score) # HACK: If a cursor is being used and there may be more results available # in Snuba, we need to detect whether the cursor's value will be # found in the result groups. If it isn't in the results yet we need to # continue querying before we hand off to the paginator to decide whether # enough results are found or not, otherwise the paginator will happily # return `limit` worth of results that don't take the cursor into account # at all, since it can't know there are more results available. # TODO: If chunked search works in practice we should probably extend the # paginator to throw something if the cursor value is never found, or do # something other than partially leak internal paginator logic up to here. # Or make separate Paginator implementation just for Snuba search? if cursor is not None \ and not candidate_ids \ and more_results: if cursor.is_prev and min_score < cursor.value: continue elif not cursor.is_prev and max_score > cursor.value: continue paginator_results = SequencePaginator( [(score, id) for (id, score) in result_groups], reverse=True, **paginator_options ).get_result(limit, cursor, count_hits=False) # break the query loop for one of three reasons: # * we started with Postgres candidates and so only do one Snuba query max # * the paginator is returning enough results to satisfy the query (>= the limit) # * there are no more groups in Snuba to post-filter if candidate_ids \ or len(paginator_results.results) >= limit \ or not more_results: break metrics.timing('snuba.search.num_chunks', num_chunks) groups = Group.objects.in_bulk(paginator_results.results) paginator_results.results = [groups[k] for k in paginator_results.results if k in groups] return paginator_results
def _query(self, projects, retention_window_start, group_queryset, tags, environments, sort_by, limit, cursor, count_hits, paginator_options, **parameters): # TODO: Product decision: we currently search Group.message to handle # the `query` parameter, because that's what we've always done. We could # do that search against every event in Snuba instead, but results may # differ. # TODO: It's possible `first_release` could be handled by Snuba. if environments is not None: group_queryset = ds.QuerySetBuilder({ 'first_release': ds.CallbackCondition( lambda queryset, version: queryset.extra( where=[ '{} = {}'.format( ds.get_sql_column(GroupEnvironment, 'first_release_id'), ds.get_sql_column(Release, 'id'), ), '{} = %s'.format( ds.get_sql_column(Release, 'organization'), ), '{} = %s'.format( ds.get_sql_column(Release, 'version'), ), ], params=[projects[0].organization_id, version], tables=[Release._meta.db_table], ), ), }).build( group_queryset.extra( where=[ u'{} = {}'.format( ds.get_sql_column(Group, 'id'), ds.get_sql_column(GroupEnvironment, 'group_id'), ), u'{} IN ({})'.format( ds.get_sql_column(GroupEnvironment, 'environment_id'), ', '.join(['%s' for e in environments]) ), ], params=[environment.id for environment in environments], tables=[GroupEnvironment._meta.db_table], ), parameters, ) else: group_queryset = ds.QuerySetBuilder({ 'first_release': ds.CallbackCondition( lambda queryset, version: queryset.filter( first_release__organization_id=projects[0].organization_id, first_release__version=version, ), ), }).build( group_queryset, parameters, ) now = timezone.now() end = parameters.get('date_to') if not end: end = now + ALLOWED_FUTURE_DELTA # This search is for some time window that ends with "now", # so if the requested sort is `date` (`last_seen`) and there # are no other Snuba-based search predicates, we can simply # return the results from Postgres. if cursor is None \ and sort_by == 'date' \ and not tags \ and not environments \ and not any(param in parameters for param in [ 'age_from', 'age_to', 'last_seen_from', 'last_seen_to', 'times_seen', 'times_seen_lower', 'times_seen_upper' ]): group_queryset = group_queryset.order_by('-last_seen') paginator = DateTimePaginator(group_queryset, '-last_seen', **paginator_options) # When its a simple django-only search, we count_hits like normal return paginator.get_result(limit, cursor, count_hits=count_hits) # TODO: Presumably we only want to search back to the project's max # retention date, which may be closer than 90 days in the past, but # apparently `retention_window_start` can be None(?), so we need a # fallback. retention_date = max( filter(None, [ retention_window_start, now - timedelta(days=90) ]) ) start = max( filter(None, [ retention_date, parameters.get('date_from'), ]) ) end = max([ retention_date, end ]) if start == retention_date and end == retention_date: # Both `start` and `end` must have been trimmed to `retention_date`, # so this entire search was against a time range that is outside of # retention. We'll return empty results to maintain backwards compatability # with Django search (for now). return EMPTY_RESULT if start >= end: # TODO: This maintains backwards compatability with Django search, but # in the future we should find a way to notify the user that their search # is invalid. return EMPTY_RESULT # Here we check if all the django filters reduce the set of groups down # to something that we can send down to Snuba in a `group_id IN (...)` # clause. max_candidates = options.get('snuba.search.max-pre-snuba-candidates') candidate_ids = list( group_queryset.values_list('id', flat=True)[:max_candidates + 1] ) metrics.timing('snuba.search.num_candidates', len(candidate_ids)) if not candidate_ids: # no matches could possibly be found from this point on metrics.incr('snuba.search.no_candidates', skip_internal=False) return EMPTY_RESULT elif len(candidate_ids) > max_candidates: # If the pre-filter query didn't include anything to significantly # filter down the number of results (from 'first_release', 'query', # 'status', 'bookmarked_by', 'assigned_to', 'unassigned', # 'subscribed_by', 'active_at_from', or 'active_at_to') then it # might have surpassed the `max_candidates`. In this case, # we *don't* want to pass candidates down to Snuba, and instead we # want Snuba to do all the filtering/sorting it can and *then* apply # this queryset to the results from Snuba, which we call # post-filtering. metrics.incr('snuba.search.too_many_candidates', skip_internal=False) candidate_ids = None sort_field = sort_strategies[sort_by] chunk_growth = options.get('snuba.search.chunk-growth-rate') max_chunk_size = options.get('snuba.search.max-chunk-size') chunk_limit = limit offset = 0 num_chunks = 0 hits = None paginator_results = EMPTY_RESULT result_groups = [] result_group_ids = set() max_time = options.get('snuba.search.max-total-chunk-time-seconds') time_start = time.time() if count_hits and candidate_ids is None: # If we have no candidates, get a random sample of groups matching # the snuba side of the query, and see how many of those pass the # post-filter in postgres. This should give us an estimate of the # total number of snuba matches that will be overall matches, which # we can use to get an estimate for X-Hits. Note no cursor, so we # are always estimating the total hits. # The number of samples required to achieve a certain error bound # with a certain confidence interval can be calculated from a # rearrangement of the normal approximation (Wald) confidence # interval formula: # # https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval # # Effectively if we want the estimate to be within +/- 10% of the # real value with 95% confidence, we would need (1.96^2 * p*(1-p)) # / 0.1^2 samples. With a starting assumption of p=0.5 (this # requires the most samples) we would need 96 samples to achieve # +/-10% @ 95% confidence. sample_size = options.get('snuba.search.hits-sample-size') snuba_groups, snuba_total = snuba_search( start=start, end=end, project_ids=[p.id for p in projects], environment_ids=environments and [environment.id for environment in environments], tags=tags, sort_field=sort_field, limit=sample_size, offset=0, get_sample=True, **parameters ) snuba_count = len(snuba_groups) if snuba_count == 0: return EMPTY_RESULT else: filtered_count = group_queryset.filter( id__in=[gid for gid, _ in snuba_groups] ).count() hit_ratio = filtered_count / float(snuba_count) hits = int(hit_ratio * snuba_total) # Do smaller searches in chunks until we have enough results # to answer the query (or hit the end of possible results). We do # this because a common case for search is to return 100 groups # sorted by `last_seen`, and we want to avoid returning all of # a project's groups and then post-sorting them all in Postgres # when typically the first N results will do. while (time.time() - time_start) < max_time: num_chunks += 1 # grow the chunk size on each iteration to account for huge projects # and weird queries, up to a max size chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size) # but if we have candidate_ids always query for at least that many items chunk_limit = max(chunk_limit, len(candidate_ids) if candidate_ids else 0) # {group_id: group_score, ...} snuba_groups, total = snuba_search( start=start, end=end, project_ids=[p.id for p in projects], environment_ids=environments and [environment.id for environment in environments], tags=tags, sort_field=sort_field, cursor=cursor, candidate_ids=candidate_ids, limit=chunk_limit, offset=offset, **parameters ) metrics.timing('snuba.search.num_snuba_results', len(snuba_groups)) count = len(snuba_groups) more_results = count >= limit and (offset + limit) < total offset += len(snuba_groups) if not snuba_groups: break if candidate_ids: # pre-filtered candidates were passed down to Snuba, so we're # finished with filtering and these are the only results. Note # that because we set the chunk size to at least the size of # the candidate_ids, we know we got all of them (ie there are # no more chunks after the first) result_groups = snuba_groups if count_hits: hits = len(snuba_groups) else: # pre-filtered candidates were *not* passed down to Snuba, # so we need to do post-filtering to verify Sentry DB predicates filtered_group_ids = group_queryset.filter( id__in=[gid for gid, _ in snuba_groups] ).values_list('id', flat=True) group_to_score = dict(snuba_groups) for group_id in filtered_group_ids: if group_id in result_group_ids: # because we're doing multiple Snuba queries, which # happen outside of a transaction, there is a small possibility # of groups moving around in the sort scoring underneath us, # so we at least want to protect against duplicates continue group_score = group_to_score[group_id] result_group_ids.add(group_id) result_groups.append((group_id, group_score)) if count_hits: if not more_results: # We know we have got all possible groups from snuba and filtered # them all down, so we have all hits. # TODO this probably doesn't work because we could be on page N # and not be including hits from previous pages. hits = len(result_groups) else: # We also could have underestimated hits from our sample and have # already seen more hits than the estimate, so make sure hits is # at least as big as what we have seen. hits = max(hits, len(result_groups)) # TODO do we actually have to rebuild this SequencePaginator every time # or can we just make it after we've broken out of the loop? paginator_results = SequencePaginator( [(score, id) for (id, score) in result_groups], reverse=True, **paginator_options ).get_result(limit, cursor, known_hits=hits) # break the query loop for one of three reasons: # * we started with Postgres candidates and so only do one Snuba query max # * the paginator is returning enough results to satisfy the query (>= the limit) # * there are no more groups in Snuba to post-filter if candidate_ids \ or len(paginator_results.results) >= limit \ or not more_results: break # HACK: We're using the SequencePaginator to mask the complexities of going # back and forth between two databases. This causes a problem with pagination # because we're 'lying' to the SequencePaginator (it thinks it has the entire # result set in memory when it does not). For this reason we need to make some # best guesses as to whether the `prev` and `next` cursors have more results. if len(paginator_results.results) == limit and more_results: # Because we are going back and forth between DBs there is a small # chance that we will hand the SequencePaginator exactly `limit` # items. In this case the paginator will assume there are no more # results, so we need to override the `next` cursor's results. paginator_results.next.has_results = True if cursor is not None and (not cursor.is_prev or len(paginator_results.results) > 0): # If the user passed a cursor, and it isn't already a 0 result `is_prev` # cursor, then it's worth allowing them to go back a page to check for # more results. paginator_results.prev.has_results = True metrics.timing('snuba.search.num_chunks', num_chunks) groups = Group.objects.in_bulk(paginator_results.results) paginator_results.results = [groups[k] for k in paginator_results.results if k in groups] return paginator_results
def _query(self, projects, retention_window_start, group_queryset, environments, sort_by, limit, cursor, count_hits, paginator_options, search_filters, date_from, date_to): # TODO: It's possible `first_release` could be handled by Snuba. if environments is not None: environment_ids = [environment.id for environment in environments] group_queryset = group_queryset.filter( groupenvironment__environment_id__in=environment_ids ) group_queryset = QuerySetBuilder({ 'first_release': QCallbackCondition( lambda version: Q( groupenvironment__first_release__organization_id=projects[0].organization_id, groupenvironment__first_release__version=version, groupenvironment__environment_id__in=environment_ids, ) ), 'first_seen': ScalarCondition( 'groupenvironment__first_seen', {'groupenvironment__environment_id__in': environment_ids} ), }).build(group_queryset, search_filters) else: group_queryset = QuerySetBuilder({ 'first_release': QCallbackCondition( lambda version: Q( first_release__organization_id=projects[0].organization_id, first_release__version=version, ), ), 'first_seen': ScalarCondition('first_seen'), }).build(group_queryset, search_filters) now = timezone.now() end = None end_params = filter( None, [date_to, get_search_filter(search_filters, 'date', '<')], ) if end_params: end = min(end_params) if not end: end = now + ALLOWED_FUTURE_DELTA # This search is for some time window that ends with "now", # so if the requested sort is `date` (`last_seen`) and there # are no other Snuba-based search predicates, we can simply # return the results from Postgres. if ( cursor is None and sort_by == 'date' and not environments and # This handles tags and date parameters for search filters. not [ sf for sf in search_filters if sf.key.name not in issue_only_fields.union(['date', 'message']) ] ): group_queryset = group_queryset.order_by('-last_seen') paginator = DateTimePaginator(group_queryset, '-last_seen', **paginator_options) # When its a simple django-only search, we count_hits like normal return paginator.get_result(limit, cursor, count_hits=count_hits) # TODO: Presumably we only want to search back to the project's max # retention date, which may be closer than 90 days in the past, but # apparently `retention_window_start` can be None(?), so we need a # fallback. retention_date = max( filter(None, [ retention_window_start, now - timedelta(days=90) ]) ) # TODO: We should try and consolidate all this logic together a little # better, maybe outside the backend. Should be easier once we're on # just the new search filters start_params = [ date_from, retention_date, get_search_filter(search_filters, 'date', '>'), ] start = max(filter(None, start_params)) end = max([ retention_date, end ]) if start == retention_date and end == retention_date: # Both `start` and `end` must have been trimmed to `retention_date`, # so this entire search was against a time range that is outside of # retention. We'll return empty results to maintain backwards compatability # with Django search (for now). return EMPTY_RESULT if start >= end: # TODO: This maintains backwards compatability with Django search, but # in the future we should find a way to notify the user that their search # is invalid. return EMPTY_RESULT # Here we check if all the django filters reduce the set of groups down # to something that we can send down to Snuba in a `group_id IN (...)` # clause. max_candidates = options.get('snuba.search.max-pre-snuba-candidates') too_many_candidates = False candidate_ids = list( group_queryset.values_list('id', flat=True)[:max_candidates + 1] ) metrics.timing('snuba.search.num_candidates', len(candidate_ids)) if not candidate_ids: # no matches could possibly be found from this point on metrics.incr('snuba.search.no_candidates', skip_internal=False) return EMPTY_RESULT elif len(candidate_ids) > max_candidates: # If the pre-filter query didn't include anything to significantly # filter down the number of results (from 'first_release', 'query', # 'status', 'bookmarked_by', 'assigned_to', 'unassigned', # 'subscribed_by', 'active_at_from', or 'active_at_to') then it # might have surpassed the `max_candidates`. In this case, # we *don't* want to pass candidates down to Snuba, and instead we # want Snuba to do all the filtering/sorting it can and *then* apply # this queryset to the results from Snuba, which we call # post-filtering. metrics.incr('snuba.search.too_many_candidates', skip_internal=False) too_many_candidates = True candidate_ids = [] sort_field = sort_strategies[sort_by] chunk_growth = options.get('snuba.search.chunk-growth-rate') max_chunk_size = options.get('snuba.search.max-chunk-size') chunk_limit = limit offset = 0 num_chunks = 0 hits = None paginator_results = EMPTY_RESULT result_groups = [] result_group_ids = set() max_time = options.get('snuba.search.max-total-chunk-time-seconds') time_start = time.time() if count_hits and (too_many_candidates or cursor is not None): # If we had too many candidates to reasonably pass down to snuba, # or if we have a cursor that bisects the overall result set (such # that our query only sees results on one side of the cursor) then # we need an alternative way to figure out the total hits that this # query has. # To do this, we get a sample of groups matching the snuba side of # the query, and see how many of those pass the post-filter in # postgres. This should give us an estimate of the total number of # snuba matches that will be overall matches, which we can use to # get an estimate for X-Hits. # The sampling is not simple random sampling. It will return *all* # matching groups if there are less than N groups matching the # query, or it will return a random, deterministic subset of N of # the groups if there are more than N overall matches. This means # that the "estimate" is actually an accurate result when there are # less than N matching groups. # The number of samples required to achieve a certain error bound # with a certain confidence interval can be calculated from a # rearrangement of the normal approximation (Wald) confidence # interval formula: # # https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval # # Effectively if we want the estimate to be within +/- 10% of the # real value with 95% confidence, we would need (1.96^2 * p*(1-p)) # / 0.1^2 samples. With a starting assumption of p=0.5 (this # requires the most samples) we would need 96 samples to achieve # +/-10% @ 95% confidence. sample_size = options.get('snuba.search.hits-sample-size') snuba_groups, snuba_total = snuba_search( start=start, end=end, project_ids=[p.id for p in projects], environment_ids=environments and [environment.id for environment in environments], sort_field=sort_field, limit=sample_size, offset=0, get_sample=True, search_filters=search_filters, ) snuba_count = len(snuba_groups) if snuba_count == 0: return EMPTY_RESULT else: filtered_count = group_queryset.filter( id__in=[gid for gid, _ in snuba_groups] ).count() hit_ratio = filtered_count / float(snuba_count) hits = int(hit_ratio * snuba_total) # Do smaller searches in chunks until we have enough results # to answer the query (or hit the end of possible results). We do # this because a common case for search is to return 100 groups # sorted by `last_seen`, and we want to avoid returning all of # a project's groups and then post-sorting them all in Postgres # when typically the first N results will do. while (time.time() - time_start) < max_time: num_chunks += 1 # grow the chunk size on each iteration to account for huge projects # and weird queries, up to a max size chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size) # but if we have candidate_ids always query for at least that many items chunk_limit = max(chunk_limit, len(candidate_ids)) # {group_id: group_score, ...} snuba_groups, total = snuba_search( start=start, end=end, project_ids=[p.id for p in projects], environment_ids=environments and [environment.id for environment in environments], sort_field=sort_field, cursor=cursor, candidate_ids=candidate_ids, limit=chunk_limit, offset=offset, search_filters=search_filters, ) metrics.timing('snuba.search.num_snuba_results', len(snuba_groups)) count = len(snuba_groups) more_results = count >= limit and (offset + limit) < total offset += len(snuba_groups) if not snuba_groups: break if candidate_ids: # pre-filtered candidates were passed down to Snuba, so we're # finished with filtering and these are the only results. Note # that because we set the chunk size to at least the size of # the candidate_ids, we know we got all of them (ie there are # no more chunks after the first) result_groups = snuba_groups if count_hits and hits is None: hits = len(snuba_groups) else: # pre-filtered candidates were *not* passed down to Snuba, # so we need to do post-filtering to verify Sentry DB predicates filtered_group_ids = group_queryset.filter( id__in=[gid for gid, _ in snuba_groups] ).values_list('id', flat=True) group_to_score = dict(snuba_groups) for group_id in filtered_group_ids: if group_id in result_group_ids: # because we're doing multiple Snuba queries, which # happen outside of a transaction, there is a small possibility # of groups moving around in the sort scoring underneath us, # so we at least want to protect against duplicates continue group_score = group_to_score[group_id] result_group_ids.add(group_id) result_groups.append((group_id, group_score)) # TODO do we actually have to rebuild this SequencePaginator every time # or can we just make it after we've broken out of the loop? paginator_results = SequencePaginator( [(score, id) for (id, score) in result_groups], reverse=True, **paginator_options ).get_result(limit, cursor, known_hits=hits) # break the query loop for one of three reasons: # * we started with Postgres candidates and so only do one Snuba query max # * the paginator is returning enough results to satisfy the query (>= the limit) # * there are no more groups in Snuba to post-filter if candidate_ids \ or len(paginator_results.results) >= limit \ or not more_results: break # HACK: We're using the SequencePaginator to mask the complexities of going # back and forth between two databases. This causes a problem with pagination # because we're 'lying' to the SequencePaginator (it thinks it has the entire # result set in memory when it does not). For this reason we need to make some # best guesses as to whether the `prev` and `next` cursors have more results. if len(paginator_results.results) == limit and more_results: # Because we are going back and forth between DBs there is a small # chance that we will hand the SequencePaginator exactly `limit` # items. In this case the paginator will assume there are no more # results, so we need to override the `next` cursor's results. paginator_results.next.has_results = True if cursor is not None and (not cursor.is_prev or len(paginator_results.results) > 0): # If the user passed a cursor, and it isn't already a 0 result `is_prev` # cursor, then it's worth allowing them to go back a page to check for # more results. paginator_results.prev.has_results = True metrics.timing('snuba.search.num_chunks', num_chunks) groups = Group.objects.in_bulk(paginator_results.results) paginator_results.results = [groups[k] for k in paginator_results.results if k in groups] return paginator_results
def query( self, projects, retention_window_start, group_queryset, environments, sort_by, limit, cursor, count_hits, paginator_options, search_filters, date_from, date_to, max_hits=None, ): now = timezone.now() end = None end_params = [ _f for _f in [date_to, get_search_filter(search_filters, "date", "<")] if _f ] if end_params: end = min(end_params) if not end: end = now + ALLOWED_FUTURE_DELTA metrics.incr("snuba.search.postgres_only") # This search is for some time window that ends with "now", # so if the requested sort is `date` (`last_seen`) and there # are no other Snuba-based search predicates, we can simply # return the results from Postgres. if (cursor is None and sort_by == "date" and # This handles tags and date parameters for search filters. not [ sf for sf in search_filters if sf.key.name not in self.postgres_only_fields.union(["date"]) ]): group_queryset = group_queryset.order_by("-last_seen") paginator = DateTimePaginator(group_queryset, "-last_seen", **paginator_options) # When its a simple django-only search, we count_hits like normal return paginator.get_result(limit, cursor, count_hits=count_hits, max_hits=max_hits) # TODO: Presumably we only want to search back to the project's max # retention date, which may be closer than 90 days in the past, but # apparently `retention_window_start` can be None(?), so we need a # fallback. retention_date = max( _f for _f in [retention_window_start, now - timedelta(days=90)] if _f) start_params = [ date_from, retention_date, get_search_filter(search_filters, "date", ">") ] start = max(_f for _f in start_params if _f) end = max([retention_date, end]) if start == retention_date and end == retention_date: # Both `start` and `end` must have been trimmed to `retention_date`, # so this entire search was against a time range that is outside of # retention. We'll return empty results to maintain backwards compatibility # with Django search (for now). return self.empty_result if start >= end: # TODO: This maintains backwards compatibility with Django search, but # in the future we should find a way to notify the user that their search # is invalid. return self.empty_result # Here we check if all the django filters reduce the set of groups down # to something that we can send down to Snuba in a `group_id IN (...)` # clause. max_candidates = options.get("snuba.search.max-pre-snuba-candidates") with sentry_sdk.start_span(op="snuba_group_query") as span: group_ids = list( group_queryset.values_list("id", flat=True)[:max_candidates + 1]) span.set_data("Max Candidates", max_candidates) span.set_data("Result Size", len(group_ids)) metrics.timing("snuba.search.num_candidates", len(group_ids)) too_many_candidates = False if not group_ids: # no matches could possibly be found from this point on metrics.incr("snuba.search.no_candidates", skip_internal=False) return self.empty_result elif len(group_ids) > max_candidates: # If the pre-filter query didn't include anything to significantly # filter down the number of results (from 'first_release', 'status', # 'bookmarked_by', 'assigned_to', 'unassigned', or 'subscribed_by') # then it might have surpassed the `max_candidates`. In this case, # we *don't* want to pass candidates down to Snuba, and instead we # want Snuba to do all the filtering/sorting it can and *then* apply # this queryset to the results from Snuba, which we call # post-filtering. metrics.incr("snuba.search.too_many_candidates", skip_internal=False) too_many_candidates = True group_ids = [] sort_field = self.sort_strategies[sort_by] chunk_growth = options.get("snuba.search.chunk-growth-rate") max_chunk_size = options.get("snuba.search.max-chunk-size") chunk_limit = limit offset = 0 num_chunks = 0 hits = self.calculate_hits( group_ids, too_many_candidates, sort_field, projects, retention_window_start, group_queryset, environments, sort_by, limit, cursor, count_hits, paginator_options, search_filters, start, end, ) if count_hits and hits == 0: return self.empty_result paginator_results = self.empty_result result_groups = [] result_group_ids = set() max_time = options.get("snuba.search.max-total-chunk-time-seconds") time_start = time.time() # Do smaller searches in chunks until we have enough results # to answer the query (or hit the end of possible results). We do # this because a common case for search is to return 100 groups # sorted by `last_seen`, and we want to avoid returning all of # a project's groups and then post-sorting them all in Postgres # when typically the first N results will do. while (time.time() - time_start) < max_time: num_chunks += 1 # grow the chunk size on each iteration to account for huge projects # and weird queries, up to a max size chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size) # but if we have group_ids always query for at least that many items chunk_limit = max(chunk_limit, len(group_ids)) # {group_id: group_score, ...} snuba_groups, total = self.snuba_search( start=start, end=end, project_ids=[p.id for p in projects], environment_ids=environments and [environment.id for environment in environments], organization_id=projects[0].organization_id, sort_field=sort_field, cursor=cursor, group_ids=group_ids, limit=chunk_limit, offset=offset, search_filters=search_filters, ) metrics.timing("snuba.search.num_snuba_results", len(snuba_groups)) count = len(snuba_groups) more_results = count >= limit and (offset + limit) < total offset += len(snuba_groups) if not snuba_groups: break if group_ids: # pre-filtered candidates were passed down to Snuba, so we're # finished with filtering and these are the only results. Note # that because we set the chunk size to at least the size of # the group_ids, we know we got all of them (ie there are # no more chunks after the first) result_groups = snuba_groups if count_hits and hits is None: hits = len(snuba_groups) else: # pre-filtered candidates were *not* passed down to Snuba, # so we need to do post-filtering to verify Sentry DB predicates filtered_group_ids = group_queryset.filter( id__in=[gid for gid, _ in snuba_groups]).values_list("id", flat=True) group_to_score = dict(snuba_groups) for group_id in filtered_group_ids: if group_id in result_group_ids: # because we're doing multiple Snuba queries, which # happen outside of a transaction, there is a small possibility # of groups moving around in the sort scoring underneath us, # so we at least want to protect against duplicates continue group_score = group_to_score[group_id] result_group_ids.add(group_id) result_groups.append((group_id, group_score)) # break the query loop for one of three reasons: # * we started with Postgres candidates and so only do one Snuba query max # * the paginator is returning enough results to satisfy the query (>= the limit) # * there are no more groups in Snuba to post-filter # TODO do we actually have to rebuild this SequencePaginator every time # or can we just make it after we've broken out of the loop? paginator_results = SequencePaginator( [(score, id) for (id, score) in result_groups], reverse=True, **paginator_options).get_result(limit, cursor, known_hits=hits, max_hits=max_hits) if group_ids or len( paginator_results.results) >= limit or not more_results: break # HACK: We're using the SequencePaginator to mask the complexities of going # back and forth between two databases. This causes a problem with pagination # because we're 'lying' to the SequencePaginator (it thinks it has the entire # result set in memory when it does not). For this reason we need to make some # best guesses as to whether the `prev` and `next` cursors have more results. if len(paginator_results.results) == limit and more_results: # Because we are going back and forth between DBs there is a small # chance that we will hand the SequencePaginator exactly `limit` # items. In this case the paginator will assume there are no more # results, so we need to override the `next` cursor's results. paginator_results.next.has_results = True if cursor is not None and (not cursor.is_prev or len(paginator_results.results) > 0): # If the user passed a cursor, and it isn't already a 0 result `is_prev` # cursor, then it's worth allowing them to go back a page to check for # more results. paginator_results.prev.has_results = True metrics.timing("snuba.search.num_chunks", num_chunks) groups = Group.objects.in_bulk(paginator_results.results) paginator_results.results = [ groups[k] for k in paginator_results.results if k in groups ] return paginator_results
def query( self, projects, retention_window_start, group_queryset, environments, sort_by, limit, cursor, count_hits, paginator_options, search_filters, date_from, date_to, ): now = timezone.now() end = None end_params = filter( None, [date_to, get_search_filter(search_filters, "date", "<")]) if end_params: end = min(end_params) if not end: end = now + ALLOWED_FUTURE_DELTA # This search is for some time window that ends with "now", # so if the requested sort is `date` (`last_seen`) and there # are no other Snuba-based search predicates, we can simply # return the results from Postgres. if (cursor is None and sort_by == "date" and not environments and # This handles tags and date parameters for search filters. not [ sf for sf in search_filters if sf.key.name not in issue_only_fields.union(["date"]) ]): group_queryset = group_queryset.order_by("-last_seen") paginator = DateTimePaginator(group_queryset, "-last_seen", **paginator_options) # When its a simple django-only search, we count_hits like normal return paginator.get_result(limit, cursor, count_hits=count_hits) # TODO: Presumably we only want to search back to the project's max # retention date, which may be closer than 90 days in the past, but # apparently `retention_window_start` can be None(?), so we need a # fallback. retention_date = max( filter(None, [retention_window_start, now - timedelta(days=90)])) # TODO: We should try and consolidate all this logic together a little # better, maybe outside the backend. Should be easier once we're on # just the new search filters start_params = [ date_from, retention_date, get_search_filter(search_filters, "date", ">") ] start = max(filter(None, start_params)) end = max([retention_date, end]) if start == retention_date and end == retention_date: # Both `start` and `end` must have been trimmed to `retention_date`, # so this entire search was against a time range that is outside of # retention. We'll return empty results to maintain backwards compatibility # with Django search (for now). return EMPTY_RESULT if start >= end: # TODO: This maintains backwards compatibility with Django search, but # in the future we should find a way to notify the user that their search # is invalid. return EMPTY_RESULT # Here we check if all the django filters reduce the set of groups down # to something that we can send down to Snuba in a `group_id IN (...)` # clause. max_candidates = options.get("snuba.search.max-pre-snuba-candidates") too_many_candidates = False candidate_ids = list( group_queryset.values_list("id", flat=True)[:max_candidates + 1]) metrics.timing("snuba.search.num_candidates", len(candidate_ids)) if not candidate_ids: # no matches could possibly be found from this point on metrics.incr("snuba.search.no_candidates", skip_internal=False) return EMPTY_RESULT elif len(candidate_ids) > max_candidates: # If the pre-filter query didn't include anything to significantly # filter down the number of results (from 'first_release', 'query', # 'status', 'bookmarked_by', 'assigned_to', 'unassigned', # 'subscribed_by', 'active_at_from', or 'active_at_to') then it # might have surpassed the `max_candidates`. In this case, # we *don't* want to pass candidates down to Snuba, and instead we # want Snuba to do all the filtering/sorting it can and *then* apply # this queryset to the results from Snuba, which we call # post-filtering. metrics.incr("snuba.search.too_many_candidates", skip_internal=False) too_many_candidates = True candidate_ids = [] sort_field = sort_strategies[sort_by] chunk_growth = options.get("snuba.search.chunk-growth-rate") max_chunk_size = options.get("snuba.search.max-chunk-size") chunk_limit = limit offset = 0 num_chunks = 0 hits = None paginator_results = EMPTY_RESULT result_groups = [] result_group_ids = set() max_time = options.get("snuba.search.max-total-chunk-time-seconds") time_start = time.time() if count_hits and (too_many_candidates or cursor is not None): # If we had too many candidates to reasonably pass down to snuba, # or if we have a cursor that bisects the overall result set (such # that our query only sees results on one side of the cursor) then # we need an alternative way to figure out the total hits that this # query has. # To do this, we get a sample of groups matching the snuba side of # the query, and see how many of those pass the post-filter in # postgres. This should give us an estimate of the total number of # snuba matches that will be overall matches, which we can use to # get an estimate for X-Hits. # The sampling is not simple random sampling. It will return *all* # matching groups if there are less than N groups matching the # query, or it will return a random, deterministic subset of N of # the groups if there are more than N overall matches. This means # that the "estimate" is actually an accurate result when there are # less than N matching groups. # The number of samples required to achieve a certain error bound # with a certain confidence interval can be calculated from a # rearrangement of the normal approximation (Wald) confidence # interval formula: # # https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval # # Effectively if we want the estimate to be within +/- 10% of the # real value with 95% confidence, we would need (1.96^2 * p*(1-p)) # / 0.1^2 samples. With a starting assumption of p=0.5 (this # requires the most samples) we would need 96 samples to achieve # +/-10% @ 95% confidence. sample_size = options.get("snuba.search.hits-sample-size") snuba_groups, snuba_total = snuba_search( start=start, end=end, project_ids=[p.id for p in projects], environment_ids=environments and [environment.id for environment in environments], sort_field=sort_field, limit=sample_size, offset=0, get_sample=True, search_filters=search_filters, ) snuba_count = len(snuba_groups) if snuba_count == 0: return EMPTY_RESULT else: filtered_count = group_queryset.filter( id__in=[gid for gid, _ in snuba_groups]).count() hit_ratio = filtered_count / float(snuba_count) hits = int(hit_ratio * snuba_total) # Do smaller searches in chunks until we have enough results # to answer the query (or hit the end of possible results). We do # this because a common case for search is to return 100 groups # sorted by `last_seen`, and we want to avoid returning all of # a project's groups and then post-sorting them all in Postgres # when typically the first N results will do. while (time.time() - time_start) < max_time: num_chunks += 1 # grow the chunk size on each iteration to account for huge projects # and weird queries, up to a max size chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size) # but if we have candidate_ids always query for at least that many items chunk_limit = max(chunk_limit, len(candidate_ids)) # {group_id: group_score, ...} snuba_groups, total = snuba_search( start=start, end=end, project_ids=[p.id for p in projects], environment_ids=environments and [environment.id for environment in environments], sort_field=sort_field, cursor=cursor, candidate_ids=candidate_ids, limit=chunk_limit, offset=offset, search_filters=search_filters, ) metrics.timing("snuba.search.num_snuba_results", len(snuba_groups)) count = len(snuba_groups) more_results = count >= limit and (offset + limit) < total offset += len(snuba_groups) if not snuba_groups: break if candidate_ids: # pre-filtered candidates were passed down to Snuba, so we're # finished with filtering and these are the only results. Note # that because we set the chunk size to at least the size of # the candidate_ids, we know we got all of them (ie there are # no more chunks after the first) result_groups = snuba_groups if count_hits and hits is None: hits = len(snuba_groups) else: # pre-filtered candidates were *not* passed down to Snuba, # so we need to do post-filtering to verify Sentry DB predicates filtered_group_ids = group_queryset.filter( id__in=[gid for gid, _ in snuba_groups]).values_list("id", flat=True) group_to_score = dict(snuba_groups) for group_id in filtered_group_ids: if group_id in result_group_ids: # because we're doing multiple Snuba queries, which # happen outside of a transaction, there is a small possibility # of groups moving around in the sort scoring underneath us, # so we at least want to protect against duplicates continue group_score = group_to_score[group_id] result_group_ids.add(group_id) result_groups.append((group_id, group_score)) # TODO do we actually have to rebuild this SequencePaginator every time # or can we just make it after we've broken out of the loop? paginator_results = SequencePaginator( [(score, id) for (id, score) in result_groups], reverse=True, **paginator_options).get_result(limit, cursor, known_hits=hits) # break the query loop for one of three reasons: # * we started with Postgres candidates and so only do one Snuba query max # * the paginator is returning enough results to satisfy the query (>= the limit) # * there are no more groups in Snuba to post-filter if candidate_ids or len( paginator_results.results) >= limit or not more_results: break # HACK: We're using the SequencePaginator to mask the complexities of going # back and forth between two databases. This causes a problem with pagination # because we're 'lying' to the SequencePaginator (it thinks it has the entire # result set in memory when it does not). For this reason we need to make some # best guesses as to whether the `prev` and `next` cursors have more results. if len(paginator_results.results) == limit and more_results: # Because we are going back and forth between DBs there is a small # chance that we will hand the SequencePaginator exactly `limit` # items. In this case the paginator will assume there are no more # results, so we need to override the `next` cursor's results. paginator_results.next.has_results = True if cursor is not None and (not cursor.is_prev or len(paginator_results.results) > 0): # If the user passed a cursor, and it isn't already a 0 result `is_prev` # cursor, then it's worth allowing them to go back a page to check for # more results. paginator_results.prev.has_results = True metrics.timing("snuba.search.num_chunks", num_chunks) groups = Group.objects.in_bulk(paginator_results.results) paginator_results.results = [ groups[k] for k in paginator_results.results if k in groups ] return paginator_results
def inbox_search( projects: Sequence[Project], environments: Optional[Sequence[Environment]] = None, limit: int = 100, cursor: Optional[Cursor] = None, count_hits: bool = False, search_filters: Optional[Sequence[SearchFilter]] = None, date_from: Optional[datetime] = None, date_to: Optional[datetime] = None, max_hits: Optional[int] = None, ) -> CursorResult: now: datetime = timezone.now() end: Optional[datetime] = None end_params: List[datetime] = [ _f for _f in [date_to, get_search_filter(search_filters, "date", "<")] if _f ] if end_params: end = min(end_params) end = end if end else now + ALLOWED_FUTURE_DELTA # We only want to search back a week at most, since that's the oldest inbox rows # can be. earliest_date = now - timedelta(days=7) start_params = [ date_from, earliest_date, get_search_filter(search_filters, "date", ">") ] start = max([_f for _f in start_params if _f]) end = max([earliest_date, end]) if start >= end: return Paginator(Group.objects.none()).get_result() # Make sure search terms are valid invalid_search_terms = [ str(sf) for sf in search_filters if sf.key.name not in allowed_inbox_search_terms ] if invalid_search_terms: raise InvalidSearchQuery( f"Invalid search terms for 'inbox' search: {invalid_search_terms}") # Make sure this is an inbox search if not get_search_filter(search_filters, "for_review", "="): raise InvalidSearchQuery( "Sort key 'inbox' only supported for inbox search") if get_search_filter(search_filters, "status", "=") != GroupStatus.UNRESOLVED: raise InvalidSearchQuery( "Inbox search only works for 'unresolved' status") # We just filter on `GroupInbox.date_added` here, and don't filter by date # on the group. This keeps the query simpler and faster in some edge cases, # and date_added is a good enough proxy when we're using this sort. qs = GroupInbox.objects.filter( date_added__gte=start, date_added__lte=end, project__in=projects, ) if environments is not None: environment_ids: List[int] = [ environment.id for environment in environments ] qs = qs.filter(group_id__in=GroupEnvironment.objects.filter( environment_id__in=environment_ids).values_list( "group_id", flat=True).distinct()) owner_search = get_search_filter(search_filters, "assigned_or_suggested", "=") if owner_search: qs = qs.filter( assigned_or_suggested_filter(owner_search, projects, field_filter="group_id")) paginator = DateTimePaginator(qs.order_by("date_added"), "-date_added") results = paginator.get_result(limit, cursor, count_hits=count_hits, max_hits=max_hits) # We want to return groups from the endpoint, but have the cursor be related to the # GroupInbox rows. So we paginate on the GroupInbox results queryset, then fetch # the group_ids out and use them to get the actual groups. group_qs = Group.objects.filter( id__in=[r.group_id for r in results.results], project__in=projects, status=GroupStatus.UNRESOLVED, ) groups: Mapping[int, Group] = {g.id: g for g in group_qs} results.results = [ groups[r.group_id] for r in results.results if r.group_id in groups ] return results