def _query(self, projects, retention_window_start, group_queryset, environments,
           sort_by, limit, cursor, count_hits, paginator_options,
           search_filters, date_from, date_to):

    # TODO: It's possible `first_release` could be handled by Snuba.
    if environments is not None:
        environment_ids = [environment.id for environment in environments]
        group_queryset = group_queryset.filter(
            groupenvironment__environment_id__in=environment_ids
        )
        group_queryset = QuerySetBuilder({
            'first_release': QCallbackCondition(
                lambda version: Q(
                    groupenvironment__first_release__organization_id=projects[0].organization_id,
                    groupenvironment__first_release__version=version,
                    groupenvironment__environment_id__in=environment_ids,
                )
            ),
            'first_seen': ScalarCondition(
                'groupenvironment__first_seen',
                {'groupenvironment__environment_id__in': environment_ids}
            ),
        }).build(group_queryset, search_filters)
    else:
        group_queryset = QuerySetBuilder({
            'first_release': QCallbackCondition(
                lambda version: Q(
                    first_release__organization_id=projects[0].organization_id,
                    first_release__version=version,
                ),
            ),
            'first_seen': ScalarCondition('first_seen'),
        }).build(group_queryset, search_filters)

    now = timezone.now()
    end = None
    end_params = filter(
        None,
        [date_to, get_search_filter(search_filters, 'date', '<')],
    )
    if end_params:
        end = min(end_params)

    if not end:
        end = now + ALLOWED_FUTURE_DELTA

        # This search is for some time window that ends with "now",
        # so if the requested sort is `date` (`last_seen`) and there
        # are no other Snuba-based search predicates, we can simply
        # return the results from Postgres.
        if (
            cursor is None and
            sort_by == 'date' and
            not environments and
            # This handles tags and date parameters for search filters.
            not [
                sf for sf in search_filters
                if sf.key.name not in issue_only_fields.union(['date', 'message'])
            ]
        ):
            group_queryset = group_queryset.order_by('-last_seen')
            paginator = DateTimePaginator(group_queryset, '-last_seen', **paginator_options)
            # When it's a simple django-only search, we count_hits like normal
            return paginator.get_result(limit, cursor, count_hits=count_hits)

    # TODO: Presumably we only want to search back to the project's max
    # retention date, which may be closer than 90 days in the past, but
    # apparently `retention_window_start` can be None(?), so we need a
    # fallback.
    retention_date = max(
        filter(None, [
            retention_window_start,
            now - timedelta(days=90),
        ])
    )

    # TODO: We should try and consolidate all this logic together a little
    # better, maybe outside the backend. Should be easier once we're on
    # just the new search filters.
    start_params = [
        date_from,
        retention_date,
        get_search_filter(search_filters, 'date', '>'),
    ]
    start = max(filter(None, start_params))
    end = max([retention_date, end])

    if start == retention_date and end == retention_date:
        # Both `start` and `end` must have been trimmed to `retention_date`,
        # so this entire search was against a time range that is outside of
        # retention. We'll return empty results to maintain backwards
        # compatibility with Django search (for now).
        return EMPTY_RESULT

    if start >= end:
        # TODO: This maintains backwards compatibility with Django search, but
        # in the future we should find a way to notify the user that their
        # search is invalid.
        return EMPTY_RESULT

    # Here we check if all the django filters reduce the set of groups down
    # to something that we can send down to Snuba in a `group_id IN (...)`
    # clause.
    max_candidates = options.get('snuba.search.max-pre-snuba-candidates')
    too_many_candidates = False
    candidate_ids = list(
        group_queryset.values_list('id', flat=True)[:max_candidates + 1]
    )
    metrics.timing('snuba.search.num_candidates', len(candidate_ids))
    if not candidate_ids:
        # no matches could possibly be found from this point on
        metrics.incr('snuba.search.no_candidates', skip_internal=False)
        return EMPTY_RESULT
    elif len(candidate_ids) > max_candidates:
        # If the pre-filter query didn't include anything to significantly
        # filter down the number of results (from 'first_release', 'query',
        # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
        # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
        # might have surpassed the `max_candidates`. In this case,
        # we *don't* want to pass candidates down to Snuba, and instead we
        # want Snuba to do all the filtering/sorting it can and *then* apply
        # this queryset to the results from Snuba, which we call
        # post-filtering.
        metrics.incr('snuba.search.too_many_candidates', skip_internal=False)
        too_many_candidates = True
        candidate_ids = []

    sort_field = sort_strategies[sort_by]
    chunk_growth = options.get('snuba.search.chunk-growth-rate')
    max_chunk_size = options.get('snuba.search.max-chunk-size')
    chunk_limit = limit
    offset = 0
    num_chunks = 0
    hits = None

    paginator_results = EMPTY_RESULT
    result_groups = []
    result_group_ids = set()

    max_time = options.get('snuba.search.max-total-chunk-time-seconds')
    time_start = time.time()

    if count_hits and (too_many_candidates or cursor is not None):
        # If we had too many candidates to reasonably pass down to snuba,
        # or if we have a cursor that bisects the overall result set (such
        # that our query only sees results on one side of the cursor) then
        # we need an alternative way to figure out the total hits that this
        # query has.

        # To do this, we get a sample of groups matching the snuba side of
        # the query, and see how many of those pass the post-filter in
        # postgres. This should give us an estimate of the total number of
        # snuba matches that will be overall matches, which we can use to
        # get an estimate for X-Hits.

        # The sampling is not simple random sampling. It will return *all*
        # matching groups if there are fewer than N groups matching the
        # query, or it will return a random, deterministic subset of N of
        # the groups if there are more than N overall matches. This means
        # that the "estimate" is actually an accurate result when there are
        # fewer than N matching groups.

        # The number of samples required to achieve a certain error bound
        # with a certain confidence interval can be calculated from a
        # rearrangement of the normal approximation (Wald) confidence
        # interval formula:
        #
        # https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval
        #
        # Effectively if we want the estimate to be within +/- 10% of the
        # real value with 95% confidence, we would need (1.96^2 * p*(1-p))
        # / 0.1^2 samples. With a starting assumption of p=0.5 (this
        # requires the most samples) we would need 96 samples to achieve
        # +/-10% @ 95% confidence.
        sample_size = options.get('snuba.search.hits-sample-size')
        snuba_groups, snuba_total = snuba_search(
            start=start,
            end=end,
            project_ids=[p.id for p in projects],
            environment_ids=environments and [environment.id for environment in environments],
            sort_field=sort_field,
            limit=sample_size,
            offset=0,
            get_sample=True,
            search_filters=search_filters,
        )
        snuba_count = len(snuba_groups)
        if snuba_count == 0:
            return EMPTY_RESULT
        else:
            filtered_count = group_queryset.filter(
                id__in=[gid for gid, _ in snuba_groups]
            ).count()
            hit_ratio = filtered_count / float(snuba_count)
            hits = int(hit_ratio * snuba_total)

    # Do smaller searches in chunks until we have enough results
    # to answer the query (or hit the end of possible results). We do
    # this because a common case for search is to return 100 groups
    # sorted by `last_seen`, and we want to avoid returning all of
    # a project's groups and then post-sorting them all in Postgres
    # when typically the first N results will do.
    while (time.time() - time_start) < max_time:
        num_chunks += 1

        # grow the chunk size on each iteration to account for huge projects
        # and weird queries, up to a max size
        chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
        # but if we have candidate_ids always query for at least that many items
        chunk_limit = max(chunk_limit, len(candidate_ids))

        # {group_id: group_score, ...}
        snuba_groups, total = snuba_search(
            start=start,
            end=end,
            project_ids=[p.id for p in projects],
            environment_ids=environments and [environment.id for environment in environments],
            sort_field=sort_field,
            cursor=cursor,
            candidate_ids=candidate_ids,
            limit=chunk_limit,
            offset=offset,
            search_filters=search_filters,
        )
        metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))
        count = len(snuba_groups)
        more_results = count >= limit and (offset + limit) < total
        offset += len(snuba_groups)

        if not snuba_groups:
            break

        if candidate_ids:
            # pre-filtered candidates were passed down to Snuba, so we're
            # finished with filtering and these are the only results. Note
            # that because we set the chunk size to at least the size of
            # the candidate_ids, we know we got all of them (i.e. there are
            # no more chunks after the first)
            result_groups = snuba_groups
            if count_hits and hits is None:
                hits = len(snuba_groups)
        else:
            # pre-filtered candidates were *not* passed down to Snuba,
            # so we need to do post-filtering to verify Sentry DB predicates
            filtered_group_ids = group_queryset.filter(
                id__in=[gid for gid, _ in snuba_groups]
            ).values_list('id', flat=True)

            group_to_score = dict(snuba_groups)
            for group_id in filtered_group_ids:
                if group_id in result_group_ids:
                    # because we're doing multiple Snuba queries, which
                    # happen outside of a transaction, there is a small
                    # possibility of groups moving around in the sort scoring
                    # underneath us, so we at least want to protect against
                    # duplicates
                    continue

                group_score = group_to_score[group_id]
                result_group_ids.add(group_id)
                result_groups.append((group_id, group_score))

        # TODO do we actually have to rebuild this SequencePaginator every time
        # or can we just make it after we've broken out of the loop?
        paginator_results = SequencePaginator(
            [(score, id) for (id, score) in result_groups],
            reverse=True,
            **paginator_options
        ).get_result(limit, cursor, known_hits=hits)

        # break the query loop for one of three reasons:
        # * we started with Postgres candidates and so only do one Snuba query max
        # * the paginator is returning enough results to satisfy the query (>= the limit)
        # * there are no more groups in Snuba to post-filter
        if candidate_ids \
                or len(paginator_results.results) >= limit \
                or not more_results:
            break

    # HACK: We're using the SequencePaginator to mask the complexities of going
    # back and forth between two databases. This causes a problem with pagination
    # because we're 'lying' to the SequencePaginator (it thinks it has the entire
    # result set in memory when it does not). For this reason we need to make some
    # best guesses as to whether the `prev` and `next` cursors have more results.
    if len(paginator_results.results) == limit and more_results:
        # Because we are going back and forth between DBs there is a small
        # chance that we will hand the SequencePaginator exactly `limit`
        # items. In this case the paginator will assume there are no more
        # results, so we need to override the `next` cursor's results.
        paginator_results.next.has_results = True

    if cursor is not None and (not cursor.is_prev or len(paginator_results.results) > 0):
        # If the user passed a cursor, and it isn't already a 0 result `is_prev`
        # cursor, then it's worth allowing them to go back a page to check for
        # more results.
        paginator_results.prev.has_results = True

    metrics.timing('snuba.search.num_chunks', num_chunks)

    groups = Group.objects.in_bulk(paginator_results.results)
    paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

    return paginator_results
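# --- Illustration (not part of the backend code) ---------------------------
# A minimal sketch of the hits-estimation math described in the comments
# above. Both helper names are hypothetical; the backend inlines this logic
# rather than calling functions like these.
def wald_sample_size(margin=0.1, z=1.96, p=0.5):
    # n = z^2 * p * (1 - p) / margin^2; p=0.5 is the worst case, and is the
    # starting assumption mentioned in the comment above
    return int(round((z ** 2) * p * (1 - p) / (margin ** 2)))


def estimate_hits(sample_count, post_filter_count, snuba_total):
    # the fraction of sampled Snuba groups that survive the Postgres
    # post-filter, extrapolated to the full Snuba result count
    hit_ratio = post_filter_count / float(sample_count)
    return int(hit_ratio * snuba_total)


assert wald_sample_size() == 96                # +/-10% at 95% confidence, p=0.5
assert estimate_hits(96, 48, 10000) == 5000    # half the sample passed the post-filter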
def _query(self, project, retention_window_start, group_queryset, tags,
           environment, sort_by, limit, cursor, count_hits,
           paginator_options, **parameters):

    # TODO: Product decision: we currently search Group.message to handle
    # the `query` parameter, because that's what we've always done. We could
    # do that search against every event in Snuba instead, but results may
    # differ.

    now = timezone.now()
    end = parameters.get('date_to') or (now + ALLOWED_FUTURE_DELTA)
    # TODO: Presumably we want to search back to the project's full retention,
    # which may be further than 90 days in the past, but apparently
    # `retention_window_start` can be None?
    start = max(
        filter(None, [
            retention_window_start,
            parameters.get('date_from'),
            now - timedelta(days=90),
        ])
    )
    assert start < end

    # TODO: It's possible `first_release` could be handled by Snuba.
    if environment is not None:
        group_queryset = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(
                lambda queryset, version: queryset.extra(
                    where=[
                        '{} = {}'.format(
                            ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                            ds.get_sql_column(Release, 'id'),
                        ),
                        '{} = %s'.format(
                            ds.get_sql_column(Release, 'organization'),
                        ),
                        '{} = %s'.format(
                            ds.get_sql_column(Release, 'version'),
                        ),
                    ],
                    params=[project.organization_id, version],
                    tables=[Release._meta.db_table],
                ),
            ),
        }).build(
            group_queryset.extra(
                where=[
                    '{} = {}'.format(
                        ds.get_sql_column(Group, 'id'),
                        ds.get_sql_column(GroupEnvironment, 'group_id'),
                    ),
                    '{} = %s'.format(
                        ds.get_sql_column(GroupEnvironment, 'environment_id'),
                    ),
                ],
                params=[environment.id],
                tables=[GroupEnvironment._meta.db_table],
            ),
            parameters,
        )
    else:
        group_queryset = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(
                lambda queryset, version: queryset.filter(
                    first_release__organization_id=project.organization_id,
                    first_release__version=version,
                ),
            ),
        }).build(
            group_queryset,
            parameters,
        )

    # TODO: If the query didn't include anything to significantly filter
    # down the number of groups at this point ('first_release', 'query',
    # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
    # 'subscribed_by', 'active_at_from', or 'active_at_to') then this
    # queryset might return a *huge* number of groups. In this case, we
    # probably *don't* want to pass candidates down to Snuba, and rather we
    # want Snuba to do all the filtering/sorting it can and *then* apply
    # this queryset to the results from Snuba.
    #
    # However, if this did filter down the number of groups significantly,
    # then passing in candidates is, of course, valuable.
    #
    # Should we decide which way to handle it based on the number of
    # group_ids, the number of hashes? Or should we just always start the
    # query with Snuba? Something else?
    candidate_group_ids = list(group_queryset.values_list('id', flat=True))

    sort, extra_aggregations, calculate_cursor_for_group = sort_strategies[sort_by]
    group_data = do_search(
        project_id=project.id,
        environment_id=environment and environment.id,
        tags=tags,
        start=start,
        end=end,
        sort=sort,
        extra_aggregations=extra_aggregations,
        candidates=candidate_group_ids,
        **parameters
    )

    group_to_score = {}
    for group_id, data in group_data.items():
        group_to_score[group_id] = calculate_cursor_for_group(data)

    paginator_results = SequencePaginator(
        [(score, id) for (id, score) in group_to_score.items()],
        reverse=True,
        **paginator_options
    ).get_result(limit, cursor, count_hits=count_hits)

    groups = Group.objects.in_bulk(paginator_results.results)
    paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

    return paginator_results
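# --- Illustration (not part of the backend code) ---------------------------
# The version above unpacks `sort_strategies[sort_by]` into a Snuba sort
# expression, extra aggregations, and a callable that turns the aggregated
# row for a group into a cursor score for the SequencePaginator. A
# hypothetical entry might look like this; the real table lives elsewhere in
# the module and its exact expressions are assumptions here.
sort_strategies_sketch = {
    'date': (
        'last_seen',  # Snuba sort expression
        [],           # no extra aggregations required for this sort
        # millisecond-precision cursor score derived from the row data
        lambda data: int(data['last_seen'].timestamp() * 1000),
    ),
}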
def _query(self, projects, retention_window_start, group_queryset, tags,
           environments, sort_by, limit, cursor, count_hits,
           paginator_options, **parameters):
    from sentry.models import (Group, Environment, Event, GroupEnvironment, Release)

    # this backend only supports search within one project/environment
    if len(projects) != 1 or (environments is not None and len(environments) > 1):
        raise NotImplementedError

    project = projects[0]
    environment = environments[0] if environments is not None else environments

    if environment is not None:
        if 'environment' in tags:
            environment_name = tags.pop('environment')
            assert environment_name is ANY or Environment.objects.get(
                projects=project,
                name=environment_name,
            ).id == environment.id

        event_queryset_builder = QuerySetBuilder({
            'date_from': ScalarCondition('date_added', 'gt'),
            'date_to': ScalarCondition('date_added', 'lt'),
        })
        if any(key in parameters for key in event_queryset_builder.conditions.keys()):
            event_queryset = event_queryset_builder.build(
                tagstore.get_event_tag_qs(
                    project_id=project.id,
                    environment_id=environment.id,
                    key='environment',
                    value=environment.name,
                ),
                parameters,
            )
            if retention_window_start is not None:
                event_queryset = event_queryset.filter(date_added__gte=retention_window_start)

            group_queryset = group_queryset.filter(
                id__in=list(event_queryset.distinct().values_list('group_id', flat=True)[:1000])
            )

        _, group_queryset_sort_clause = sort_strategies[sort_by]
        group_queryset = QuerySetBuilder({
            'first_release': CallbackCondition(
                lambda queryset, version: queryset.extra(
                    where=[
                        '{} = {}'.format(
                            get_sql_column(GroupEnvironment, 'first_release_id'),
                            get_sql_column(Release, 'id'),
                        ),
                        '{} = %s'.format(
                            get_sql_column(Release, 'organization'),
                        ),
                        '{} = %s'.format(
                            get_sql_column(Release, 'version'),
                        ),
                    ],
                    params=[project.organization_id, version],
                    tables=[Release._meta.db_table],
                ),
            ),
            'times_seen': CallbackCondition(
                # This condition represents the exact number of times that
                # an issue has been seen in an environment. Since an issue
                # can't be seen in an environment more times than the issue
                # was seen overall, we can safely exclude any groups that
                # don't have at least that many events.
                lambda queryset, times_seen: queryset.exclude(
                    times_seen__lt=times_seen,
                ),
            ),
            'times_seen_lower': CallbackCondition(
                # This condition represents the lower threshold for the
                # number of times an issue has been seen in an environment.
                # Since an issue can't be seen in an environment more times
                # than the issue was seen overall, we can safely exclude
                # any groups that haven't met that threshold.
                lambda queryset, times_seen: queryset.exclude(
                    times_seen__lt=times_seen,
                ),
            ),
            # The following conditions make a few assertions that are
            # correct in an abstract sense but may not accurately reflect
            # the existing implementation (see GH-5289). These assumptions
            # are that 1. The first seen time for a Group is the minimum
            # value of the first seen time for all of its GroupEnvironment
            # relations; 2. The last seen time for a Group is the maximum
            # value of the last seen time for all of its GroupEnvironment
            # relations; 3. The first seen time is always less than or
            # equal to the last seen time.
            'age_from': CallbackCondition(
                # This condition represents the lower threshold for "first
                # seen" time for an environment. Due to assertions #1 and
                # #3, we can exclude any groups where the "last seen" time
                # is prior to this timestamp.
                lambda queryset, first_seen: queryset.exclude(
                    last_seen__lt=first_seen,
                ),
            ),
            'age_to': CallbackCondition(
                # This condition represents the upper threshold for "first
                # seen" time for an environment. Due to assertion #1, we
                # can exclude any values where the group first seen is
                # greater than that threshold.
                lambda queryset, first_seen: queryset.exclude(
                    first_seen__gt=first_seen,
                ),
            ),
            'last_seen_from': CallbackCondition(
                # This condition represents the lower threshold for "last
                # seen" time for an environment. Due to assertion #2, we
                # can exclude any values where the group last seen value is
                # less than that threshold.
                lambda queryset, last_seen: queryset.exclude(
                    last_seen__lt=last_seen,
                ),
            ),
            'last_seen_to': CallbackCondition(
                # This condition represents the upper threshold for "last
                # seen" time for an environment. Due to assertions #2 and
                # #3, we can exclude any values where the group first seen
                # value is greater than that threshold.
                lambda queryset, last_seen: queryset.exclude(
                    first_seen__gt=last_seen,
                ),
            ),
        }).build(
            group_queryset.extra(
                where=[
                    '{} = {}'.format(
                        get_sql_column(Group, 'id'),
                        get_sql_column(GroupEnvironment, 'group_id'),
                    ),
                    '{} = %s'.format(
                        get_sql_column(GroupEnvironment, 'environment_id'),
                    ),
                ],
                params=[environment.id],
                tables=[GroupEnvironment._meta.db_table],
            ),
            parameters,
        ).order_by(group_queryset_sort_clause)

        get_sort_expression, sort_value_to_cursor_value = environment_sort_strategies[sort_by]

        group_tag_value_queryset = tagstore.get_group_tag_value_qs(
            project_id=project.id,
            group_id=set(group_queryset.values_list('id', flat=True)[:10000]),
            environment_id=environment.id,
            key='environment',
            value=environment.name,
        )

        if retention_window_start is not None:
            group_tag_value_queryset = group_tag_value_queryset.filter(
                last_seen__gte=retention_window_start
            )

        candidates = dict(
            QuerySetBuilder({
                'age_from': ScalarCondition('first_seen', 'gt'),
                'age_to': ScalarCondition('first_seen', 'lt'),
                'last_seen_from': ScalarCondition('last_seen', 'gt'),
                'last_seen_to': ScalarCondition('last_seen', 'lt'),
                'times_seen': CallbackCondition(
                    lambda queryset, times_seen: queryset.filter(times_seen=times_seen),
                ),
                'times_seen_lower': ScalarCondition('times_seen', 'gt'),
                'times_seen_upper': ScalarCondition('times_seen', 'lt'),
            }).build(
                group_tag_value_queryset,
                parameters,
            ).extra(
                select={
                    'sort_value': get_sort_expression(group_tag_value_queryset.model),
                },
            ).values_list('group_id', 'sort_value')
        )

        if tags:
            # TODO: `get_group_ids_for_search_filter` should be able to
            # utilize the retention window start parameter for additional
            # optimizations.
            matches = tagstore.get_group_ids_for_search_filter(
                project_id=project.id,
                environment_id=environment.id,
                tags=tags,
                candidates=candidates.keys(),
                limit=len(candidates),
            )
            for key in set(candidates) - set(matches or []):
                del candidates[key]

        result = SequencePaginator(
            [(sort_value_to_cursor_value(score), id) for (id, score) in candidates.items()],
            reverse=True,
            **paginator_options
        ).get_result(limit, cursor, count_hits=count_hits)

        groups = Group.objects.in_bulk(result.results)
        result.results = [groups[k] for k in result.results if k in groups]

        return result
    else:
        event_queryset_builder = QuerySetBuilder({
            'date_from': ScalarCondition('datetime', 'gt'),
            'date_to': ScalarCondition('datetime', 'lt'),
        })
        if any(key in parameters for key in event_queryset_builder.conditions.keys()):
            group_queryset = group_queryset.filter(
                id__in=list(
                    event_queryset_builder.build(
                        Event.objects.filter(project_id=project.id),
                        parameters,
                    ).distinct().values_list('group_id', flat=True)[:1000],
                )
            )

        group_queryset = QuerySetBuilder({
            'first_release': CallbackCondition(
                lambda queryset, version: queryset.filter(
                    first_release__organization_id=project.organization_id,
                    first_release__version=version,
                ),
            ),
            'age_from': ScalarCondition('first_seen', 'gt'),
            'age_to': ScalarCondition('first_seen', 'lt'),
            'last_seen_from': ScalarCondition('last_seen', 'gt'),
            'last_seen_to': ScalarCondition('last_seen', 'lt'),
            'times_seen': CallbackCondition(
                lambda queryset, times_seen: queryset.filter(times_seen=times_seen),
            ),
            'times_seen_lower': ScalarCondition('times_seen', 'gt'),
            'times_seen_upper': ScalarCondition('times_seen', 'lt'),
        }).build(
            group_queryset,
            parameters,
        ).extra(
            select={
                'sort_value': get_sort_clause(sort_by),
            },
        )

        if tags:
            group_ids = tagstore.get_group_ids_for_search_filter(
                project_id=project.id,
                environment_id=None,
                tags=tags,
                candidates=None,
            )
            if group_ids:
                group_queryset = group_queryset.filter(id__in=group_ids)
            else:
                group_queryset = group_queryset.none()

        paginator_cls, sort_clause = sort_strategies[sort_by]
        group_queryset = group_queryset.order_by(sort_clause)
        paginator = paginator_cls(group_queryset, sort_clause, **paginator_options)
        return paginator.get_result(limit, cursor, count_hits=count_hits)
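# --- Illustration (not part of the backend code) ---------------------------
# A minimal sketch of the builder pattern these backends lean on, assuming
# only the interface implied by the call sites above: `build` applies each
# condition whose key is present in `parameters`. The "Sketch" classes are
# hypothetical stand-ins, not the actual sentry.search.django implementation.
class ScalarConditionSketch(object):
    def __init__(self, field, operator):
        self.field = field
        self.operator = operator  # 'gt' or 'lt'

    def apply(self, queryset, value):
        # e.g. field='first_seen', operator='gt' -> .filter(first_seen__gt=value)
        return queryset.filter(**{'{}__{}'.format(self.field, self.operator): value})


class CallbackConditionSketch(object):
    def __init__(self, callback):
        self.callback = callback

    def apply(self, queryset, value):
        return self.callback(queryset, value)


class QuerySetBuilderSketch(object):
    def __init__(self, conditions):
        self.conditions = conditions

    def build(self, queryset, parameters):
        # apply only the conditions the caller actually parameterized
        for name, condition in self.conditions.items():
            if name in parameters:
                queryset = condition.apply(queryset, parameters[name])
        return queryset


# Usage mirrors the call sites above:
# qs = QuerySetBuilderSketch({
#     'age_from': ScalarConditionSketch('first_seen', 'gt'),
# }).build(Group.objects.all(), {'age_from': some_datetime})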
def query(
    self,
    projects,
    retention_window_start,
    group_queryset,
    environments,
    sort_by,
    limit,
    cursor,
    count_hits,
    paginator_options,
    search_filters,
    date_from,
    date_to,
):
    now = timezone.now()
    end = None
    end_params = [
        _f for _f in [date_to, get_search_filter(search_filters, "date", "<")] if _f
    ]
    if end_params:
        end = min(end_params)

    if not end:
        end = now + ALLOWED_FUTURE_DELTA

        # This search is for some time window that ends with "now",
        # so if the requested sort is `date` (`last_seen`) and there
        # are no other Snuba-based search predicates, we can simply
        # return the results from Postgres.
        if (
            cursor is None
            and sort_by == "date"
            and not environments
            # This handles tags and date parameters for search filters.
            and not [
                sf
                for sf in search_filters
                if sf.key.name not in self.postgres_only_fields.union(["date"])
            ]
        ):
            group_queryset = group_queryset.order_by("-last_seen")
            paginator = DateTimePaginator(group_queryset, "-last_seen", **paginator_options)
            # When it's a simple django-only search, we count_hits like normal
            return paginator.get_result(limit, cursor, count_hits=count_hits)

    # TODO: Presumably we only want to search back to the project's max
    # retention date, which may be closer than 90 days in the past, but
    # apparently `retention_window_start` can be None(?), so we need a
    # fallback.
    retention_date = max(
        [_f for _f in [retention_window_start, now - timedelta(days=90)] if _f]
    )
    start_params = [date_from, retention_date, get_search_filter(search_filters, "date", ">")]
    start = max([_f for _f in start_params if _f])
    end = max([retention_date, end])

    if start == retention_date and end == retention_date:
        # Both `start` and `end` must have been trimmed to `retention_date`,
        # so this entire search was against a time range that is outside of
        # retention. We'll return empty results to maintain backwards
        # compatibility with Django search (for now).
        return self.empty_result

    if start >= end:
        # TODO: This maintains backwards compatibility with Django search, but
        # in the future we should find a way to notify the user that their
        # search is invalid.
        return self.empty_result

    # Here we check if all the django filters reduce the set of groups down
    # to something that we can send down to Snuba in a `group_id IN (...)`
    # clause.
    max_candidates = options.get("snuba.search.max-pre-snuba-candidates")

    with sentry_sdk.start_span(op="snuba_group_query") as span:
        group_ids = list(
            group_queryset.values_list("id", flat=True)[:max_candidates + 1]
        )
        span.set_data("Max Candidates", max_candidates)
        span.set_data("Result Size", len(group_ids))
    metrics.timing("snuba.search.num_candidates", len(group_ids))

    too_many_candidates = False
    if not group_ids:
        # no matches could possibly be found from this point on
        metrics.incr("snuba.search.no_candidates", skip_internal=False)
        return self.empty_result
    elif len(group_ids) > max_candidates:
        # If the pre-filter query didn't include anything to significantly
        # filter down the number of results (from 'first_release', 'query',
        # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
        # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
        # might have surpassed the `max_candidates`. In this case,
        # we *don't* want to pass candidates down to Snuba, and instead we
        # want Snuba to do all the filtering/sorting it can and *then* apply
        # this queryset to the results from Snuba, which we call
        # post-filtering.
        metrics.incr("snuba.search.too_many_candidates", skip_internal=False)
        too_many_candidates = True
        group_ids = []

    sort_field = self.sort_strategies[sort_by]
    chunk_growth = options.get("snuba.search.chunk-growth-rate")
    max_chunk_size = options.get("snuba.search.max-chunk-size")
    chunk_limit = limit
    offset = 0
    num_chunks = 0
    hits = self.calculate_hits(
        group_ids,
        too_many_candidates,
        sort_field,
        projects,
        retention_window_start,
        group_queryset,
        environments,
        sort_by,
        limit,
        cursor,
        count_hits,
        paginator_options,
        search_filters,
        start,
        end,
    )
    if count_hits and hits == 0:
        return self.empty_result

    paginator_results = self.empty_result
    result_groups = []
    result_group_ids = set()

    max_time = options.get("snuba.search.max-total-chunk-time-seconds")
    time_start = time.time()

    # Do smaller searches in chunks until we have enough results
    # to answer the query (or hit the end of possible results). We do
    # this because a common case for search is to return 100 groups
    # sorted by `last_seen`, and we want to avoid returning all of
    # a project's groups and then post-sorting them all in Postgres
    # when typically the first N results will do.
    while (time.time() - time_start) < max_time:
        num_chunks += 1

        # grow the chunk size on each iteration to account for huge projects
        # and weird queries, up to a max size
        chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
        # but if we have group_ids always query for at least that many items
        chunk_limit = max(chunk_limit, len(group_ids))

        # {group_id: group_score, ...}
        snuba_groups, total = self.snuba_search(
            start=start,
            end=end,
            project_ids=[p.id for p in projects],
            environment_ids=environments and [environment.id for environment in environments],
            sort_field=sort_field,
            cursor=cursor,
            group_ids=group_ids,
            limit=chunk_limit,
            offset=offset,
            search_filters=search_filters,
        )
        metrics.timing("snuba.search.num_snuba_results", len(snuba_groups))
        count = len(snuba_groups)
        more_results = count >= limit and (offset + limit) < total
        offset += len(snuba_groups)

        if not snuba_groups:
            break

        if group_ids:
            # pre-filtered candidates were passed down to Snuba, so we're
            # finished with filtering and these are the only results. Note
            # that because we set the chunk size to at least the size of
            # the group_ids, we know we got all of them (i.e. there are
            # no more chunks after the first)
            result_groups = snuba_groups
            if count_hits and hits is None:
                hits = len(snuba_groups)
        else:
            # pre-filtered candidates were *not* passed down to Snuba,
            # so we need to do post-filtering to verify Sentry DB predicates
            filtered_group_ids = group_queryset.filter(
                id__in=[gid for gid, _ in snuba_groups]
            ).values_list("id", flat=True)

            group_to_score = dict(snuba_groups)
            for group_id in filtered_group_ids:
                if group_id in result_group_ids:
                    # because we're doing multiple Snuba queries, which
                    # happen outside of a transaction, there is a small
                    # possibility of groups moving around in the sort scoring
                    # underneath us, so we at least want to protect against
                    # duplicates
                    continue

                group_score = group_to_score[group_id]
                result_group_ids.add(group_id)
                result_groups.append((group_id, group_score))

        # break the query loop for one of three reasons:
        # * we started with Postgres candidates and so only do one Snuba query max
        # * the paginator is returning enough results to satisfy the query (>= the limit)
        # * there are no more groups in Snuba to post-filter
        # TODO do we actually have to rebuild this SequencePaginator every time
        # or can we just make it after we've broken out of the loop?
        paginator_results = SequencePaginator(
            [(score, id) for (id, score) in result_groups], reverse=True, **paginator_options
        ).get_result(limit, cursor, known_hits=hits)

        if group_ids or len(paginator_results.results) >= limit or not more_results:
            break

    # HACK: We're using the SequencePaginator to mask the complexities of going
    # back and forth between two databases. This causes a problem with pagination
    # because we're 'lying' to the SequencePaginator (it thinks it has the entire
    # result set in memory when it does not). For this reason we need to make some
    # best guesses as to whether the `prev` and `next` cursors have more results.
    if len(paginator_results.results) == limit and more_results:
        # Because we are going back and forth between DBs there is a small
        # chance that we will hand the SequencePaginator exactly `limit`
        # items. In this case the paginator will assume there are no more
        # results, so we need to override the `next` cursor's results.
        paginator_results.next.has_results = True

    if cursor is not None and (not cursor.is_prev or len(paginator_results.results) > 0):
        # If the user passed a cursor, and it isn't already a 0 result `is_prev`
        # cursor, then it's worth allowing them to go back a page to check for
        # more results.
        paginator_results.prev.has_results = True

    metrics.timing("snuba.search.num_chunks", num_chunks)

    groups = Group.objects.in_bulk(paginator_results.results)
    paginator_results.results = [
        groups[k] for k in paginator_results.results if k in groups
    ]

    return paginator_results
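# --- Illustration (not part of the backend code) ---------------------------
# How the chunked Snuba querying above grows its page size. The defaults
# chunk_growth=1.5 and max_chunk_size=2000 are assumptions for this example;
# the real values come from the `snuba.search.*` options. A query for
# limit=100 fetches 150, 225, 337, ... rows per iteration until enough
# post-filtered results accumulate or the time budget runs out.
def chunk_sizes(limit, chunk_growth=1.5, max_chunk_size=2000, iterations=5):
    chunk_limit = limit
    sizes = []
    for _ in range(iterations):
        # same growth-and-clamp step as the loop above
        chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
        sizes.append(chunk_limit)
    return sizes


assert chunk_sizes(100) == [150, 225, 337, 505, 757]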
def _query(self, projects, retention_window_start, group_queryset, tags,
           environments, sort_by, limit, cursor, count_hits,
           paginator_options, **parameters):

    # TODO: Product decision: we currently search Group.message to handle
    # the `query` parameter, because that's what we've always done. We could
    # do that search against every event in Snuba instead, but results may
    # differ.

    # TODO: It's possible `first_release` could be handled by Snuba.
    if environments is not None:
        group_queryset = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(
                lambda queryset, version: queryset.extra(
                    where=[
                        '{} = {}'.format(
                            ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                            ds.get_sql_column(Release, 'id'),
                        ),
                        '{} = %s'.format(
                            ds.get_sql_column(Release, 'organization'),
                        ),
                        '{} = %s'.format(
                            ds.get_sql_column(Release, 'version'),
                        ),
                    ],
                    params=[projects[0].organization_id, version],
                    tables=[Release._meta.db_table],
                ),
            ),
        }).build(
            group_queryset.extra(
                where=[
                    u'{} = {}'.format(
                        ds.get_sql_column(Group, 'id'),
                        ds.get_sql_column(GroupEnvironment, 'group_id'),
                    ),
                    u'{} IN ({})'.format(
                        ds.get_sql_column(GroupEnvironment, 'environment_id'),
                        ', '.join(['%s' for e in environments])
                    ),
                ],
                params=[environment.id for environment in environments],
                tables=[GroupEnvironment._meta.db_table],
            ),
            parameters,
        )
    else:
        group_queryset = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(
                lambda queryset, version: queryset.filter(
                    first_release__organization_id=projects[0].organization_id,
                    first_release__version=version,
                ),
            ),
        }).build(
            group_queryset,
            parameters,
        )

    now = timezone.now()
    end = parameters.get('date_to')
    if not end:
        end = now + ALLOWED_FUTURE_DELTA

        # This search is for some time window that ends with "now",
        # so if the requested sort is `date` (`last_seen`) and there
        # are no other Snuba-based search predicates, we can simply
        # return the results from Postgres.
        if cursor is None \
                and sort_by == 'date' \
                and not tags \
                and not environments \
                and not any(param in parameters for param in [
                    'age_from', 'age_to', 'last_seen_from',
                    'last_seen_to', 'times_seen', 'times_seen_lower',
                    'times_seen_upper'
                ]):
            group_queryset = group_queryset.order_by('-last_seen')
            paginator = DateTimePaginator(group_queryset, '-last_seen', **paginator_options)
            return paginator.get_result(limit, cursor, count_hits=False)

    # TODO: Presumably we only want to search back to the project's max
    # retention date, which may be closer than 90 days in the past, but
    # apparently `retention_window_start` can be None(?), so we need a
    # fallback.
    retention_date = max(
        filter(None, [
            retention_window_start,
            now - timedelta(days=90),
        ])
    )

    start = max(
        filter(None, [
            retention_date,
            parameters.get('date_from'),
        ])
    )

    end = max([retention_date, end])

    if start == retention_date and end == retention_date:
        # Both `start` and `end` must have been trimmed to `retention_date`,
        # so this entire search was against a time range that is outside of
        # retention. We'll return empty results to maintain backwards
        # compatibility with Django search (for now).
        return EMPTY_RESULT

    if start >= end:
        # TODO: This maintains backwards compatibility with Django search, but
        # in the future we should find a way to notify the user that their
        # search is invalid.
        return EMPTY_RESULT

    # num_candidates is the number of Group IDs to send down to Snuba; if
    # more Group ID candidates are found, a "bare" Snuba search is performed
    # and the result groups are then post-filtered via queries to the Sentry DB
    optimizer_enabled = options.get('snuba.search.pre-snuba-candidates-optimizer')
    if optimizer_enabled:
        missed_projects = []
        keys = [self._get_project_count_cache_key(p.id) for p in projects]

        counts_by_projects = {
            self._get_project_id_from_key(key): count
            for key, count in cache.get_many(keys).items()
        }

        missed_projects = {p.id for p in projects} - set(counts_by_projects.keys())

        if missed_projects:
            missing_counts = snuba.query(
                start=max(
                    filter(None, [
                        retention_window_start,
                        now - timedelta(days=90),
                    ])
                ),
                end=now,
                groupby=['project_id'],
                filter_keys={
                    'project_id': list(missed_projects),
                },
                aggregations=[['uniq', 'group_id', 'group_count']],
                referrer='search',
            )

            cache.set_many({
                self._get_project_count_cache_key(project_id): count
                for project_id, count in missing_counts.items()
            }, options.get('snuba.search.project-group-count-cache-time'))

            counts_by_projects.update(missing_counts)

        min_candidates = options.get('snuba.search.min-pre-snuba-candidates')
        max_candidates = options.get('snuba.search.max-pre-snuba-candidates')
        candidates_percentage = options.get('snuba.search.pre-snuba-candidates-percentage')

        num_candidates = max(
            min_candidates,
            min(
                max_candidates,
                sum(counts_by_projects.values()) * candidates_percentage
            )
        )
    else:
        num_candidates = options.get('snuba.search.min-pre-snuba-candidates')

    # pre-filter query
    candidate_ids = None
    if num_candidates and limit <= num_candidates:
        candidate_ids = list(
            group_queryset.values_list('id', flat=True)[:num_candidates + 1]
        )
        metrics.timing('snuba.search.num_candidates', len(candidate_ids))

        if not candidate_ids:
            # no matches could possibly be found from this point on
            metrics.incr('snuba.search.no_candidates', skip_internal=False)
            return EMPTY_RESULT
        elif len(candidate_ids) > num_candidates:
            # If the pre-filter query didn't include anything to significantly
            # filter down the number of results (from 'first_release', 'query',
            # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
            # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
            # might have surpassed the `num_candidates`. In this case,
            # we *don't* want to pass candidates down to Snuba, and instead we
            # want Snuba to do all the filtering/sorting it can and *then* apply
            # this queryset to the results from Snuba, which we call
            # post-filtering.
            metrics.incr('snuba.search.too_many_candidates', skip_internal=False)
            candidate_ids = None

    sort_field = sort_strategies[sort_by]
    chunk_growth = options.get('snuba.search.chunk-growth-rate')
    max_chunk_size = options.get('snuba.search.max-chunk-size')
    chunk_limit = limit
    offset = 0
    num_chunks = 0

    paginator_results = EMPTY_RESULT
    result_groups = []
    result_group_ids = set()

    max_time = options.get('snuba.search.max-total-chunk-time-seconds')
    time_start = time.time()

    # Do smaller searches in chunks until we have enough results
    # to answer the query (or hit the end of possible results). We do
    # this because a common case for search is to return 100 groups
    # sorted by `last_seen`, and we want to avoid returning all of
    # a project's groups and then post-sorting them all in Postgres
    # when typically the first N results will do.
    while (time.time() - time_start) < max_time:
        num_chunks += 1

        # grow the chunk size on each iteration to account for huge projects
        # and weird queries, up to a max size
        chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
        # but if we have candidate_ids always query for at least that many items
        chunk_limit = max(chunk_limit, len(candidate_ids) if candidate_ids else 0)

        # {group_id: group_score, ...}
        snuba_groups, more_results = snuba_search(
            start=start,
            end=end,
            project_ids=[p.id for p in projects],
            environment_ids=environments and [environment.id for environment in environments],
            tags=tags,
            sort_field=sort_field,
            cursor=cursor,
            candidate_ids=candidate_ids,
            limit=chunk_limit,
            offset=offset,
            **parameters
        )
        metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))
        offset += len(snuba_groups)

        if not snuba_groups:
            break

        if candidate_ids:
            # pre-filtered candidates were passed down to Snuba,
            # so we're finished with filtering and these are the
            # only results
            result_groups = snuba_groups
        else:
            # pre-filtered candidates were *not* passed down to Snuba,
            # so we need to do post-filtering to verify Sentry DB predicates
            filtered_group_ids = group_queryset.filter(
                id__in=[gid for gid, _ in snuba_groups]
            ).values_list('id', flat=True)

            group_to_score = dict(snuba_groups)
            for group_id in filtered_group_ids:
                if group_id in result_group_ids:
                    # because we're doing multiple Snuba queries, which
                    # happen outside of a transaction, there is a small
                    # possibility of groups moving around in the sort scoring
                    # underneath us, so we at least want to protect against
                    # duplicates
                    continue

                group_score = group_to_score[group_id]
                result_group_ids.add(group_id)
                result_groups.append((group_id, group_score))

        paginator_results = SequencePaginator(
            [(score, id) for (id, score) in result_groups],
            reverse=True,
            **paginator_options
        ).get_result(limit, cursor, count_hits=False)

        # break the query loop for one of three reasons:
        # * we started with Postgres candidates and so only do one Snuba query max
        # * the paginator is returning enough results to satisfy the query (>= the limit)
        # * there are no more groups in Snuba to post-filter
        if candidate_ids \
                or len(paginator_results.results) >= limit \
                or not more_results:
            break

    # HACK: We're using the SequencePaginator to mask the complexities of going
    # back and forth between two databases. This causes a problem with pagination
    # because we're 'lying' to the SequencePaginator (it thinks it has the entire
    # result set in memory when it does not). For this reason we need to make some
    # best guesses as to whether the `prev` and `next` cursors have more results.
    if len(paginator_results.results) == limit and more_results:
        # Because we are going back and forth between DBs there is a small
        # chance that we will hand the SequencePaginator exactly `limit`
        # items. In this case the paginator will assume there are no more
        # results, so we need to override the `next` cursor's results.
        paginator_results.next.has_results = True

    if cursor is not None and (not cursor.is_prev or len(paginator_results.results) > 0):
        # If the user passed a cursor, and it isn't already a 0 result `is_prev`
        # cursor, then it's worth allowing them to go back a page to check for
        # more results.
        paginator_results.prev.has_results = True

    metrics.timing('snuba.search.num_chunks', num_chunks)

    groups = Group.objects.in_bulk(paginator_results.results)
    paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

    return paginator_results
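# --- Illustration (not part of the backend code) ---------------------------
# The arithmetic behind the optimizer branch above: the candidate budget is
# the cached per-project group count scaled by a percentage, clamped between
# the min and max options. The option values used here are assumptions for
# the example, not the real defaults.
def candidate_budget(total_group_count, min_candidates=500,
                     max_candidates=10000, candidates_percentage=0.1):
    return max(
        min_candidates,
        min(max_candidates, total_group_count * candidates_percentage),
    )


assert candidate_budget(2000) == 500       # clamped up to the minimum
assert candidate_budget(50000) == 5000     # 10% of the group count
assert candidate_budget(500000) == 10000   # clamped down to the maximum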
def _query(self, project, retention_window_start, group_queryset, tags,
           environment, sort_by, limit, cursor, count_hits,
           paginator_options, **parameters):

    # TODO: Product decision: we currently search Group.message to handle
    # the `query` parameter, because that's what we've always done. We could
    # do that search against every event in Snuba instead, but results may
    # differ.

    now = timezone.now()
    end = parameters.get('date_to') or (now + ALLOWED_FUTURE_DELTA)
    # TODO: Presumably we want to search back to the project's full retention,
    # which may be further than 90 days in the past, but apparently
    # `retention_window_start` can be None?
    start = max(
        filter(None, [
            retention_window_start,
            parameters.get('date_from'),
            now - timedelta(days=90),
        ])
    )
    assert start < end

    # TODO: It's possible `first_release` could be handled by Snuba.
    if environment is not None:
        group_queryset = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(
                lambda queryset, version: queryset.extra(
                    where=[
                        '{} = {}'.format(
                            ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                            ds.get_sql_column(Release, 'id'),
                        ),
                        '{} = %s'.format(
                            ds.get_sql_column(Release, 'organization'),
                        ),
                        '{} = %s'.format(
                            ds.get_sql_column(Release, 'version'),
                        ),
                    ],
                    params=[project.organization_id, version],
                    tables=[Release._meta.db_table],
                ),
            ),
        }).build(
            group_queryset.extra(
                where=[
                    '{} = {}'.format(
                        ds.get_sql_column(Group, 'id'),
                        ds.get_sql_column(GroupEnvironment, 'group_id'),
                    ),
                    '{} = %s'.format(
                        ds.get_sql_column(GroupEnvironment, 'environment_id'),
                    ),
                ],
                params=[environment.id],
                tables=[GroupEnvironment._meta.db_table],
            ),
            parameters,
        )
    else:
        group_queryset = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(
                lambda queryset, version: queryset.filter(
                    first_release__organization_id=project.organization_id,
                    first_release__version=version,
                ),
            ),
        }).build(
            group_queryset,
            parameters,
        )

    # pre-filter query
    candidate_hashes = dict(
        GroupHash.objects.filter(
            group__in=group_queryset
        ).values_list(
            'hash',
            'group_id'
        )[:MAX_PRE_SNUBA_CANDIDATES + 1]
    )
    metrics.timing('snuba.search.num_candidates', len(candidate_hashes))

    if not candidate_hashes:
        # no matches could possibly be found from this point on
        metrics.incr('snuba.search.no_candidates')
        return Paginator(Group.objects.none()).get_result()
    elif len(candidate_hashes) > MAX_PRE_SNUBA_CANDIDATES:
        # If the pre-filter query didn't include anything to significantly
        # filter down the number of results (from 'first_release', 'query',
        # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
        # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
        # might have surpassed the MAX_PRE_SNUBA_CANDIDATES. In this case,
        # we *don't* want to pass candidates down to Snuba, and instead we
        # want Snuba to do all the filtering/sorting it can and *then* apply
        # this queryset to the results from Snuba, which we call
        # post-filtering.
        metrics.incr('snuba.search.too_many_candidates')
        candidate_hashes = None

    sort, extra_aggregations, score_fn = sort_strategies[sort_by]

    # {group_id: group_score, ...}
    snuba_groups = snuba_search(
        project_id=project.id,
        environment_id=environment and environment.id,
        tags=tags,
        start=start,
        end=end,
        sort=sort,
        extra_aggregations=extra_aggregations,
        score_fn=score_fn,
        candidate_hashes=candidate_hashes,
        **parameters
    )
    metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))

    if candidate_hashes:
        # pre-filtered candidates were passed down to Snuba,
        # so we're finished with filtering
        result_groups = snuba_groups.items()
    else:
        # pre-filtered candidates were *not* passed down to Snuba,
        # so we need to do post-filtering to verify Sentry DB predicates
        result_groups = []
        i = 0
        for i, chunk in enumerate(chunked(snuba_groups.items(), MAX_POST_SNUBA_CHUNK), 1):
            filtered_group_ids = group_queryset.filter(
                id__in=[gid for gid, _ in chunk]
            ).values_list('id', flat=True)

            result_groups.extend(
                (group_id, snuba_groups[group_id])
                for group_id in filtered_group_ids
            )

        metrics.timing('snuba.search.num_post_filters', i)

    paginator_results = SequencePaginator(
        [(score, id) for (id, score) in result_groups],
        reverse=True,
        **paginator_options
    ).get_result(limit, cursor, count_hits=count_hits)

    groups = Group.objects.in_bulk(paginator_results.results)
    paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

    return paginator_results
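# --- Illustration (not part of the backend code) ---------------------------
# `chunked` above comes from sentry's utilities; this is a minimal stand-in
# with the behavior the post-filtering loop relies on: fixed-size slices,
# with the last slice possibly shorter.
def chunked_sketch(iterable, size):
    chunk = []
    for item in iterable:
        chunk.append(item)
        if len(chunk) == size:
            yield chunk
            chunk = []
    if chunk:
        # emit the final, possibly short, chunk
        yield chunk


assert list(chunked_sketch(range(5), 2)) == [[0, 1], [2, 3], [4]]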
    def _query(self, project, retention_window_start, group_queryset, tags,
               environment, sort_by, limit, cursor, count_hits,
               paginator_options, **parameters):
        from sentry.models import (Group, Environment, Event, GroupEnvironment, Release)

        if environment is not None:
            if 'environment' in tags:
                environment_name = tags.pop('environment')
                assert environment_name is ANY or Environment.objects.get(
                    projects=project,
                    name=environment_name,
                ).id == environment.id

            event_queryset_builder = QuerySetBuilder({
                'date_from': ScalarCondition('date_added', 'gt'),
                'date_to': ScalarCondition('date_added', 'lt'),
            })
            if any(key in parameters for key in event_queryset_builder.conditions.keys()):
                event_queryset = event_queryset_builder.build(
                    tagstore.get_event_tag_qs(
                        project_id=project.id,
                        environment_id=environment.id,
                        key='environment',
                        value=environment.name,
                    ),
                    parameters,
                )
                if retention_window_start is not None:
                    event_queryset = event_queryset.filter(date_added__gte=retention_window_start)

                group_queryset = group_queryset.filter(
                    id__in=list(event_queryset.distinct().values_list('group_id', flat=True)[:1000]))

            _, group_queryset_sort_clause = sort_strategies[sort_by]
            group_queryset = QuerySetBuilder({
                'first_release': CallbackCondition(
                    lambda queryset, version: queryset.extra(
                        where=[
                            '{} = {}'.format(
                                get_sql_column(GroupEnvironment, 'first_release_id'),
                                get_sql_column(Release, 'id'),
                            ),
                            '{} = %s'.format(
                                get_sql_column(Release, 'organization'),
                            ),
                            '{} = %s'.format(
                                get_sql_column(Release, 'version'),
                            ),
                        ],
                        params=[project.organization_id, version],
                        tables=[Release._meta.db_table],
                    ),
                ),
                'times_seen': CallbackCondition(
                    # This condition represents the exact number of times that
                    # an issue has been seen in an environment. Since an issue
                    # can't be seen in an environment more times than the issue
                    # was seen overall, we can safely exclude any groups that
                    # don't have at least that many events.
                    lambda queryset, times_seen: queryset.exclude(
                        times_seen__lt=times_seen,
                    ),
                ),
                'times_seen_lower': CallbackCondition(
                    # This condition represents the lower threshold for the
                    # number of times an issue has been seen in an environment.
                    # Since an issue can't be seen in an environment more times
                    # than the issue was seen overall, we can safely exclude
                    # any groups that haven't met that threshold.
                    lambda queryset, times_seen: queryset.exclude(
                        times_seen__lt=times_seen,
                    ),
                ),
                # The following conditions make a few assertions that are
                # correct in an abstract sense but may not accurately reflect
                # the existing implementation (see GH-5289). These assumptions
                # are that 1. The first seen time for a Group is the minimum
                # value of the first seen time for all of its GroupEnvironment
                # relations; 2. The last seen time for a Group is the maximum
                # value of the last seen time for all of its GroupEnvironment
                # relations; 3. The first seen time is always less than or
                # equal to the last seen time.
                'age_from': CallbackCondition(
                    # This condition represents the lower threshold for "first
                    # seen" time for an environment. Due to assertions #1 and
                    # #3, we can exclude any groups where the "last seen" time
                    # is prior to this timestamp.
                    lambda queryset, first_seen: queryset.exclude(
                        last_seen__lt=first_seen,
                    ),
                ),
                'age_to': CallbackCondition(
                    # This condition represents the upper threshold for "first
                    # seen" time for an environment. Due to assertion #1, we
                    # can exclude any values where the group first seen is
                    # greater than that threshold.
                    lambda queryset, first_seen: queryset.exclude(
                        first_seen__gt=first_seen,
                    ),
                ),
                'last_seen_from': CallbackCondition(
                    # This condition represents the lower threshold for "last
                    # seen" time for an environment. Due to assertion #2, we
                    # can exclude any values where the group last seen value is
                    # less than that threshold.
                    lambda queryset, last_seen: queryset.exclude(
                        last_seen__lt=last_seen,
                    ),
                ),
                'last_seen_to': CallbackCondition(
                    # This condition represents the upper threshold for "last
                    # seen" time for an environment. Due to assertions #2 and
                    # #3, we can exclude any values where the group first seen
                    # value is greater than that threshold.
                    lambda queryset, last_seen: queryset.exclude(
                        first_seen__gt=last_seen,
                    ),
                ),
            }).build(
                group_queryset.extra(
                    where=[
                        '{} = {}'.format(
                            get_sql_column(Group, 'id'),
                            get_sql_column(GroupEnvironment, 'group_id'),
                        ),
                        '{} = %s'.format(
                            get_sql_column(GroupEnvironment, 'environment_id'),
                        ),
                    ],
                    params=[environment.id],
                    tables=[GroupEnvironment._meta.db_table],
                ),
                parameters,
            ).order_by(group_queryset_sort_clause)

            get_sort_expression, sort_value_to_cursor_value = environment_sort_strategies[sort_by]

            group_tag_value_queryset = tagstore.get_group_tag_value_qs(
                project_id=project.id,
                group_id=set(group_queryset.values_list('id', flat=True)[:10000]),
                environment_id=environment.id,
                key='environment',
                value=environment.name,
            )

            if retention_window_start is not None:
                group_tag_value_queryset = group_tag_value_queryset.filter(
                    last_seen__gte=retention_window_start)

            candidates = dict(
                QuerySetBuilder({
                    'age_from': ScalarCondition('first_seen', 'gt'),
                    'age_to': ScalarCondition('first_seen', 'lt'),
                    'last_seen_from': ScalarCondition('last_seen', 'gt'),
                    'last_seen_to': ScalarCondition('last_seen', 'lt'),
                    'times_seen': CallbackCondition(
                        lambda queryset, times_seen: queryset.filter(times_seen=times_seen),
                    ),
                    'times_seen_lower': ScalarCondition('times_seen', 'gt'),
                    'times_seen_upper': ScalarCondition('times_seen', 'lt'),
                }).build(
                    group_tag_value_queryset,
                    parameters,
                ).extra(
                    select={
                        'sort_value': get_sort_expression(group_tag_value_queryset.model),
                    },
                ).values_list('group_id', 'sort_value'))

            if tags:
                # TODO: `get_group_ids_for_search_filter` should be able to
                # utilize the retention window start parameter for additional
                # optimizations.
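                # `candidates` maps group_id -> sort_value; keep only the
                # candidates that the tag store also matches for the requested
                # tags, deleting the rest.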
                matches = tagstore.get_group_ids_for_search_filter(
                    project_id=project.id,
                    environment_id=environment.id,
                    tags=tags,
                    candidates=candidates.keys(),
                    limit=len(candidates),
                )
                for key in set(candidates) - set(matches or []):
                    del candidates[key]

            result = SequencePaginator(
                [(sort_value_to_cursor_value(score), id)
                 for (id, score) in candidates.items()],
                reverse=True,
                **paginator_options
            ).get_result(limit, cursor, count_hits=count_hits)

            groups = Group.objects.in_bulk(result.results)
            result.results = [groups[k] for k in result.results if k in groups]

            return result
        else:
            event_queryset_builder = QuerySetBuilder({
                'date_from': ScalarCondition('datetime', 'gt'),
                'date_to': ScalarCondition('datetime', 'lt'),
            })
            if any(key in parameters for key in event_queryset_builder.conditions.keys()):
                group_queryset = group_queryset.filter(id__in=list(
                    event_queryset_builder.build(
                        Event.objects.filter(project_id=project.id),
                        parameters,
                    ).distinct().values_list('group_id', flat=True)[:1000],
                ))

            group_queryset = QuerySetBuilder({
                'first_release': CallbackCondition(
                    lambda queryset, version: queryset.filter(
                        first_release__organization_id=project.organization_id,
                        first_release__version=version,
                    ),
                ),
                'age_from': ScalarCondition('first_seen', 'gt'),
                'age_to': ScalarCondition('first_seen', 'lt'),
                'last_seen_from': ScalarCondition('last_seen', 'gt'),
                'last_seen_to': ScalarCondition('last_seen', 'lt'),
                'times_seen': CallbackCondition(
                    lambda queryset, times_seen: queryset.filter(times_seen=times_seen),
                ),
                'times_seen_lower': ScalarCondition('times_seen', 'gt'),
                'times_seen_upper': ScalarCondition('times_seen', 'lt'),
            }).build(
                group_queryset,
                parameters,
            ).extra(
                select={
                    'sort_value': get_sort_clause(sort_by),
                },
            )

            if tags:
                group_ids = tagstore.get_group_ids_for_search_filter(
                    project_id=project.id,
                    environment_id=None,
                    tags=tags,
                    candidates=None,
                )
                if group_ids:
                    group_queryset = group_queryset.filter(id__in=group_ids)
                else:
                    group_queryset = group_queryset.none()

            paginator_cls, sort_clause = sort_strategies[sort_by]
            group_queryset = group_queryset.order_by(sort_clause)
            paginator = paginator_cls(group_queryset, sort_clause, **paginator_options)
            return paginator.get_result(limit, cursor, count_hits=count_hits)
    def _query(self, project, retention_window_start, group_queryset, tags,
               environment, sort_by, limit, cursor, count_hits,
               paginator_options, **parameters):
        # TODO: Product decision: we currently search Group.message to handle
        # the `query` parameter, because that's what we've always done. We could
        # do that search against every event in Snuba instead, but results may
        # differ.

        now = timezone.now()
        end = parameters.get('date_to') or (now + ALLOWED_FUTURE_DELTA)
        # TODO: Presumably we want to search back to the project's full retention,
        # which may be more than 90 days in the past, but apparently
        # `retention_window_start` can be None?
        start = max(
            filter(None, [
                retention_window_start,
                parameters.get('date_from'),
                now - timedelta(days=90),
            ])
        )
        assert start < end

        # TODO: It's possible `first_release` could be handled by Snuba.
        if environment is not None:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.extra(
                        where=[
                            '{} = {}'.format(
                                ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                                ds.get_sql_column(Release, 'id'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'organization'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'version'),
                            ),
                        ],
                        params=[project.organization_id, version],
                        tables=[Release._meta.db_table],
                    ),
                ),
            }).build(
                group_queryset.extra(
                    where=[
                        u'{} = {}'.format(
                            ds.get_sql_column(Group, 'id'),
                            ds.get_sql_column(GroupEnvironment, 'group_id'),
                        ),
                        u'{} = %s'.format(
                            ds.get_sql_column(GroupEnvironment, 'environment_id'),
                        ),
                    ],
                    params=[environment.id],
                    tables=[GroupEnvironment._meta.db_table],
                ),
                parameters,
            )
        else:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.filter(
                        first_release__organization_id=project.organization_id,
                        first_release__version=version,
                    ),
                ),
            }).build(
                group_queryset,
                parameters,
            )

        # pre-filter query
        candidate_hashes = dict(
            GroupHash.objects.filter(
                group__in=group_queryset
            ).values_list(
                'hash', 'group_id'
            )[:MAX_PRE_SNUBA_CANDIDATES + 1]
        )
        metrics.timing('snuba.search.num_candidates', len(candidate_hashes))

        if not candidate_hashes:
            # no matches could possibly be found from this point on
            metrics.incr('snuba.search.no_candidates')
            return Paginator(Group.objects.none()).get_result()
        elif len(candidate_hashes) > MAX_PRE_SNUBA_CANDIDATES:
            # If the pre-filter query didn't include anything to significantly
            # filter down the number of results (from 'first_release', 'query',
            # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
            # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
            # might have surpassed the MAX_PRE_SNUBA_CANDIDATES. In this case,
            # we *don't* want to pass candidates down to Snuba, and instead we
            # want Snuba to do all the filtering/sorting it can and *then* apply
            # this queryset to the results from Snuba, which we call
            # post-filtering.
            metrics.incr('snuba.search.too_many_candidates')
            candidate_hashes = None

        sort, extra_aggregations, score_fn = sort_strategies[sort_by]

        # {group_id: group_score, ...}
        snuba_groups = snuba_search(
            project_id=project.id,
            environment_id=environment and environment.id,
            tags=tags,
            start=start,
            end=end,
            sort=sort,
            extra_aggregations=extra_aggregations,
            score_fn=score_fn,
            candidate_hashes=candidate_hashes,
            **parameters
        )
        metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))

        if candidate_hashes:
            # pre-filtered candidates were passed down to Snuba,
            # so we're finished with filtering
            result_groups = snuba_groups.items()
        else:
            # pre-filtered candidates were *not* passed down to Snuba,
            # so we need to do post-filtering to verify Sentry DB predicates
            result_groups = []
            i = 0
            for i, chunk in enumerate(chunked(snuba_groups.items(), MAX_POST_SNUBA_CHUNK), 1):
                filtered_group_ids = group_queryset.filter(
                    id__in=[gid for gid, _ in chunk]
                ).values_list('id', flat=True)
                result_groups.extend(
                    (group_id, snuba_groups[group_id])
                    for group_id in filtered_group_ids
                )
            metrics.timing('snuba.search.num_post_filters', i)

        paginator_results = SequencePaginator(
            [(score, id) for (id, score) in result_groups],
            reverse=True,
            **paginator_options
        ).get_result(limit, cursor, count_hits=count_hits)

        groups = Group.objects.in_bulk(paginator_results.results)
        paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

        return paginator_results
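    # NOTE on the post-filter loop in `_query` above (hypothetical sizes): if
    # MAX_POST_SNUBA_CHUNK were 10000 and Snuba returned 25000 matches,
    # `chunked` would yield three id batches, each checked against Postgres
    # with a single `id__in` query; `i` then records the batch count for the
    # `snuba.search.num_post_filters` metric.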
    def _query(self, project, retention_window_start, group_queryset, tags,
               environment, sort_by, limit, cursor, count_hits,
               paginator_options, **parameters):
        # TODO: Product decision: we currently search Group.message to handle
        # the `query` parameter, because that's what we've always done. We could
        # do that search against every event in Snuba instead, but results may
        # differ.

        now = timezone.now()
        end = parameters.get('date_to') or (now + ALLOWED_FUTURE_DELTA)
        # TODO: Presumably we want to search back to the project's full retention,
        # which may be more than 90 days in the past, but apparently
        # `retention_window_start` can be None?
        start = max(
            filter(None, [
                retention_window_start,
                parameters.get('date_from'),
                now - timedelta(days=90),
            ])
        )
        assert start < end

        # TODO: It's possible `first_release` could be handled by Snuba.
        if environment is not None:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.extra(
                        where=[
                            '{} = {}'.format(
                                ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                                ds.get_sql_column(Release, 'id'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'organization'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'version'),
                            ),
                        ],
                        params=[project.organization_id, version],
                        tables=[Release._meta.db_table],
                    ),
                ),
            }).build(
                group_queryset.extra(
                    where=[
                        '{} = {}'.format(
                            ds.get_sql_column(Group, 'id'),
                            ds.get_sql_column(GroupEnvironment, 'group_id'),
                        ),
                        '{} = %s'.format(
                            ds.get_sql_column(GroupEnvironment, 'environment_id'),
                        ),
                    ],
                    params=[environment.id],
                    tables=[GroupEnvironment._meta.db_table],
                ),
                parameters,
            )
        else:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.filter(
                        first_release__organization_id=project.organization_id,
                        first_release__version=version,
                    ),
                ),
            }).build(
                group_queryset,
                parameters,
            )

        # TODO: If the query didn't include anything to significantly filter
        # down the number of groups at this point ('first_release', 'query',
        # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
        # 'subscribed_by', 'active_at_from', or 'active_at_to') then this
        # queryset might return a *huge* number of groups. In this case, we
        # probably *don't* want to pass candidates down to Snuba, and rather we
        # want Snuba to do all the filtering/sorting it can and *then* apply
        # this queryset to the results from Snuba.
        #
        # However, if this did filter down the number of groups significantly,
        # then passing in candidates is, of course, valuable.
        #
        # Should we decide which way to handle it based on the number of
        # group_ids, the number of hashes? Or should we just always start the
        # query with Snuba? Something else?
        candidate_group_ids = list(group_queryset.values_list('id', flat=True))

        sort_expression, calculate_cursor_for_group = sort_strategies[sort_by]

        group_data = do_search(
            project_id=project.id,
            environment_id=environment and environment.id,
            tags=tags,
            start=start,
            end=end,
            sort=sort_expression,
            candidates=candidate_group_ids,
            **parameters
        )

        group_to_score = {}
        for group_id, data in group_data.items():
            group_to_score[group_id] = calculate_cursor_for_group(data)

        paginator_results = SequencePaginator(
            [(score, id) for (id, score) in group_to_score.items()],
            reverse=True,
            **paginator_options
        ).get_result(limit, cursor, count_hits=count_hits)

        groups = Group.objects.in_bulk(paginator_results.results)
        paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

        return paginator_results
    def query(
        self,
        projects: Sequence[Project],
        retention_window_start: Optional[datetime],
        group_queryset: QuerySet,
        environments: Sequence[Environment],
        sort_by: str,
        limit: int,
        cursor: Optional[Cursor],
        count_hits: bool,
        paginator_options: Mapping[str, Any],
        search_filters: Sequence[SearchFilter],
        date_from: Optional[datetime],
        date_to: Optional[datetime],
        max_hits: Optional[int] = None,
    ) -> CursorResult:
        if not validate_cdc_search_filters(search_filters):
            raise InvalidQueryForExecutor("Search filters invalid for this query executor")

        start, end, retention_date = self.calculate_start_end(
            retention_window_start, search_filters, date_from, date_to
        )

        if start == retention_date and end == retention_date:
            # Both `start` and `end` must have been trimmed to `retention_date`,
            # so this entire search was against a time range that is outside of
            # retention. We'll return empty results to maintain backwards compatibility
            # with Django search (for now).
            return self.empty_result

        if start >= end:
            # TODO: This maintains backwards compatibility with Django search, but
            # in the future we should find a way to notify the user that their search
            # is invalid.
            return self.empty_result

        e_event = self.entities["event"]
        e_group = self.entities["group"]

        where_conditions = [
            Condition(Column("project_id", e_event), Op.IN, [p.id for p in projects]),
            Condition(Column("timestamp", e_event), Op.GTE, start),
            Condition(Column("timestamp", e_event), Op.LT, end),
        ]
        # TODO: This is still basically only handling status, handle this better once we introduce
        # more conditions.
        for search_filter in search_filters:
            where_conditions.append(
                Condition(
                    Column(search_filter.key.name, e_group), Op.IN, search_filter.value.raw_value
                )
            )

        if environments:
            # TODO: Should this be handled via filter_keys, once we have a snql compatible version?
            where_conditions.append(
                Condition(Column("environment", e_event), Op.IN, [e.name for e in environments])
            )

        sort_func = self.aggregation_defs[self.sort_strategies[sort_by]]

        having = []
        if cursor is not None:
            op = Op.GTE if cursor.is_prev else Op.LTE
            having.append(Condition(sort_func, op, cursor.value))

        query = Query(
            "events",
            match=Join([Relationship(e_event, "grouped", e_group)]),
            select=[
                Column("id", e_group),
                replace(sort_func, alias="score"),
            ],
            where=where_conditions,
            groupby=[Column("id", e_group)],
            having=having,
            orderby=[OrderBy(sort_func, direction=Direction.DESC)],
            limit=Limit(limit + 1),
        )
        data = snuba.raw_snql_query(query, referrer="search.snuba.cdc_search.query")["data"]

        hits_query = Query(
            "events",
            match=Join([Relationship(e_event, "grouped", e_group)]),
            select=[
                Function("uniq", [Column("id", e_group)], alias="count"),
            ],
            where=where_conditions,
        )
        hits = None
        if count_hits:
            hits = snuba.raw_snql_query(
                hits_query, referrer="search.snuba.cdc_search.hits"
            )["data"][0]["count"]

        paginator_results = SequencePaginator(
            [(row["score"], row["g.id"]) for row in data],
            reverse=True,
            **paginator_options,
        ).get_result(limit, cursor, known_hits=hits, max_hits=max_hits)
        # We filter against `group_queryset` here so that we recheck all conditions in Postgres.
        # Since replay between Postgres and Clickhouse can happen, we might get back results that
        # have changed state in Postgres. By rechecking them we guarantee that any returned results
        # have the correct state.
        # TODO: This can result in us returning less than a full page of results, but shouldn't
        # affect cursors. If we want to, we can iterate and query snuba until we manage to get a
        # full page.
        # In practice, this will likely only skip a couple of results at worst, and
        # probably not be noticeable to the user, so holding off for now to reduce complexity.
        groups = group_queryset.in_bulk(paginator_results.results)
        paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]
        return paginator_results
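    # The SnQL built in `query` above joins events to groups; rendered roughly
    # as SQL, it has this shape (illustrative only, not the exact ClickHouse
    # dialect Snuba generates):
    #
    #   SELECT g.id, <sort_func> AS score
    #   FROM events e INNER JOIN groups g ON <"grouped" relationship>
    #   WHERE e.project_id IN (...)
    #     AND e.timestamp >= <start> AND e.timestamp < <end>
    #   GROUP BY g.id
    #   HAVING score <= <cursor.value>  -- only when a cursor was supplied
    #   ORDER BY score DESC
    #   LIMIT <limit + 1>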
    def query(self, project, tags=None, environment=None, sort_by='date',
              limit=100, cursor=None, count_hits=False, paginator_options=None,
              **parameters):
        from sentry.models import (
            Environment, Event, Group, GroupEnvironment, GroupStatus, GroupSubscription, Release
        )

        if paginator_options is None:
            paginator_options = {}

        if tags is None:
            tags = {}

        group_queryset = QuerySetBuilder({
            'query': CallbackCondition(
                lambda queryset, query: queryset.filter(
                    Q(message__icontains=query) | Q(culprit__icontains=query),
                ) if query else queryset,
            ),
            'status': CallbackCondition(
                lambda queryset, status: queryset.filter(status=status),
            ),
            'bookmarked_by': CallbackCondition(
                lambda queryset, user: queryset.filter(
                    bookmark_set__project=project,
                    bookmark_set__user=user,
                ),
            ),
            'assigned_to': CallbackCondition(
                functools.partial(assigned_to_filter, project=project),
            ),
            'unassigned': CallbackCondition(
                lambda queryset, unassigned: queryset.filter(
                    assignee_set__isnull=unassigned,
                ),
            ),
            'subscribed_by': CallbackCondition(
                lambda queryset, user: queryset.filter(
                    id__in=GroupSubscription.objects.filter(
                        project=project,
                        user=user,
                        is_active=True,
                    ).values_list('group'),
                ),
            ),
            'active_at_from': ScalarCondition('active_at', 'gt'),
            'active_at_to': ScalarCondition('active_at', 'lt'),
        }).build(
            Group.objects.filter(project=project).exclude(status__in=[
                GroupStatus.PENDING_DELETION,
                GroupStatus.DELETION_IN_PROGRESS,
                GroupStatus.PENDING_MERGE,
            ]),
            parameters,
        )

        # filter out groups which are beyond the retention period
        retention = quotas.get_event_retention(organization=project.organization)
        if retention:
            retention_window_start = timezone.now() - timedelta(days=retention)
            # TODO: This could be optimized when building querysets to identify
            # criteria that are logically impossible (e.g. if the upper bound
            # for last seen is before the retention window starts, no results
            # exist.)
            group_queryset = group_queryset.filter(last_seen__gte=retention_window_start)
        else:
            retention_window_start = None

        if environment is not None:
            if 'environment' in tags:
                # TODO: This should probably just overwrite the existing tag,
                # rather than asserting on it, but...?
                assert Environment.objects.get(
                    projects=project,
                    name=tags.pop('environment'),
                ).id == environment.id

            event_queryset_builder = QuerySetBuilder({
                'date_from': ScalarCondition('date_added', 'gt'),
                'date_to': ScalarCondition('date_added', 'lt'),
            })
            if any(key in parameters for key in event_queryset_builder.conditions.keys()):
                event_queryset = event_queryset_builder.build(
                    tagstore.get_event_tag_qs(
                        project.id,
                        environment.id,
                        'environment',
                        environment.name,
                    ),
                    parameters,
                )
                if retention_window_start is not None:
                    event_queryset = event_queryset.filter(date_added__gte=retention_window_start)

                group_queryset = group_queryset.filter(
                    id__in=list(event_queryset.distinct().values_list('group_id', flat=True)[:1000]))

            group_queryset = QuerySetBuilder({
                'first_release': CallbackCondition(
                    lambda queryset, version: queryset.extra(
                        where=[
                            '{} = {}'.format(
                                get_sql_column(GroupEnvironment, 'first_release_id'),
                                get_sql_column(Release, 'id'),
                            ),
                            '{} = %s'.format(
                                get_sql_column(Release, 'organization'),
                            ),
                            '{} = %s'.format(
                                get_sql_column(Release, 'version'),
                            ),
                        ],
                        params=[project.organization_id, version],
                        tables=[Release._meta.db_table],
                    ),
                ),
                'times_seen': CallbackCondition(
                    # This condition represents the exact number of times that
                    # an issue has been seen in an environment.
                    # Since an issue can't be seen in an environment more times
                    # than the issue was seen overall, we can safely exclude
                    # any groups that don't have at least that many events.
                    lambda queryset, times_seen: queryset.exclude(
                        times_seen__lt=times_seen,
                    ),
                ),
                'times_seen_lower': CallbackCondition(
                    # This condition represents the lower threshold for the
                    # number of times an issue has been seen in an environment.
                    # Since an issue can't be seen in an environment more times
                    # than the issue was seen overall, we can safely exclude
                    # any groups that haven't met that threshold.
                    lambda queryset, times_seen: queryset.exclude(
                        times_seen__lt=times_seen,
                    ),
                ),
                # The following conditions make a few assertions that are
                # correct in an abstract sense but may not accurately reflect
                # the existing implementation (see GH-5289). These assumptions
                # are that 1. The first seen time for a Group is the minimum
                # value of the first seen time for all of its GroupEnvironment
                # relations; 2. The last seen time for a Group is the maximum
                # value of the last seen time for all of its GroupEnvironment
                # relations; 3. The first seen time is always less than or
                # equal to the last seen time.
                'age_from': CallbackCondition(
                    # This condition represents the lower threshold for "first
                    # seen" time for an environment. Due to assertions #1 and
                    # #3, we can exclude any groups where the "last seen" time
                    # is prior to this timestamp.
                    lambda queryset, first_seen: queryset.exclude(
                        last_seen__lt=first_seen,
                    ),
                ),
                'age_to': CallbackCondition(
                    # This condition represents the upper threshold for "first
                    # seen" time for an environment. Due to assertion #1, we
                    # can exclude any values where the group first seen is
                    # greater than that threshold.
                    lambda queryset, first_seen: queryset.exclude(
                        first_seen__gt=first_seen,
                    ),
                ),
                'last_seen_from': CallbackCondition(
                    # This condition represents the lower threshold for "last
                    # seen" time for an environment. Due to assertion #2, we
                    # can exclude any values where the group last seen value is
                    # less than that threshold.
                    lambda queryset, last_seen: queryset.exclude(
                        last_seen__lt=last_seen,
                    ),
                ),
                'last_seen_to': CallbackCondition(
                    # This condition represents the upper threshold for "last
                    # seen" time for an environment. Due to assertions #2 and
                    # #3, we can exclude any values where the group first seen
                    # value is greater than that threshold.
                    lambda queryset, last_seen: queryset.exclude(
                        first_seen__gt=last_seen,
                    ),
                ),
            }).build(
                group_queryset.extra(
                    where=[
                        '{} = {}'.format(
                            get_sql_column(Group, 'id'),
                            get_sql_column(GroupEnvironment, 'group_id'),
                        ),
                        '{} = %s'.format(
                            get_sql_column(GroupEnvironment, 'environment_id'),
                        ),
                    ],
                    params=[environment.id],
                    tables=[GroupEnvironment._meta.db_table],
                ),
                parameters,
            )

            get_sort_expression, sort_value_to_cursor_value = environment_sort_strategies[sort_by]

            group_tag_value_queryset = tagstore.get_group_tag_value_qs(
                project.id,
                set(group_queryset.values_list('id', flat=True)),  # TODO: Limit?
                environment.id,
                'environment',
                environment.name,
            )

            if retention_window_start is not None:
                group_tag_value_queryset = group_tag_value_queryset.filter(
                    last_seen__gte=retention_window_start)

            candidates = dict(
                QuerySetBuilder({
                    'age_from': ScalarCondition('first_seen', 'gt'),
                    'age_to': ScalarCondition('first_seen', 'lt'),
                    'last_seen_from': ScalarCondition('last_seen', 'gt'),
                    'last_seen_to': ScalarCondition('last_seen', 'lt'),
                    'times_seen': CallbackCondition(
                        lambda queryset, times_seen: queryset.filter(times_seen=times_seen),
                    ),
                    'times_seen_lower': ScalarCondition('times_seen', 'gt'),
                    'times_seen_upper': ScalarCondition('times_seen', 'lt'),
                }).build(
                    group_tag_value_queryset,
                    parameters,
                ).extra(
                    select={
                        'sort_value': get_sort_expression(group_tag_value_queryset.model),
                    },
                ).values_list('group_id', 'sort_value'))

            if tags:
                # TODO: `get_group_ids_for_search_filter` should be able to
                # utilize the retention window start parameter for additional
                # optimizations.
                matches = tagstore.get_group_ids_for_search_filter(
                    project.id,
                    environment.id,
                    tags,
                    candidates.keys(),
                    limit=len(candidates),
                )
                for key in set(candidates) - set(matches or []):
                    del candidates[key]

            result = SequencePaginator(
                [(sort_value_to_cursor_value(score), id)
                 for (id, score) in candidates.items()],
                reverse=True,
                **paginator_options
            ).get_result(limit, cursor, count_hits=count_hits)

            groups = Group.objects.in_bulk(result.results)
            result.results = [groups[k] for k in result.results if k in groups]

            return result
        else:
            event_queryset_builder = QuerySetBuilder({
                'date_from': ScalarCondition('datetime', 'gt'),
                'date_to': ScalarCondition('datetime', 'lt'),
            })
            if any(key in parameters for key in event_queryset_builder.conditions.keys()):
                group_queryset = group_queryset.filter(id__in=list(
                    event_queryset_builder.build(
                        Event.objects.filter(project_id=project.id),
                        parameters,
                    ).distinct().values_list('group_id', flat=True)[:1000],
                ))

            group_queryset = QuerySetBuilder({
                'first_release': CallbackCondition(
                    lambda queryset, version: queryset.filter(
                        first_release__organization_id=project.organization_id,
                        first_release__version=version,
                    ),
                ),
                'age_from': ScalarCondition('first_seen', 'gt'),
                'age_to': ScalarCondition('first_seen', 'lt'),
                'last_seen_from': ScalarCondition('last_seen', 'gt'),
                'last_seen_to': ScalarCondition('last_seen', 'lt'),
                'times_seen': CallbackCondition(
                    lambda queryset, times_seen: queryset.filter(times_seen=times_seen),
                ),
                'times_seen_lower': ScalarCondition('times_seen', 'gt'),
                'times_seen_upper': ScalarCondition('times_seen', 'lt'),
            }).build(
                group_queryset,
                parameters,
            ).extra(
                select={
                    'sort_value': get_sort_clause(sort_by),
                },
            )

            if tags:
                matches = tagstore.get_group_ids_for_search_filter(project.id, None, tags)
                if matches:
                    group_queryset = group_queryset.filter(id__in=matches)
                else:
                    group_queryset = group_queryset.none()

            paginator_cls, sort_clause = sort_strategies[sort_by]
            group_queryset = group_queryset.order_by(sort_clause)
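            # `sort_strategies` maps the sort key to a (paginator class,
            # order-by clause) pair; the chosen paginator then slices the
            # ordered queryset relative to the supplied cursor.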
            paginator = paginator_cls(group_queryset, sort_clause, **paginator_options)
            return paginator.get_result(limit, cursor, count_hits=count_hits)
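    # Illustrative usage of the tag-based `query` above (hypothetical
    # `backend` instance and filter values; a sketch, not a fixture):
    #
    #   results = backend.query(
    #       project,
    #       tags={'browser': 'Chrome'},
    #       sort_by='date',
    #       limit=25,
    #   )
    #   # `results.results` is the page of matching Group instances.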