def inbox_search(
    projects: Sequence[Project],
    environments: Optional[Sequence[Environment]] = None,
    limit: int = 100,
    cursor: Optional[Cursor] = None,
    count_hits: bool = False,
    search_filters: Optional[Sequence[SearchFilter]] = None,
    date_from: Optional[datetime] = None,
    date_to: Optional[datetime] = None,
    max_hits: Optional[int] = None,
) -> CursorResult:
    now: datetime = timezone.now()
    end: Optional[datetime] = None
    end_params: List[datetime] = [
        _f for _f in [date_to, get_search_filter(search_filters, "date", "<")] if _f
    ]
    if end_params:
        end = min(end_params)
    end = end if end else now + ALLOWED_FUTURE_DELTA

    # We only want to search back a week at most, since that's the oldest inbox rows
    # can be.
    earliest_date = now - timedelta(days=7)
    start_params = [date_from, earliest_date, get_search_filter(search_filters, "date", ">")]
    start = max([_f for _f in start_params if _f])
    end = max([earliest_date, end])

    if start >= end:
        return Paginator(Group.objects.none()).get_result()

    # Make sure search terms are valid
    invalid_search_terms = [
        str(sf) for sf in search_filters if sf.key.name not in allowed_inbox_search_terms
    ]
    if invalid_search_terms:
        raise InvalidSearchQuery(
            f"Invalid search terms for 'inbox' search: {invalid_search_terms}"
        )

    # Make sure this is an inbox search
    if not get_search_filter(search_filters, "for_review", "="):
        raise InvalidSearchQuery("Sort key 'inbox' only supported for inbox search")

    if get_search_filter(search_filters, "status", "=") != GroupStatus.UNRESOLVED:
        raise InvalidSearchQuery("Inbox search only works for 'unresolved' status")

    # We just filter on `GroupInbox.date_added` here, and don't filter by date
    # on the group. This keeps the query simpler and faster in some edge cases,
    # and date_added is a good enough proxy when we're using this sort.
    qs = GroupInbox.objects.filter(
        date_added__gte=start,
        date_added__lte=end,
        project__in=projects,
    )

    if environments is not None:
        environment_ids: List[int] = [environment.id for environment in environments]
        qs = qs.filter(
            group_id__in=GroupEnvironment.objects.filter(environment_id__in=environment_ids)
            .values_list("group_id", flat=True)
            .distinct()
        )

    owner_search = get_search_filter(search_filters, "assigned_or_suggested", "=")
    if owner_search:
        qs = qs.filter(
            assigned_or_suggested_filter(owner_search, projects, field_filter="group_id")
        )

    paginator = DateTimePaginator(qs.order_by("date_added"), "-date_added")
    results = paginator.get_result(limit, cursor, count_hits=count_hits, max_hits=max_hits)

    # We want to return groups from the endpoint, but have the cursor be related to the
    # GroupInbox rows. So we paginate on the GroupInbox results queryset, then fetch
    # the group_ids out and use them to get the actual groups.
    group_qs = Group.objects.filter(
        id__in=[r.group_id for r in results.results],
        project__in=projects,
        status=GroupStatus.UNRESOLVED,
    )
    groups: Mapping[int, Group] = {g.id: g for g in group_qs}
    results.results = [groups[r.group_id] for r in results.results if r.group_id in groups]
    return results
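# `inbox_search` above calls `get_search_filter` repeatedly to pull date,
# status, and ownership operands out of the parsed filters. That helper is
# defined elsewhere in the codebase; the following is only a hypothetical
# sketch of its assumed behavior (first matching filter wins), not the real
# implementation. It assumes each SearchFilter exposes `key.name`,
# `operator`, and `value.raw_value`.
def get_search_filter(search_filters, name, operator):
    """Return the raw value of the first filter matching `name` and
    `operator`, or None if no such filter exists."""
    if not search_filters:
        return None
    for search_filter in search_filters:
        if search_filter.key.name == name and search_filter.operator == operator:
            return search_filter.value.raw_value
    return None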
from __future__ import absolute_import

import time
from datetime import timedelta
from hashlib import md5

from django.utils import timezone

from sentry import options
from sentry.api.event_search import convert_search_filter_to_snuba_query
from sentry.api.paginator import DateTimePaginator, SequencePaginator, Paginator
from sentry.constants import ALLOWED_FUTURE_DELTA
from sentry.models import Group
from sentry.utils import snuba, metrics
from sentry.snuba.dataset import Dataset

EMPTY_RESULT = Paginator(Group.objects.none()).get_result()

# mapping from query parameter sort name to underlying scoring aggregation name
sort_strategies = {
    "date": "last_seen",
    "freq": "times_seen",
    "new": "first_seen",
    "priority": "priority",
}

dependency_aggregations = {"priority": ["last_seen", "times_seen"]}

aggregation_defs = {
    "times_seen": ["count()", ""],
    "first_seen": ["multiply(toUInt64(min(timestamp)), 1000)", ""],
    "last_seen": ["multiply(toUInt64(max(timestamp)), 1000)", ""],
}
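# Illustrative only: how a requested sort key expands into the Snuba
# aggregations it needs, given the tables above. "priority" is derived from
# other aggregations, so `dependency_aggregations` pulls those in as well.
# `resolve_aggregations` is a hypothetical helper for demonstration, not part
# of the module.
def resolve_aggregations(sort_by):
    aggregation = sort_strategies[sort_by]
    required = [aggregation] + dependency_aggregations.get(aggregation, [])
    # only keep names that have a concrete Snuba aggregation definition
    return {name: aggregation_defs[name] for name in required if name in aggregation_defs}

resolve_aggregations("freq")  # -> {"times_seen": ["count()", ""]}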
def _query(self, project, retention_window_start, group_queryset, tags,
           environment, sort_by, limit, cursor, count_hits,
           paginator_options, **parameters):

    # TODO: Product decision: we currently search Group.message to handle
    # the `query` parameter, because that's what we've always done. We could
    # do that search against every event in Snuba instead, but results may
    # differ.

    now = timezone.now()
    end = parameters.get('date_to') or (now + ALLOWED_FUTURE_DELTA)

    # TODO: Presumably we want to search back to the project's full retention,
    # which may be higher than 90 days in the past, but apparently
    # `retention_window_start` can be None(?), so we need a fallback.
    start = max(
        filter(None, [
            retention_window_start,
            parameters.get('date_from'),
            now - timedelta(days=90),
        ])
    )
    assert start < end

    # TODO: It's possible `first_release` could be handled by Snuba.
    if environment is not None:
        group_queryset = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(
                lambda queryset, version: queryset.extra(
                    where=[
                        '{} = {}'.format(
                            ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                            ds.get_sql_column(Release, 'id'),
                        ),
                        '{} = %s'.format(ds.get_sql_column(Release, 'organization')),
                        '{} = %s'.format(ds.get_sql_column(Release, 'version')),
                    ],
                    params=[project.organization_id, version],
                    tables=[Release._meta.db_table],
                ),
            ),
        }).build(
            group_queryset.extra(
                where=[
                    u'{} = {}'.format(
                        ds.get_sql_column(Group, 'id'),
                        ds.get_sql_column(GroupEnvironment, 'group_id'),
                    ),
                    u'{} = %s'.format(
                        ds.get_sql_column(GroupEnvironment, 'environment_id'),
                    ),
                ],
                params=[environment.id],
                tables=[GroupEnvironment._meta.db_table],
            ),
            parameters,
        )
    else:
        group_queryset = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(
                lambda queryset, version: queryset.filter(
                    first_release__organization_id=project.organization_id,
                    first_release__version=version,
                ),
            ),
        }).build(
            group_queryset,
            parameters,
        )

    # pre-filter query
    candidate_hashes = dict(
        GroupHash.objects.filter(
            group__in=group_queryset
        ).values_list(
            'hash', 'group_id'
        )[:MAX_PRE_SNUBA_CANDIDATES + 1]
    )
    metrics.timing('snuba.search.num_candidates', len(candidate_hashes))

    if not candidate_hashes:
        # no matches could possibly be found from this point on
        metrics.incr('snuba.search.no_candidates')
        return Paginator(Group.objects.none()).get_result()
    elif len(candidate_hashes) > MAX_PRE_SNUBA_CANDIDATES:
        # If the pre-filter query didn't include anything to significantly
        # filter down the number of results (from 'first_release', 'query',
        # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
        # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
        # might have surpassed the MAX_PRE_SNUBA_CANDIDATES. In this case,
        # we *don't* want to pass candidates down to Snuba, and instead we
        # want Snuba to do all the filtering/sorting it can and *then* apply
        # this queryset to the results from Snuba, which we call
        # post-filtering.
        metrics.incr('snuba.search.too_many_candidates')
        candidate_hashes = None

    sort, extra_aggregations, score_fn = sort_strategies[sort_by]

    # {group_id: group_score, ...}
    snuba_groups = snuba_search(
        project_id=project.id,
        environment_id=environment and environment.id,
        tags=tags,
        start=start,
        end=end,
        sort=sort,
        extra_aggregations=extra_aggregations,
        score_fn=score_fn,
        candidate_hashes=candidate_hashes,
        **parameters
    )
    metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))

    if candidate_hashes:
        # pre-filtered candidates were passed down to Snuba,
        # so we're finished with filtering
        result_groups = snuba_groups.items()
    else:
        # pre-filtered candidates were *not* passed down to Snuba,
        # so we need to do post-filtering to verify Sentry DB predicates
        result_groups = []
        i = 0
        for i, chunk in enumerate(chunked(snuba_groups.items(), MAX_POST_SNUBA_CHUNK), 1):
            filtered_group_ids = group_queryset.filter(
                id__in=[gid for gid, _ in chunk]
            ).values_list('id', flat=True)
            result_groups.extend(
                (group_id, snuba_groups[group_id])
                for group_id in filtered_group_ids
            )
        metrics.timing('snuba.search.num_post_filters', i)

    paginator_results = SequencePaginator(
        [(score, id) for (id, score) in result_groups],
        reverse=True,
        **paginator_options
    ).get_result(limit, cursor, count_hits=count_hits)

    groups = Group.objects.in_bulk(paginator_results.results)
    paginator_results.results = [
        groups[k] for k in paginator_results.results if k in groups
    ]
    return paginator_results
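# This version of `_query` assumes `snuba_search` returns a complete ordered
# mapping of {group_id: sort_score}, best score first; the chunked variants
# later in this file instead return a `(groups, more_results)` pair. A
# hypothetical stub showing the assumed contract (names and values here are
# illustrative, not the real implementation):
from collections import OrderedDict

def snuba_search(project_id, environment_id, tags, start, end, sort,
                 extra_aggregations, score_fn, candidate_hashes, **parameters):
    # A real implementation would issue a Snuba query aggregated by group,
    # optionally restricted to `candidate_hashes`, and score each group with
    # `score_fn`.
    return OrderedDict([
        (101, 1556000000000),  # group id -> score (e.g. last_seen in ms)
        (102, 1555900000000),
    ])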
def empty_result(self):
    return Paginator(Group.objects.none()).get_result()
def query(self, project, query=None, status=None, tags=None,
          bookmarked_by=None, assigned_to=None, first_release=None,
          sort_by='date', date_filter='last_seen', date_from=None,
          date_to=None, cursor=None, limit=100):
    from sentry.models import Group

    queryset = Group.objects.filter(project=project)
    if query:
        # TODO(dcramer): if we want to continue to support search on SQL
        # we should at least optimize this in Postgres so that it does
        # the query filter **after** the index filters, and restricts the
        # result set
        queryset = queryset.filter(
            Q(message__icontains=query) | Q(culprit__icontains=query)
        )

    if status is not None:
        queryset = queryset.filter(status=status)

    if bookmarked_by:
        queryset = queryset.filter(
            bookmark_set__project=project,
            bookmark_set__user=bookmarked_by,
        )

    if assigned_to:
        queryset = queryset.filter(
            assignee_set__project=project,
            assignee_set__user=assigned_to,
        )

    if first_release:
        queryset = queryset.filter(
            first_release__project=project,
            first_release__version=first_release,
        )

    if tags:
        for k, v in tags.iteritems():
            queryset = queryset.filter(**dict(
                grouptag__key=k,
                grouptag__value=v,
            ))

    if date_filter == 'first_seen':
        if date_from and date_to:
            queryset = queryset.filter(
                first_seen__gte=date_from,
                first_seen__lte=date_to,
            )
        elif date_from:
            queryset = queryset.filter(first_seen__gte=date_from)
        elif date_to:
            queryset = queryset.filter(first_seen__lte=date_to)
    elif date_filter == 'last_seen':
        if date_from and date_to:
            queryset = queryset.filter(
                last_seen__gte=date_from,
                last_seen__lte=date_to,
            )
        elif date_from:
            queryset = queryset.filter(last_seen__gte=date_from)
        elif date_to:
            queryset = queryset.filter(last_seen__lte=date_to)

    engine = get_db_engine('default')
    if engine.startswith('sqlite'):
        score_clause = SQLITE_SORT_CLAUSES[sort_by]
    elif engine.startswith('mysql'):
        score_clause = MYSQL_SORT_CLAUSES[sort_by]
    elif engine.startswith('oracle'):
        score_clause = ORACLE_SORT_CLAUSES[sort_by]
    elif engine in MSSQL_ENGINES:
        score_clause = MSSQL_SORT_CLAUSES[sort_by]
    else:
        score_clause = SORT_CLAUSES[sort_by]

    if sort_by in ('tottime', 'avgtime'):
        queryset = queryset.filter(time_spent_count__gt=0)

    queryset = queryset.extra(select={'sort_value': score_clause})

    # HACK: don't sort by the same column twice
    if sort_by == 'date':
        queryset = queryset.order_by('-sort_value')
    else:
        queryset = queryset.order_by('-sort_value', '-last_seen')

    paginator = Paginator(queryset, '-sort_value')
    return paginator.get_result(limit, cursor)
def query(self, project, tags=None, environment=None, sort_by='date',
          limit=100, cursor=None, count_hits=False, paginator_options=None,
          **parameters):
    from sentry.models import Group, GroupStatus, GroupSubscription, Release

    if paginator_options is None:
        paginator_options = {}

    if tags is None:
        tags = {}

    try:
        if tags.get('sentry:release') == 'latest':
            tags['sentry:release'] = get_latest_release(project, environment)

        if parameters.get('first_release') == 'latest':
            parameters['first_release'] = get_latest_release(project, environment)
    except Release.DoesNotExist:
        # no matches could possibly be found from this point on
        return Paginator(Group.objects.none()).get_result()

    group_queryset = QuerySetBuilder({
        'query': CallbackCondition(
            lambda queryset, query: queryset.filter(
                Q(message__icontains=query) | Q(culprit__icontains=query),
            ) if query else queryset,
        ),
        'status': CallbackCondition(
            lambda queryset, status: queryset.filter(status=status),
        ),
        'bookmarked_by': CallbackCondition(
            lambda queryset, user: queryset.filter(
                bookmark_set__project=project,
                bookmark_set__user=user,
            ),
        ),
        'assigned_to': CallbackCondition(
            functools.partial(assigned_to_filter, project=project),
        ),
        'unassigned': CallbackCondition(
            lambda queryset, unassigned: queryset.filter(
                assignee_set__isnull=unassigned,
            ),
        ),
        'subscribed_by': CallbackCondition(
            lambda queryset, user: queryset.filter(
                id__in=GroupSubscription.objects.filter(
                    project=project,
                    user=user,
                    is_active=True,
                ).values_list('group'),
            ),
        ),
        'active_at_from': ScalarCondition('active_at', 'gt'),
        'active_at_to': ScalarCondition('active_at', 'lt'),
    }).build(
        Group.objects.filter(project=project).exclude(status__in=[
            GroupStatus.PENDING_DELETION,
            GroupStatus.DELETION_IN_PROGRESS,
            GroupStatus.PENDING_MERGE,
        ]),
        parameters,
    )

    # filter out groups which are beyond the retention period
    retention = quotas.get_event_retention(organization=project.organization)
    if retention:
        retention_window_start = timezone.now() - timedelta(days=retention)
    else:
        retention_window_start = None

    # TODO: This could be optimized when building querysets to identify
    # criteria that are logically impossible (e.g. if the upper bound
    # for last seen is before the retention window starts, no results
    # exist.)
    if retention_window_start:
        group_queryset = group_queryset.filter(last_seen__gte=retention_window_start)

    # This is a punt because the SnubaSearchBackend (a subclass) shares so much that it
    # seemed better to handle all the shared initialization and then handoff to the
    # actual backend.
    return self._query(project, retention_window_start, group_queryset, tags,
                       environment, sort_by, limit, cursor, count_hits,
                       paginator_options, **parameters)
def _query(self, project, retention_window_start, group_queryset, tags,
           environment, sort_by, limit, cursor, count_hits,
           paginator_options, **parameters):

    # TODO: Product decision: we currently search Group.message to handle
    # the `query` parameter, because that's what we've always done. We could
    # do that search against every event in Snuba instead, but results may
    # differ.

    now = timezone.now()
    end = parameters.get('date_to') or (now + ALLOWED_FUTURE_DELTA)

    # TODO: Presumably we want to search back to the project's full retention,
    # which may be higher than 90 days in the past, but apparently
    # `retention_window_start` can be None(?), so we need a fallback.
    start = max(
        filter(None, [
            retention_window_start,
            parameters.get('date_from'),
            now - timedelta(days=90),
        ])
    )
    assert start < end

    # TODO: It's possible `first_release` could be handled by Snuba.
    if environment is not None:
        group_queryset = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(
                lambda queryset, version: queryset.extra(
                    where=[
                        '{} = {}'.format(
                            ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                            ds.get_sql_column(Release, 'id'),
                        ),
                        '{} = %s'.format(ds.get_sql_column(Release, 'organization')),
                        '{} = %s'.format(ds.get_sql_column(Release, 'version')),
                    ],
                    params=[project.organization_id, version],
                    tables=[Release._meta.db_table],
                ),
            ),
        }).build(
            group_queryset.extra(
                where=[
                    u'{} = {}'.format(
                        ds.get_sql_column(Group, 'id'),
                        ds.get_sql_column(GroupEnvironment, 'group_id'),
                    ),
                    u'{} = %s'.format(
                        ds.get_sql_column(GroupEnvironment, 'environment_id'),
                    ),
                ],
                params=[environment.id],
                tables=[GroupEnvironment._meta.db_table],
            ),
            parameters,
        )
    else:
        group_queryset = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(
                lambda queryset, version: queryset.filter(
                    first_release__organization_id=project.organization_id,
                    first_release__version=version,
                ),
            ),
        }).build(
            group_queryset,
            parameters,
        )

    # maximum number of Group IDs to send down to Snuba,
    # if more Group ID candidates are found, a "bare" Snuba
    # search is performed and the result groups are then
    # post-filtered via queries to the Sentry DB
    max_pre_snuba_candidates = options.get('snuba.search.max-pre-snuba-candidates')

    # pre-filter query
    candidate_ids = None
    if max_pre_snuba_candidates and limit <= max_pre_snuba_candidates:
        candidate_ids = list(
            group_queryset.values_list('id', flat=True)[:max_pre_snuba_candidates + 1]
        )
        metrics.timing('snuba.search.num_candidates', len(candidate_ids))

        if not candidate_ids:
            # no matches could possibly be found from this point on
            metrics.incr('snuba.search.no_candidates')
            return Paginator(Group.objects.none()).get_result()
        elif len(candidate_ids) > max_pre_snuba_candidates:
            # If the pre-filter query didn't include anything to significantly
            # filter down the number of results (from 'first_release', 'query',
            # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
            # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
            # might have surpassed the `max_pre_snuba_candidates`. In this case,
            # we *don't* want to pass candidates down to Snuba, and instead we
            # want Snuba to do all the filtering/sorting it can and *then* apply
            # this queryset to the results from Snuba, which we call
            # post-filtering.
            metrics.incr('snuba.search.too_many_candidates')
            candidate_ids = None

    sort, extra_aggregations, score_fn = sort_strategies[sort_by]

    chunk_growth = options.get('snuba.search.chunk-growth-rate')
    max_chunk_size = options.get('snuba.search.max-chunk-size')
    chunk_limit = limit
    offset = 0
    num_chunks = 0

    paginator_results = Paginator(Group.objects.none()).get_result()
    result_groups = []
    result_group_ids = set()
    min_score = float('inf')
    max_score = -1

    max_time = options.get('snuba.search.max-total-chunk-time-seconds')
    time_start = time.time()

    # Do smaller searches in chunks until we have enough results
    # to answer the query (or hit the end of possible results). We do
    # this because a common case for search is to return 100 groups
    # sorted by `last_seen`, and we want to avoid returning all of
    # a project's groups and then post-sorting them all in Postgres
    # when typically the first N results will do.
    while (time.time() - time_start) < max_time:
        num_chunks += 1

        # grow the chunk size on each iteration to account for huge projects
        # and weird queries, up to a max size
        chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
        # but if we have candidate_ids always query for at least that many items
        chunk_limit = max(chunk_limit, len(candidate_ids) if candidate_ids else 0)

        # {group_id: group_score, ...}
        snuba_groups, more_results = snuba_search(
            project_id=project.id,
            environment_id=environment and environment.id,
            tags=tags,
            start=start,
            end=end,
            sort=sort,
            extra_aggregations=extra_aggregations,
            score_fn=score_fn,
            candidate_ids=candidate_ids,
            limit=chunk_limit,
            offset=offset,
            **parameters
        )
        metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))
        offset += len(snuba_groups)

        if not snuba_groups:
            break

        if candidate_ids:
            # pre-filtered candidates were passed down to Snuba,
            # so we're finished with filtering and these are the
            # only results
            result_groups = snuba_groups
        else:
            # pre-filtered candidates were *not* passed down to Snuba,
            # so we need to do post-filtering to verify Sentry DB predicates
            filtered_group_ids = group_queryset.filter(
                id__in=[gid for gid, _ in snuba_groups]
            ).values_list('id', flat=True)

            group_to_score = dict(snuba_groups)
            for group_id in filtered_group_ids:
                if group_id in result_group_ids:
                    # because we're doing multiple Snuba queries, which
                    # happen outside of a transaction, there is a small possibility
                    # of groups moving around in the sort scoring underneath us,
                    # so we at least want to protect against duplicates
                    continue

                group_score = group_to_score[group_id]
                result_group_ids.add(group_id)
                result_groups.append((group_id, group_score))

                # used for cursor logic
                min_score = min(min_score, group_score)
                max_score = max(max_score, group_score)

        # HACK: If a cursor is being used and there may be more results available
        # in Snuba, we need to detect whether the cursor's value will be
        # found in the result groups. If it isn't in the results yet we need to
        # continue querying before we hand off to the paginator to decide whether
        # enough results are found or not, otherwise the paginator will happily
        # return `limit` worth of results that don't take the cursor into account
        # at all, since it can't know there are more results available.
        # TODO: If chunked search works in practice we should probably extend the
        # paginator to throw something if the cursor value is never found, or do
        # something other than partially leak internal paginator logic up to here.
        # Or make separate Paginator implementation just for Snuba search?
        if cursor is not None and not candidate_ids and more_results:
            if cursor.is_prev and min_score < cursor.value:
                continue
            elif not cursor.is_prev and max_score > cursor.value:
                continue

        paginator_results = SequencePaginator(
            [(score, id) for (id, score) in result_groups],
            reverse=True,
            **paginator_options
        ).get_result(limit, cursor, count_hits=False)

        # break the query loop for one of three reasons:
        # * we started with Postgres candidates and so only do one Snuba query max
        # * the paginator is returning enough results to satisfy the query (>= the limit)
        # * there are no more groups in Snuba to post-filter
        if candidate_ids \
                or len(paginator_results.results) >= limit \
                or not more_results:
            break

    metrics.timing('snuba.search.num_chunks', num_chunks)

    groups = Group.objects.in_bulk(paginator_results.results)
    paginator_results.results = [
        groups[k] for k in paginator_results.results if k in groups
    ]
    return paginator_results
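# The loop above grows its Snuba query size each iteration so that small
# queries stay cheap while pathological ones converge quickly. A standalone
# illustration of that schedule; the real values come from the
# `snuba.search.chunk-growth-rate` and `snuba.search.max-chunk-size` options,
# and 1.5 / 10000 are assumed here purely for demonstration.
def chunk_schedule(limit, chunk_growth=1.5, max_chunk_size=10000, num_chunks=6):
    chunk_limit = limit
    sizes = []
    for _ in range(num_chunks):
        # same growth rule as the loop above, capped at the max chunk size
        chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
        sizes.append(chunk_limit)
    return sizes

chunk_schedule(100)  # -> [150, 225, 337, 505, 757, 1135]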
def query(self, project, tags=None, environment=None, sort_by='date',
          limit=100, cursor=None, count_hits=False, paginator_options=None,
          **parameters):
    from sentry.models import (
        Environment, Event, Group, GroupEnvironment, GroupStatus,
        GroupSubscription, Release,
    )

    if paginator_options is None:
        paginator_options = {}

    if tags is None:
        tags = {}

    try:
        if tags.get('sentry:release') == 'latest':
            tags['sentry:release'] = get_latest_release(project, environment)

        if parameters.get('first_release') == 'latest':
            parameters['first_release'] = get_latest_release(project, environment)
    except Release.DoesNotExist:
        # no matches could possibly be found from this point on
        return Paginator(Group.objects.none()).get_result()

    group_queryset = QuerySetBuilder({
        'query': CallbackCondition(
            lambda queryset, query: queryset.filter(
                Q(message__icontains=query) | Q(culprit__icontains=query),
            ) if query else queryset,
        ),
        'status': CallbackCondition(
            lambda queryset, status: queryset.filter(status=status),
        ),
        'bookmarked_by': CallbackCondition(
            lambda queryset, user: queryset.filter(
                bookmark_set__project=project,
                bookmark_set__user=user,
            ),
        ),
        'assigned_to': CallbackCondition(
            functools.partial(assigned_to_filter, project=project),
        ),
        'unassigned': CallbackCondition(
            lambda queryset, unassigned: queryset.filter(
                assignee_set__isnull=unassigned,
            ),
        ),
        'subscribed_by': CallbackCondition(
            lambda queryset, user: queryset.filter(
                id__in=GroupSubscription.objects.filter(
                    project=project,
                    user=user,
                    is_active=True,
                ).values_list('group'),
            ),
        ),
        'active_at_from': ScalarCondition('active_at', 'gt'),
        'active_at_to': ScalarCondition('active_at', 'lt'),
    }).build(
        Group.objects.filter(project=project).exclude(status__in=[
            GroupStatus.PENDING_DELETION,
            GroupStatus.DELETION_IN_PROGRESS,
            GroupStatus.PENDING_MERGE,
        ]),
        parameters,
    )

    # filter out groups which are beyond the retention period
    retention = quotas.get_event_retention(organization=project.organization)
    if retention:
        retention_window_start = timezone.now() - timedelta(days=retention)
        # TODO: This could be optimized when building querysets to identify
        # criteria that are logically impossible (e.g. if the upper bound
        # for last seen is before the retention window starts, no results
        # exist.)
        group_queryset = group_queryset.filter(last_seen__gte=retention_window_start)
    else:
        retention_window_start = None

    if environment is not None:
        if 'environment' in tags:
            # TODO: This should probably just overwrite the existing tag,
            # rather than asserting on it, but...?
            assert Environment.objects.get(
                projects=project,
                name=tags.pop('environment'),
            ).id == environment.id

        event_queryset_builder = QuerySetBuilder({
            'date_from': ScalarCondition('date_added', 'gt'),
            'date_to': ScalarCondition('date_added', 'lt'),
        })
        if any(key in parameters for key in event_queryset_builder.conditions.keys()):
            event_queryset = event_queryset_builder.build(
                tagstore.get_event_tag_qs(
                    project.id,
                    environment.id,
                    'environment',
                    environment.name,
                ),
                parameters,
            )
            if retention_window_start is not None:
                event_queryset = event_queryset.filter(
                    date_added__gte=retention_window_start
                )

            group_queryset = group_queryset.filter(
                id__in=list(
                    event_queryset.distinct().values_list('group_id', flat=True)[:1000]
                )
            )

        group_queryset = QuerySetBuilder({
            'first_release': CallbackCondition(
                lambda queryset, version: queryset.extra(
                    where=[
                        '{} = {}'.format(
                            get_sql_column(GroupEnvironment, 'first_release_id'),
                            get_sql_column(Release, 'id'),
                        ),
                        '{} = %s'.format(get_sql_column(Release, 'organization')),
                        '{} = %s'.format(get_sql_column(Release, 'version')),
                    ],
                    params=[project.organization_id, version],
                    tables=[Release._meta.db_table],
                ),
            ),
            'times_seen': CallbackCondition(
                # This condition represents the exact number of times that
                # an issue has been seen in an environment. Since an issue
                # can't be seen in an environment more times than the issue
                # was seen overall, we can safely exclude any groups that
                # don't have at least that many events.
                lambda queryset, times_seen: queryset.exclude(
                    times_seen__lt=times_seen,
                ),
            ),
            'times_seen_lower': CallbackCondition(
                # This condition represents the lower threshold for the
                # number of times an issue has been seen in an environment.
                # Since an issue can't be seen in an environment more times
                # than the issue was seen overall, we can safely exclude
                # any groups that haven't met that threshold.
                lambda queryset, times_seen: queryset.exclude(
                    times_seen__lt=times_seen,
                ),
            ),
            # The following conditions make a few assertions that are
            # correct in an abstract sense but may not accurately reflect
            # the existing implementation (see GH-5289). These assumptions
            # are that 1. The first seen time for a Group is the minimum
            # value of the first seen time for all of its GroupEnvironment
            # relations; 2. The last seen time for a Group is the maximum
            # value of the last seen time for all of its GroupEnvironment
            # relations; 3. The first seen time is always less than or
            # equal to the last seen time.
            'age_from': CallbackCondition(
                # This condition represents the lower threshold for "first
                # seen" time for an environment. Due to assertions #1 and
                # #3, we can exclude any groups where the "last seen" time
                # is prior to this timestamp.
                lambda queryset, first_seen: queryset.exclude(
                    last_seen__lt=first_seen,
                ),
            ),
            'age_to': CallbackCondition(
                # This condition represents the upper threshold for "first
                # seen" time for an environment. Due to assertion #1, we
                # can exclude any values where the group first seen is
                # greater than that threshold.
                lambda queryset, first_seen: queryset.exclude(
                    first_seen__gt=first_seen,
                ),
            ),
            'last_seen_from': CallbackCondition(
                # This condition represents the lower threshold for "last
                # seen" time for an environment. Due to assertion #2, we
                # can exclude any values where the group last seen value is
                # less than that threshold.
                lambda queryset, last_seen: queryset.exclude(
                    last_seen__lt=last_seen,
                ),
            ),
            'last_seen_to': CallbackCondition(
                # This condition represents the upper threshold for "last
                # seen" time for an environment. Due to assertions #2 and
                # #3, we can exclude any values where the group first seen
                # value is greater than that threshold.
                lambda queryset, last_seen: queryset.exclude(
                    first_seen__gt=last_seen,
                ),
            ),
        }).build(
            group_queryset.extra(
                where=[
                    '{} = {}'.format(
                        get_sql_column(Group, 'id'),
                        get_sql_column(GroupEnvironment, 'group_id'),
                    ),
                    '{} = %s'.format(
                        get_sql_column(GroupEnvironment, 'environment_id'),
                    ),
                ],
                params=[environment.id],
                tables=[GroupEnvironment._meta.db_table],
            ),
            parameters,
        )

        get_sort_expression, sort_value_to_cursor_value = \
            environment_sort_strategies[sort_by]

        group_tag_value_queryset = tagstore.get_group_tag_value_qs(
            project.id,
            set(group_queryset.values_list('id', flat=True)),  # TODO: Limit?
            environment.id,
            'environment',
            environment.name,
        )
        if retention_window_start is not None:
            group_tag_value_queryset = group_tag_value_queryset.filter(
                last_seen__gte=retention_window_start
            )

        candidates = dict(
            QuerySetBuilder({
                'age_from': ScalarCondition('first_seen', 'gt'),
                'age_to': ScalarCondition('first_seen', 'lt'),
                'last_seen_from': ScalarCondition('last_seen', 'gt'),
                'last_seen_to': ScalarCondition('last_seen', 'lt'),
                'times_seen': CallbackCondition(
                    lambda queryset, times_seen: queryset.filter(times_seen=times_seen),
                ),
                'times_seen_lower': ScalarCondition('times_seen', 'gt'),
                'times_seen_upper': ScalarCondition('times_seen', 'lt'),
            }).build(
                group_tag_value_queryset,
                parameters,
            ).extra(
                select={
                    'sort_value': get_sort_expression(group_tag_value_queryset.model),
                },
            ).values_list('group_id', 'sort_value')
        )

        if tags:
            # TODO: `get_group_ids_for_search_filter` should be able to
            # utilize the retention window start parameter for additional
            # optimizations.
            matches = tagstore.get_group_ids_for_search_filter(
                project.id,
                environment.id,
                tags,
                candidates.keys(),
                limit=len(candidates),
            )
            for key in set(candidates) - set(matches or []):
                del candidates[key]

        result = SequencePaginator(
            [(sort_value_to_cursor_value(score), id)
             for (id, score) in candidates.items()],
            reverse=True,
            **paginator_options
        ).get_result(limit, cursor, count_hits=count_hits)

        groups = Group.objects.in_bulk(result.results)
        result.results = [groups[k] for k in result.results if k in groups]
        return result
    else:
        event_queryset_builder = QuerySetBuilder({
            'date_from': ScalarCondition('datetime', 'gt'),
            'date_to': ScalarCondition('datetime', 'lt'),
        })
        if any(key in parameters for key in event_queryset_builder.conditions.keys()):
            group_queryset = group_queryset.filter(
                id__in=list(
                    event_queryset_builder.build(
                        Event.objects.filter(project_id=project.id),
                        parameters,
                    ).distinct().values_list('group_id', flat=True)[:1000],
                )
            )

        group_queryset = QuerySetBuilder({
            'first_release': CallbackCondition(
                lambda queryset, version: queryset.filter(
                    first_release__organization_id=project.organization_id,
                    first_release__version=version,
                ),
            ),
            'age_from': ScalarCondition('first_seen', 'gt'),
            'age_to': ScalarCondition('first_seen', 'lt'),
            'last_seen_from': ScalarCondition('last_seen', 'gt'),
            'last_seen_to': ScalarCondition('last_seen', 'lt'),
            'times_seen': CallbackCondition(
                lambda queryset, times_seen: queryset.filter(times_seen=times_seen),
            ),
            'times_seen_lower': ScalarCondition('times_seen', 'gt'),
            'times_seen_upper': ScalarCondition('times_seen', 'lt'),
        }).build(
            group_queryset,
            parameters,
        ).extra(
            select={
                'sort_value': get_sort_clause(sort_by),
            },
        )

        if tags:
            matches = tagstore.get_group_ids_for_search_filter(project.id, None, tags)
            if matches:
                group_queryset = group_queryset.filter(id__in=matches)
            else:
                group_queryset = group_queryset.none()

        paginator_cls, sort_clause = sort_strategies[sort_by]
        group_queryset = group_queryset.order_by(sort_clause)
        paginator = paginator_cls(group_queryset, sort_clause, **paginator_options)
        return paginator.get_result(limit, cursor, count_hits=count_hits)
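# The `QuerySetBuilder` / `CallbackCondition` / `ScalarCondition` machinery
# used throughout these backends is defined elsewhere in the codebase. The
# following is a minimal sketch of the contract these call sites assume
# (names match the usage above, but details of the real classes may differ):
# each condition is applied only when its parameter was actually supplied.
class CallbackCondition(object):
    def __init__(self, callback):
        self.callback = callback

    def apply(self, queryset, name, parameters):
        # apply the callback only when the parameter was provided
        if name in parameters:
            queryset = self.callback(queryset, parameters[name])
        return queryset


class ScalarCondition(object):
    def __init__(self, field, operator):  # operator: 'gt' or 'lt'
        self.field = field
        self.operator = operator

    def apply(self, queryset, name, parameters):
        if name in parameters:
            queryset = queryset.filter(
                **{'{}__{}'.format(self.field, self.operator): parameters[name]}
            )
        return queryset


class QuerySetBuilder(object):
    def __init__(self, conditions):
        self.conditions = conditions

    def build(self, queryset, parameters):
        # thread the queryset through each condition in turn
        for name, condition in self.conditions.items():
            queryset = condition.apply(queryset, name, parameters)
        return queryset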
def query(self, project, query=None, status=None, tags=None,
          bookmarked_by=None, assigned_to=None, sort_by='date',
          date_filter='last_seen', date_from=None, date_to=None,
          cursor=None, limit=100):
    from sentry.models import Group

    queryset = Group.objects.filter(project=project)
    if query:
        # TODO(dcramer): if we want to continue to support search on SQL
        # we should at least optimize this in Postgres so that it does
        # the query filter **after** the index filters, and restricts the
        # result set
        queryset = queryset.filter(
            Q(message__icontains=query) | Q(culprit__icontains=query)
        )

    if status is not None:
        queryset = queryset.filter(status=status)

    if bookmarked_by:
        queryset = queryset.filter(
            bookmark_set__project=project,
            bookmark_set__user=bookmarked_by,
        )

    if assigned_to:
        queryset = queryset.filter(
            assignee_set__project=project,
            assignee_set__user=assigned_to,
        )

    if tags:
        for k, v in tags.iteritems():
            queryset = queryset.filter(**dict(
                grouptag__key=k,
                grouptag__value=v,
            ))

    if date_filter == 'first_seen':
        if date_from and date_to:
            queryset = queryset.filter(
                first_seen__gte=date_from,
                first_seen__lte=date_to,
            )
        elif date_from:
            queryset = queryset.filter(first_seen__gte=date_from)
        elif date_to:
            queryset = queryset.filter(first_seen__lte=date_to)
    elif date_filter == 'last_seen':
        if date_from and date_to:
            queryset = queryset.filter(
                last_seen__gte=date_from,
                last_seen__lte=date_to,
            )
        elif date_from:
            queryset = queryset.filter(last_seen__gte=date_from)
        elif date_to:
            queryset = queryset.filter(last_seen__lte=date_to)

    engine = get_db_engine('default')
    if engine.startswith('sqlite'):
        score_clause = SQLITE_SORT_CLAUSES[sort_by]
    elif engine.startswith('mysql'):
        score_clause = MYSQL_SORT_CLAUSES[sort_by]
    elif engine.startswith('oracle'):
        score_clause = ORACLE_SORT_CLAUSES[sort_by]
    elif engine in MSSQL_ENGINES:
        score_clause = MSSQL_SORT_CLAUSES[sort_by]
    else:
        score_clause = SORT_CLAUSES[sort_by]

    if sort_by in ('tottime', 'avgtime'):
        queryset = queryset.filter(time_spent_count__gt=0)

    queryset = queryset.extra(select={'sort_value': score_clause})

    # HACK: don't sort by the same column twice
    if sort_by == 'date':
        queryset = queryset.order_by('-sort_value')
    else:
        queryset = queryset.order_by('-sort_value', '-last_seen')

    paginator = Paginator(queryset, '-sort_value')
    return paginator.get_result(limit, cursor)
def _query(self, project, retention_window_start, group_queryset, tags,
           environment, sort_by, limit, cursor, count_hits,
           paginator_options, **parameters):

    # TODO: Product decision: we currently search Group.message to handle
    # the `query` parameter, because that's what we've always done. We could
    # do that search against every event in Snuba instead, but results may
    # differ.

    now = timezone.now()
    end = parameters.get('date_to') or (now + ALLOWED_FUTURE_DELTA)

    # TODO: Presumably we want to search back to the project's full retention,
    # which may be higher than 90 days in the past, but apparently
    # `retention_window_start` can be None(?), so we need a fallback.
    start = max(
        filter(None, [
            retention_window_start,
            parameters.get('date_from'),
            now - timedelta(days=90),
        ])
    )
    assert start < end

    # TODO: It's possible `first_release` could be handled by Snuba.
    if environment is not None:
        group_queryset = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(
                lambda queryset, version: queryset.extra(
                    where=[
                        '{} = {}'.format(
                            ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                            ds.get_sql_column(Release, 'id'),
                        ),
                        '{} = %s'.format(ds.get_sql_column(Release, 'organization')),
                        '{} = %s'.format(ds.get_sql_column(Release, 'version')),
                    ],
                    params=[project.organization_id, version],
                    tables=[Release._meta.db_table],
                ),
            ),
        }).build(
            group_queryset.extra(
                where=[
                    u'{} = {}'.format(
                        ds.get_sql_column(Group, 'id'),
                        ds.get_sql_column(GroupEnvironment, 'group_id'),
                    ),
                    u'{} = %s'.format(
                        ds.get_sql_column(GroupEnvironment, 'environment_id'),
                    ),
                ],
                params=[environment.id],
                tables=[GroupEnvironment._meta.db_table],
            ),
            parameters,
        )
    else:
        group_queryset = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(
                lambda queryset, version: queryset.filter(
                    first_release__organization_id=project.organization_id,
                    first_release__version=version,
                ),
            ),
        }).build(
            group_queryset,
            parameters,
        )

    # maximum number of GroupHashes to send down to Snuba,
    # if more GroupHash candidates are found, a "bare" Snuba
    # search is performed and the result groups are then
    # post-filtered via queries to the Sentry DB
    max_pre_snuba_candidates = options.get('snuba.search.max-pre-snuba-candidates')

    # pre-filter query
    candidate_hashes = None
    if max_pre_snuba_candidates and limit <= max_pre_snuba_candidates:
        candidate_hashes = dict(
            GroupHash.objects.filter(
                group__in=group_queryset
            ).values_list(
                'hash', 'group_id'
            )[:max_pre_snuba_candidates + 1]
        )
        metrics.timing('snuba.search.num_candidates', len(candidate_hashes))

        if not candidate_hashes:
            # no matches could possibly be found from this point on
            metrics.incr('snuba.search.no_candidates')
            return Paginator(Group.objects.none()).get_result()
        elif len(candidate_hashes) > max_pre_snuba_candidates:
            # If the pre-filter query didn't include anything to significantly
            # filter down the number of results (from 'first_release', 'query',
            # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
            # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
            # might have surpassed the `max_pre_snuba_candidates`. In this case,
            # we *don't* want to pass candidates down to Snuba, and instead we
            # want Snuba to do all the filtering/sorting it can and *then* apply
            # this queryset to the results from Snuba, which we call
            # post-filtering.
            metrics.incr('snuba.search.too_many_candidates')
            candidate_hashes = None

    sort, extra_aggregations, score_fn = sort_strategies[sort_by]

    chunk_growth = options.get('snuba.search.chunk-growth-rate')
    max_chunk_size = options.get('snuba.search.max-chunk-size')
    chunk_limit = limit
    offset = 0
    num_chunks = 0

    paginator_results = Paginator(Group.objects.none()).get_result()
    result_groups = []
    result_group_ids = set()
    min_score = float('inf')
    max_score = -1

    max_time = options.get('snuba.search.max-total-chunk-time-seconds')
    time_start = time.time()

    # Do smaller searches in chunks until we have enough results
    # to answer the query (or hit the end of possible results). We do
    # this because a common case for search is to return 100 groups
    # sorted by `last_seen`, and we want to avoid returning all of
    # a project's hashes and then post-sorting them all in Postgres
    # when typically the first N results will do.
    while (time.time() - time_start) < max_time:
        num_chunks += 1

        # grow the chunk size on each iteration to account for huge projects
        # and weird queries, up to a max size
        chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
        # but if we have candidate_hashes always query for at least that many items
        chunk_limit = max(chunk_limit, len(candidate_hashes) if candidate_hashes else 0)

        # {group_id: group_score, ...}
        snuba_groups, more_results = snuba_search(
            project_id=project.id,
            environment_id=environment and environment.id,
            tags=tags,
            start=start,
            end=end,
            sort=sort,
            extra_aggregations=extra_aggregations,
            score_fn=score_fn,
            candidate_hashes=candidate_hashes,
            limit=chunk_limit,
            offset=offset,
            **parameters
        )
        metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))
        offset += len(snuba_groups)

        if not snuba_groups:
            break

        if candidate_hashes:
            # pre-filtered candidates were passed down to Snuba,
            # so we're finished with filtering and these are the
            # only results
            result_groups = snuba_groups
        else:
            # pre-filtered candidates were *not* passed down to Snuba,
            # so we need to do post-filtering to verify Sentry DB predicates
            filtered_group_ids = group_queryset.filter(
                id__in=[gid for gid, _ in snuba_groups]
            ).values_list('id', flat=True)

            group_to_score = dict(snuba_groups)
            for group_id in filtered_group_ids:
                if group_id in result_group_ids:
                    # because we're doing multiple Snuba queries, which
                    # happen outside of a transaction, there is a small possibility
                    # of groups moving around in the sort scoring underneath us,
                    # so we at least want to protect against duplicates
                    continue

                group_score = group_to_score[group_id]
                result_group_ids.add(group_id)
                result_groups.append((group_id, group_score))

                # used for cursor logic
                min_score = min(min_score, group_score)
                max_score = max(max_score, group_score)

        # HACK: If a cursor is being used and there may be more results available
        # in Snuba, we need to detect whether the cursor's value will be
        # found in the result groups. If it isn't in the results yet we need to
        # continue querying before we hand off to the paginator to decide whether
        # enough results are found or not, otherwise the paginator will happily
        # return `limit` worth of results that don't take the cursor into account
        # at all, since it can't know there are more results available.
        # TODO: If chunked search works in practice we should probably extend the
        # paginator to throw something if the cursor value is never found, or do
        # something other than partially leak internal paginator logic up to here.
        # Or make separate Paginator implementation just for Snuba search?
        if cursor is not None and not candidate_hashes and more_results:
            if cursor.is_prev and min_score < cursor.value:
                continue
            elif not cursor.is_prev and max_score > cursor.value:
                continue

        paginator_results = SequencePaginator(
            [(score, id) for (id, score) in result_groups],
            reverse=True,
            **paginator_options
        ).get_result(limit, cursor, count_hits=False)

        # break the query loop for one of three reasons:
        # * we started with Postgres candidates and so only do one Snuba query max
        # * the paginator is returning enough results to satisfy the query (>= the limit)
        # * there are no more groups in Snuba to post-filter
        if candidate_hashes \
                or len(paginator_results.results) >= limit \
                or not more_results:
            break

    metrics.timing('snuba.search.num_chunks', num_chunks)

    groups = Group.objects.in_bulk(paginator_results.results)
    paginator_results.results = [
        groups[k] for k in paginator_results.results if k in groups
    ]
    return paginator_results
def query(self, projects, tags=None, environments=None, sort_by='date',
          limit=100, cursor=None, count_hits=False, paginator_options=None,
          search_filters=None, use_new_filters=False, **parameters):
    from sentry.models import Group, GroupStatus, GroupSubscription, Release

    search_filters = search_filters if search_filters is not None else []

    # ensure projects are from same org
    if len({p.organization_id for p in projects}) != 1:
        raise RuntimeError('Cross organization search not supported')

    if paginator_options is None:
        paginator_options = {}

    if tags is None:
        tags = {}

    try:
        if tags.get('sentry:release') == 'latest':
            tags['sentry:release'] = get_latest_release(projects, environments)

        if parameters.get('first_release') == 'latest':
            parameters['first_release'] = get_latest_release(projects, environments)
    except Release.DoesNotExist:
        # no matches could possibly be found from this point on
        return Paginator(Group.objects.none()).get_result()

    group_queryset = Group.objects.filter(project__in=projects).exclude(status__in=[
        GroupStatus.PENDING_DELETION,
        GroupStatus.DELETION_IN_PROGRESS,
        GroupStatus.PENDING_MERGE,
    ])

    if use_new_filters:
        query_set_builder_class = SearchFilterQuerySetBuilder
        query_set_builder_params = search_filters
    else:
        query_set_builder_class = NewQuerySetBuilder
        query_set_builder_params = parameters

    group_queryset = query_set_builder_class({
        'message': QCallbackCondition(
            lambda query: Q(
                Q(message__icontains=query) | Q(culprit__icontains=query),
            ),
            skip_if_falsey=True,
        ),
        # TODO: Remove this once we've stopped using old params
        'query': QCallbackCondition(
            lambda query: Q(
                Q(message__icontains=query) | Q(culprit__icontains=query),
            ),
            skip_if_falsey=True,
        ),
        'status': QCallbackCondition(
            lambda status: Q(status=status),
        ),
        'bookmarked_by': QCallbackCondition(
            lambda user: Q(
                bookmark_set__project__in=projects,
                bookmark_set__user=user,
            ),
        ),
        'assigned_to': QCallbackCondition(
            functools.partial(assigned_to_filter, projects=projects),
        ),
        'unassigned': QCallbackCondition(
            functools.partial(unassigned_filter, projects=projects),
        ),
        'subscribed_by': QCallbackCondition(
            lambda user: Q(
                id__in=GroupSubscription.objects.filter(
                    project__in=projects,
                    user=user,
                    is_active=True,
                ).values_list('group'),
            ),
        ),
        'active_at': SearchFilterScalarCondition('active_at'),
        # TODO: These are legacy params. Once we've moved to SearchFilter
        # entirely then they can be removed, since the `'active_at'`
        # condition will handle both
        'active_at_from': ScalarCondition('active_at', 'gt'),
        'active_at_to': ScalarCondition('active_at', 'lt'),
    }).build(group_queryset, query_set_builder_params)

    # filter out groups which are beyond the retention period
    retention = quotas.get_event_retention(organization=projects[0].organization)
    if retention:
        retention_window_start = timezone.now() - timedelta(days=retention)
    else:
        retention_window_start = None

    # TODO: This could be optimized when building querysets to identify
    # criteria that are logically impossible (e.g. if the upper bound
    # for last seen is before the retention window starts, no results
    # exist.)
    if retention_window_start:
        group_queryset = group_queryset.filter(last_seen__gte=retention_window_start)

    # This is a punt because the SnubaSearchBackend (a subclass) shares so much that it
    # seemed better to handle all the shared initialization and then handoff to the
    # actual backend.
    return self._query(projects, retention_window_start, group_queryset, tags,
                       environments, sort_by, limit, cursor, count_hits,
                       paginator_options, search_filters, use_new_filters,
                       **parameters)
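# The 'latest' placeholder handling above relies on `get_latest_release`,
# defined elsewhere, which raises Release.DoesNotExist when no release can be
# resolved (the caller turns that into an empty result). A hypothetical
# sketch of that contract, with environment filtering omitted for brevity;
# the real resolution logic (e.g. semver vs. date ordering) may differ.
def get_latest_release(projects, environments):
    from sentry.models import Release

    release = Release.objects.filter(
        organization_id=projects[0].organization_id,
        projects__in=projects,
    ).order_by('-date_added').first()
    if release is None:
        raise Release.DoesNotExist
    return release.version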