def get_group_tag_value_paginator(
    self, project_id, group_id, environment_ids, key, order_by="-id"
):
    """Return a ``SequencePaginator`` over the tag values of ``key`` for a group.

    Supported ``order_by`` values are ``-last_seen``, ``-first_seen``,
    ``-times_seen`` and ``-id`` (treated as ``-first_seen``); anything else
    raises ``ValueError``.  Each item is scored either by its integer
    ``times_seen`` or by its timestamp field converted to milliseconds.
    """
    from sentry.api.paginator import SequencePaginator

    if order_by == "-id":
        # Snuba has no unique id per GroupTagValue so we'll substitute `-first_seen`
        order_by = "-first_seen"
    elif order_by not in ("-last_seen", "-first_seen", "-times_seen"):
        raise ValueError("Unsupported order_by: %s" % order_by)

    values = self.get_group_tag_value_iter(project_id, group_id, environment_ids, key)

    descending = order_by.startswith("-")
    field = order_by.lstrip("-")

    if field == "times_seen":
        # Counts are used directly as the score.
        scored = [(int(getattr(item, field)), item) for item in values]
    else:
        # Datetime fields are scored as epoch milliseconds.
        scored = [
            (int(to_timestamp(getattr(item, field)) * 1000), item) for item in values
        ]

    return SequencePaginator(scored, reverse=descending)
def test_empty_results(self):
    """An empty sequence yields no rows and zeroed cursors, reversed or not."""
    for extra_kwargs in ({}, {"reverse": True}):
        page = SequencePaginator([], **extra_kwargs).get_result(5)

        assert list(page) == []
        assert page.prev == Cursor(0, 0, True, False)
        assert page.next == Cursor(0, 0, False, False)
def get(self, request, organization, key):
    """Return a paginated, serialized list of values for tag ``key``.

    Responds with HTTP 400 when ``key`` does not match ``TAG_KEY_RE``.  When
    the filter parameters cannot be built because of ``NoProjects``, an
    empty paginator is used instead of querying the tagstore.
    """
    if not TAG_KEY_RE.match(key):
        detail = 'Invalid tag key format for "%s"' % (key, )
        return Response({"detail": detail}, status=400)

    try:
        filter_params = self.get_filter_params(request, organization)
    except NoProjects:
        paginator = SequencePaginator([])
    else:
        paginator = tagstore.get_tag_value_paginator_for_projects(
            filter_params["project_id"],
            filter_params.get("environment"),
            key,
            filter_params["start"],
            filter_params["end"],
            query=request.GET.get("query"),
        )

    def serialize_page(results):
        # Serialize each page of results on behalf of the requesting user.
        return serialize(results, request.user)

    return self.paginate(request=request, paginator=paginator, on_results=serialize_page)
def _get_tag_values_for_release_stages(self, projects, environments, query):
    """Return a paginator of release versions matching the release stage ``query``.

    The organization is looked up from the first entry of ``projects``.
    Versions are ordered by name, capped at 1000, and wrapped in
    ``TagValue`` objects scored by their position in that ordering.
    """
    from sentry.api.paginator import SequencePaginator

    org_id_rows = Project.objects.filter(id=projects[0]).values_list(
        "organization_id", flat=True
    )
    organization_id = org_id_rows[0]

    releases = Release.objects.filter_by_stage(
        organization_id,
        "=",
        query,
        project_ids=projects,
        environments=environments,
    )

    if environments:
        # Restrict to releases that are associated with the given environments.
        env_release_ids = ReleaseEnvironment.objects.filter(
            environment_id__in=environments
        ).values_list("release_id", flat=True)
        releases = releases.filter(id__in=env_release_ids)

    version_names = releases.order_by("version").values_list("version", flat=True)[:1000]

    scored = []
    for position, version in enumerate(version_names):
        scored.append((position, TagValue(RELEASE_STAGE_ALIAS, version, None, None, None)))
    return SequencePaginator(scored)
def get(self, request, organization, key):
    """Return a paginated, serialized list of values for tag ``key``.

    Responses:
      * HTTP 400 when ``key`` does not match ``TAG_KEY_RE``.
      * HTTP 400 carrying the error text when building the filter
        parameters raises ``OrganizationEventsError``.
      * Otherwise the paginated tag values; an empty page set is returned
        when ``NoProjects`` is raised.
    """
    if not TAG_KEY_RE.match(key):
        return Response(
            {'detail': 'Invalid tag key format for "%s"' % (key, )}, status=400)

    try:
        filter_params = self.get_filter_params(request, organization)
    except OrganizationEventsError as exc:
        # `BaseException.message` only exists on Python 2.  Keep using it when
        # the exception provides it, and fall back to the string form so the
        # error path does not itself fail with AttributeError on Python 3.
        detail = exc.message if hasattr(exc, 'message') else str(exc)
        return Response({'detail': detail}, status=400)
    except NoProjects:
        paginator = SequencePaginator([])
    else:
        # TODO(jess): update this when snuba tagstore is the primary backend for us
        tagstore = SnubaTagStorage()

        paginator = tagstore.get_tag_value_paginator_for_projects(
            filter_params['project_id'],
            filter_params.get('environment'),
            key,
            filter_params['start'],
            filter_params['end'],
            query=request.GET.get('query'),
        )

    return self.paginate(
        request=request,
        paginator=paginator,
        on_results=lambda results: serialize(results, request.user),
    )
def get_tag_value_paginator_for_projects(self, projects, environments, key, start, end,
                                         query=None, order_by='-last_seen'):
    """Return a ``SequencePaginator`` of values for ``key`` across ``projects``.

    Only ``order_by='-last_seen'`` is accepted; any other value raises
    ``ValueError``.  Values are aggregated by a Snuba query limited to 1000
    rows and scored by ``last_seen`` converted to milliseconds.
    """
    from sentry.api.paginator import SequencePaginator

    if not order_by == '-last_seen':
        raise ValueError("Unsupported order_by: %s" % order_by)

    snuba_key = snuba.get_snuba_column_name(key)
    if snuba_key in BLACKLISTED_COLUMNS:
        snuba_key = 'tags[%s]' % (key, )

    # Either a substring match on the requested query, or "any non-empty value".
    if query:
        condition = [snuba_key, 'LIKE', u'%{}%'.format(query)]
    else:
        condition = [snuba_key, '!=', '']

    filter_keys = {'project_id': projects}
    if environments:
        filter_keys['environment'] = environments

    aggregations = [
        ['count()', '', 'times_seen'],
        ['min', 'timestamp', 'first_seen'],
        ['max', 'timestamp', 'last_seen'],
    ]

    results = snuba.query(
        start=start,
        end=end,
        groupby=[snuba_key],
        filter_keys=filter_keys,
        aggregations=aggregations,
        conditions=[condition],
        orderby=order_by,
        # TODO: This means they can't actually paginate all TagValues.
        limit=1000,
        arrayjoin=snuba.get_arrayjoin(snuba_key),
        referrer='tagstore.get_tag_value_paginator_for_projects',
    )

    tag_values = []
    for value, data in six.iteritems(results):
        tag_values.append(TagValue(key=key, value=value, **fix_tag_value_data(data)))

    descending = order_by.startswith('-')
    field = order_by.lstrip('-')

    scored = [
        (int(to_timestamp(getattr(tag_value, field)) * 1000), tag_value)
        for tag_value in tag_values
    ]
    return SequencePaginator(scored, reverse=descending)
def get(self, request: Request, organization, key) -> Response:
    """Return a paginated, serialized list of values for tag ``key``.

    Responds with HTTP 400 when ``key`` does not match ``TAG_KEY_RE``.  The
    key is recorded as the ``query.tag_key`` SDK tag.  When ``NoProjects``
    is raised while building the Snuba params, an empty paginator is used.
    """
    if not TAG_KEY_RE.match(key):
        return Response({"detail": f'Invalid tag key format for "{key}"'}, status=400)

    sentry_sdk.set_tag("query.tag_key", key)

    try:
        # still used by events v1 which doesn't require global views
        filter_params = self.get_snuba_params(request, organization, check_global_views=False)
    except NoProjects:
        paginator = SequencePaginator([])
    else:
        with self.handle_query_errors():
            project_ids = filter_params["project_id"]
            environment = filter_params.get("environment")
            start = filter_params["start"]
            end = filter_params["end"]
            search = request.GET.get("query")
            with_transactions = request.GET.get("includeTransactions") == "1"

            paginator = tagstore.get_tag_value_paginator_for_projects(
                project_ids,
                environment,
                key,
                start,
                end,
                query=search,
                include_transactions=with_transactions,
            )

    def serialize_page(results):
        # Serialize each page of results on behalf of the requesting user.
        return serialize(results, request.user)

    return self.paginate(request=request, paginator=paginator, on_results=serialize_page)
def _get_tag_values_for_releases_across_all_datasets(self, projects, environments, query):
    """Return a paginator of release versions read from the ``Release`` model.

    The organization is looked up from the first entry of ``projects``.
    Releases are optionally narrowed by project, environment and a version
    prefix (``query``), ordered by version, capped at 1000, and wrapped in
    ``TagValue`` objects scored by their position in that ordering.
    """
    from sentry.api.paginator import SequencePaginator

    org_id_rows = Project.objects.filter(id=projects[0]).values_list(
        "organization_id", flat=True
    )
    organization_id = org_id_rows[0]

    releases = Release.objects.filter(organization_id=organization_id)

    if projects:
        project_release_ids = ReleaseProject.objects.filter(
            project_id__in=projects
        ).values_list("release_id", flat=True)
        releases = releases.filter(id__in=project_release_ids)

    if environments:
        env_release_ids = ReleaseEnvironment.objects.filter(
            environment_id__in=environments
        ).values_list("release_id", flat=True)
        releases = releases.filter(id__in=env_release_ids)

    if query:
        releases = releases.filter(version__startswith=query)

    version_names = releases.order_by("version").values_list("version", flat=True)[:1000]

    scored = []
    for position, version in enumerate(version_names):
        scored.append((position, TagValue(RELEASE_ALIAS, version, None, None, None)))
    return SequencePaginator(scored)
def _get_tag_values_for_semver_build(self, projects, environments, build):
    """Return a paginator of distinct build codes matching ``build``.

    A trailing ``*`` is appended to ``build`` when it is missing (a falsy
    ``build`` becomes ``"*"``).  The organization is looked up from the first
    entry of ``projects``.  Results are ordered by build code, capped at
    1000, and scored by their position in that ordering.
    """
    from sentry.api.paginator import SequencePaginator

    pattern = build or ""
    if not pattern.endswith("*"):
        pattern = pattern + "*"

    org_id_rows = Project.objects.filter(id=projects[0]).values_list(
        "organization_id", flat=True
    )
    organization_id = org_id_rows[0]

    releases = Release.objects.filter_by_semver_build(organization_id, "exact", pattern, projects)

    if environments:
        # Restrict to releases that are associated with the given environments.
        env_release_ids = ReleaseEnvironment.objects.filter(
            environment_id__in=environments
        ).values_list("release_id", flat=True)
        releases = releases.filter(id__in=env_release_ids)

    build_codes = releases.values_list("build_code", flat=True)
    build_codes = build_codes.distinct().order_by("build_code")[:1000]

    scored = []
    for position, code in enumerate(build_codes):
        scored.append((position, TagValue(SEMVER_BUILD_ALIAS, code, None, None, None)))
    return SequencePaginator(scored)
def test_ascending_simple(self):
    """Page forward, back, and past the end of an ascending sequence."""
    paginator = SequencePaginator([(n, n) for n in range(10)], reverse=False)

    # First page.
    first_page = paginator.get_result(5)
    assert list(first_page) == [0, 1, 2, 3, 4]
    assert first_page.prev == Cursor(0, 0, True, False)
    assert first_page.next == Cursor(5, 0, False, True)

    # Second (and last) page, reached through the `next` cursor.
    second_page = paginator.get_result(5, first_page.next)
    assert list(second_page) == [5, 6, 7, 8, 9]
    assert second_page.prev == Cursor(5, 0, True, True)
    assert second_page.next == Cursor(9, 1, False, False)

    # Going back through the `prev` cursor reproduces the first page.
    back_page = paginator.get_result(5, second_page.prev)
    assert list(back_page) == [0, 1, 2, 3, 4]
    assert back_page.prev == Cursor(0, 0, True, False)
    assert back_page.next == Cursor(5, 0, False, True)

    # A cursor beyond the largest score yields an empty page.
    beyond_page = paginator.get_result(5, Cursor(100, 0, False))
    assert list(beyond_page) == []
    assert beyond_page.prev == Cursor(9, 1, True, True)
    assert beyond_page.next == Cursor(9, 1, False, False)
def test_descending_simple(self):
    """Page forward, back, and past the end of a descending sequence."""
    paginator = SequencePaginator([(n, n) for n in range(10)], reverse=True)

    # First page (highest scores first).
    first_page = paginator.get_result(5)
    assert list(first_page) == [9, 8, 7, 6, 5]
    assert first_page.prev == Cursor(9, 0, True, False)
    assert first_page.next == Cursor(4, 0, False, True)

    # Second (and last) page, reached through the `next` cursor.
    second_page = paginator.get_result(5, first_page.next)
    assert list(second_page) == [4, 3, 2, 1, 0]
    assert second_page.prev == Cursor(4, 0, True, True)
    assert second_page.next == Cursor(0, 1, False, False)

    # Going back through the `prev` cursor reproduces the first page.
    back_page = paginator.get_result(5, second_page.prev)
    assert list(back_page) == [9, 8, 7, 6, 5]
    assert back_page.prev == Cursor(9, 0, True, False)
    assert back_page.next == Cursor(4, 0, False, True)

    # A cursor below the smallest score yields an empty page.
    beyond_page = paginator.get_result(5, Cursor(-10, 0, False))
    assert list(beyond_page) == []
    assert beyond_page.prev == Cursor(0, 1, True, True)
    assert beyond_page.next == Cursor(0, 1, False, False)
def test_descending_repeated_scores(self):
    """Descending pagination when every item shares the same score."""
    paginator = SequencePaginator([(1, n) for n in range(10)], reverse=True)

    # First page; the cursor offset is what distinguishes equal scores.
    first_page = paginator.get_result(5)
    assert list(first_page) == [9, 8, 7, 6, 5]
    assert first_page.prev == Cursor(1, 0, True, False)
    assert first_page.next == Cursor(1, 5, False, True)

    # Second (and last) page, reached through the `next` cursor.
    second_page = paginator.get_result(5, first_page.next)
    assert list(second_page) == [4, 3, 2, 1, 0]
    assert second_page.prev == Cursor(1, 5, True, True)
    assert second_page.next == Cursor(1, 10, False, False)

    # Going back through the `prev` cursor reproduces the first page.
    back_page = paginator.get_result(5, second_page.prev)
    assert list(back_page) == [9, 8, 7, 6, 5]
    assert back_page.prev == Cursor(1, 0, True, False)
    assert back_page.next == Cursor(1, 5, False, True)

    # A cursor below the only score yields an empty page.
    beyond_page = paginator.get_result(5, Cursor(-10, 0, False))
    assert list(beyond_page) == []
    assert beyond_page.prev == Cursor(1, 10, True, True)
    assert beyond_page.next == Cursor(1, 10, False, False)
def get_tag_value_paginator(self, project_id, environment_id, key, query=None,
                            order_by='-last_seen'):
    """Return a ``SequencePaginator`` of values for ``key`` in one project.

    Only ``order_by='-last_seen'`` is accepted; any other value raises
    ``ValueError``.  The time range comes from ``self.get_time_range()``.
    Values are aggregated by a Snuba query limited to 1000 rows and scored
    by ``last_seen`` converted to milliseconds.
    """
    from sentry.api.paginator import SequencePaginator

    if not order_by == '-last_seen':
        raise ValueError("Unsupported order_by: %s" % order_by)

    # Optional substring match on the tag value.
    conditions = [['tags_value', 'LIKE', u'%{}%'.format(query)]] if query else []

    start, end = self.get_time_range()

    filter_keys = {
        'project_id': [project_id],
        'tags_key': [key],
    }
    if environment_id:
        filter_keys['environment'] = [environment_id]

    aggregations = [
        ['count()', '', 'times_seen'],
        ['min', 'timestamp', 'first_seen'],
        ['max', 'timestamp', 'last_seen'],
    ]

    results = snuba.query(
        start=start,
        end=end,
        groupby=['tags_value'],
        filter_keys=filter_keys,
        aggregations=aggregations,
        conditions=conditions,
        orderby=order_by,
        # TODO: This means they can't actually paginate all TagValues.
        limit=1000,
        referrer='tagstore.get_tag_value_paginator',
    )

    tag_values = []
    for value, data in six.iteritems(results):
        tag_values.append(TagValue(key=key, value=value, **fix_tag_value_data(data)))

    descending = order_by.startswith('-')
    field = order_by.lstrip('-')

    scored = [
        (int(to_timestamp(getattr(tag_value, field)) * 1000), tag_value)
        for tag_value in tag_values
    ]
    return SequencePaginator(scored, reverse=descending)
def test_empty_results(self):
    """An empty sequence yields no rows and zeroed cursors, reversed or not."""
    for extra_kwargs in ({}, {"reverse": True}):
        page = SequencePaginator([], **extra_kwargs).get_result(5)

        assert list(page) == []
        assert page.prev == Cursor(0, 0, True, False)
        assert page.next == Cursor(0, 0, False, False)
def _get_tag_values_for_semver_package(self, projects, environments, package):
    """Return a paginator of distinct release package names matching ``package``.

    A falsy ``package`` is treated as the empty string.  The organization is
    looked up from the first entry of ``projects``.  Results are ordered by
    package, capped at 1000, and scored by their position in that ordering.
    """
    from sentry.api.paginator import SequencePaginator

    package = package or ""

    org_id_rows = Project.objects.filter(id=projects[0]).values_list(
        "organization_id", flat=True
    )
    organization_id = org_id_rows[0]

    releases = self._get_semver_versions_for_package(projects, organization_id, package)

    if environments:
        # Restrict to releases that are associated with the given environments.
        env_release_ids = ReleaseEnvironment.objects.filter(
            environment_id__in=environments
        ).values_list("release_id", flat=True)
        releases = releases.filter(id__in=env_release_ids)

    package_names = releases.values_list("package", flat=True)
    package_names = package_names.distinct().order_by("package")[:1000]

    scored = []
    for position, name in enumerate(package_names):
        scored.append((position, TagValue(SEMVER_PACKAGE_ALIAS, name, None, None, None)))
    return SequencePaginator(scored)
def test_ascending_simple(self):
    """Page forward, back, and past the end of an ascending sequence."""
    paginator = SequencePaginator([(n, n) for n in range(10)], reverse=False)

    # First page.
    first_page = paginator.get_result(5)
    assert list(first_page) == [0, 1, 2, 3, 4]
    assert first_page.prev == Cursor(0, 0, True, False)
    assert first_page.next == Cursor(5, 0, False, True)

    # Second (and last) page, reached through the `next` cursor.
    second_page = paginator.get_result(5, first_page.next)
    assert list(second_page) == [5, 6, 7, 8, 9]
    assert second_page.prev == Cursor(5, 0, True, True)
    assert second_page.next == Cursor(9, 1, False, False)

    # Going back through the `prev` cursor reproduces the first page.
    back_page = paginator.get_result(5, second_page.prev)
    assert list(back_page) == [0, 1, 2, 3, 4]
    assert back_page.prev == Cursor(0, 0, True, False)
    assert back_page.next == Cursor(5, 0, False, True)

    # A cursor beyond the largest score yields an empty page.
    beyond_page = paginator.get_result(5, Cursor(100, 0, False))
    assert list(beyond_page) == []
    assert beyond_page.prev == Cursor(9, 1, True, True)
    assert beyond_page.next == Cursor(9, 1, False, False)
def test_descending_repeated_scores(self):
    """Descending pagination when every item shares the same score."""
    paginator = SequencePaginator([(1, n) for n in range(10)], reverse=True)

    # First page; the cursor offset is what distinguishes equal scores.
    first_page = paginator.get_result(5)
    assert list(first_page) == [9, 8, 7, 6, 5]
    assert first_page.prev == Cursor(1, 0, True, False)
    assert first_page.next == Cursor(1, 5, False, True)

    # Second (and last) page, reached through the `next` cursor.
    second_page = paginator.get_result(5, first_page.next)
    assert list(second_page) == [4, 3, 2, 1, 0]
    assert second_page.prev == Cursor(1, 5, True, True)
    assert second_page.next == Cursor(1, 10, False, False)

    # Going back through the `prev` cursor reproduces the first page.
    back_page = paginator.get_result(5, second_page.prev)
    assert list(back_page) == [9, 8, 7, 6, 5]
    assert back_page.prev == Cursor(1, 0, True, False)
    assert back_page.next == Cursor(1, 5, False, True)

    # A cursor below the only score yields an empty page.
    beyond_page = paginator.get_result(5, Cursor(-10, 0, False))
    assert list(beyond_page) == []
    assert beyond_page.prev == Cursor(1, 10, True, True)
    assert beyond_page.next == Cursor(1, 10, False, False)
def test_descending_simple(self):
    """Page forward, back, and past the end of a descending sequence."""
    paginator = SequencePaginator([(n, n) for n in range(10)], reverse=True)

    # First page (highest scores first).
    first_page = paginator.get_result(5)
    assert list(first_page) == [9, 8, 7, 6, 5]
    assert first_page.prev == Cursor(9, 0, True, False)
    assert first_page.next == Cursor(4, 0, False, True)

    # Second (and last) page, reached through the `next` cursor.
    second_page = paginator.get_result(5, first_page.next)
    assert list(second_page) == [4, 3, 2, 1, 0]
    assert second_page.prev == Cursor(4, 0, True, True)
    assert second_page.next == Cursor(0, 1, False, False)

    # Going back through the `prev` cursor reproduces the first page.
    back_page = paginator.get_result(5, second_page.prev)
    assert list(back_page) == [9, 8, 7, 6, 5]
    assert back_page.prev == Cursor(9, 0, True, False)
    assert back_page.next == Cursor(4, 0, False, True)

    # A cursor below the smallest score yields an empty page.
    beyond_page = paginator.get_result(5, Cursor(-10, 0, False))
    assert list(beyond_page) == []
    assert beyond_page.prev == Cursor(0, 1, True, True)
    assert beyond_page.next == Cursor(0, 1, False, False)
def _query(self, project, retention_window_start, group_queryset, tags, environment,
           sort_by, limit, cursor, count_hits, paginator_options, **parameters):
    """Search groups of ``project`` by combining a Postgres queryset with Snuba.

    Steps, in order:
      1. Compute the ``start``/``end`` window from ``parameters`` and
         ``retention_window_start`` (never further back than 90 days).
      2. Apply the ``first_release`` condition to ``group_queryset``, joining
         through ``GroupEnvironment`` when ``environment`` is given.
      3. Pre-filter: collect up to ``MAX_PRE_SNUBA_CANDIDATES + 1`` candidate
         hashes from Postgres.  None -> empty result; too many -> do not pass
         candidates to Snuba and post-filter its results instead.
      4. Run ``snuba_search`` and, if needed, post-filter its groups against
         ``group_queryset`` in chunks of ``MAX_POST_SNUBA_CHUNK``.
      5. Paginate by score (descending) and swap the group ids in the page
         for ``Group`` instances.

    Returns the paginator result whose ``results`` are ``Group`` objects.
    """
    # TODO: Product decision: we currently search Group.message to handle
    # the `query` parameter, because that's what we've always done. We could
    # do that search against every event in Snuba instead, but results may
    # differ.

    now = timezone.now()
    end = parameters.get('date_to') or (now + ALLOWED_FUTURE_DELTA)
    # TODO: Presumably we want to search back to the project's full retention,
    # which may be higher than 90 days in the future, but apparently
    # `retention_window_start` can be None?
    # The latest of the non-empty lower bounds wins.
    start = max(
        filter(None, [
            retention_window_start,
            parameters.get('date_from'),
            now - timedelta(days=90)
        ])
    )
    assert start < end

    # TODO: It's possible `first_release` could be handled by Snuba.
    if environment is not None:
        # Environment-scoped search: `first_release` is matched against the
        # GroupEnvironment row, and the base queryset is joined to
        # GroupEnvironment for the requested environment.
        group_queryset = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(
                lambda queryset, version: queryset.extra(
                    where=[
                        '{} = {}'.format(
                            ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                            ds.get_sql_column(Release, 'id'),
                        ),
                        '{} = %s'.format(
                            ds.get_sql_column(Release, 'organization'),
                        ),
                        '{} = %s'.format(
                            ds.get_sql_column(Release, 'version'),
                        ),
                    ],
                    params=[project.organization_id, version],
                    tables=[Release._meta.db_table],
                ),
            ),
        }).build(
            group_queryset.extra(
                where=[
                    u'{} = {}'.format(
                        ds.get_sql_column(Group, 'id'),
                        ds.get_sql_column(GroupEnvironment, 'group_id'),
                    ),
                    u'{} = %s'.format(
                        ds.get_sql_column(GroupEnvironment, 'environment_id'),
                    ),
                ],
                params=[environment.id],
                tables=[GroupEnvironment._meta.db_table],
            ),
            parameters,
        )
    else:
        # No environment: `first_release` is matched on the Group itself.
        group_queryset = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(
                lambda queryset, version: queryset.filter(
                    first_release__organization_id=project.organization_id,
                    first_release__version=version,
                ),
            ),
        }).build(
            group_queryset,
            parameters,
        )

    # pre-filter query
    # Maps hash -> group_id; fetches one more than the maximum so that
    # "too many candidates" can be detected below.
    candidate_hashes = dict(
        GroupHash.objects.filter(
            group__in=group_queryset
        ).values_list(
            'hash', 'group_id'
        )[:MAX_PRE_SNUBA_CANDIDATES + 1]
    )
    metrics.timing('snuba.search.num_candidates', len(candidate_hashes))

    if not candidate_hashes:
        # no matches could possibly be found from this point on
        metrics.incr('snuba.search.no_candidates')
        return Paginator(Group.objects.none()).get_result()
    elif len(candidate_hashes) > MAX_PRE_SNUBA_CANDIDATES:
        # If the pre-filter query didn't include anything to significantly
        # filter down the number of results (from 'first_release', 'query',
        # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
        # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
        # might have surpassed the MAX_PRE_SNUBA_CANDIDATES. In this case,
        # we *don't* want to pass candidates down to Snuba, and instead we
        # want Snuba to do all the filtering/sorting it can and *then* apply
        # this queryset to the results from Snuba, which we call
        # post-filtering.
        metrics.incr('snuba.search.too_many_candidates')
        candidate_hashes = None

    sort, extra_aggregations, score_fn = sort_strategies[sort_by]

    # {group_id: group_score, ...}
    snuba_groups = snuba_search(
        project_id=project.id,
        environment_id=environment and environment.id,
        tags=tags,
        start=start,
        end=end,
        sort=sort,
        extra_aggregations=extra_aggregations,
        score_fn=score_fn,
        candidate_hashes=candidate_hashes,
        **parameters
    )
    metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))

    if candidate_hashes:
        # pre-filtered candidates were passed down to Snuba,
        # so we're finished with filtering
        result_groups = snuba_groups.items()
    else:
        # pre-filtered candidates were *not* passed down to Snuba,
        # so we need to do post-filtering to verify Sentry DB predicates
        result_groups = []
        # `i` ends up as the number of chunks processed (0 if there were none).
        i = 0
        for i, chunk in enumerate(chunked(snuba_groups.items(), MAX_POST_SNUBA_CHUNK), 1):
            filtered_group_ids = group_queryset.filter(
                id__in=[gid for gid, _ in chunk]
            ).values_list('id', flat=True)

            result_groups.extend(
                (group_id, snuba_groups[group_id])
                for group_id in filtered_group_ids
            )

        metrics.timing('snuba.search.num_post_filters', i)

    # Paginate (score, group_id) pairs, highest score first.
    paginator_results = SequencePaginator(
        [(score, id) for (id, score) in result_groups],
        reverse=True,
        **paginator_options
    ).get_result(limit, cursor, count_hits=count_hits)

    # Replace the ids in the page with Group instances, preserving page order
    # and dropping ids that `in_bulk` did not return.
    groups = Group.objects.in_bulk(paginator_results.results)
    paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

    return paginator_results
def query(
    self,
    projects: Sequence[Project],
    retention_window_start: Optional[datetime],
    group_queryset: QuerySet,
    environments: Sequence[Environment],
    sort_by: str,
    limit: int,
    cursor: Optional[Cursor],
    count_hits: bool,
    paginator_options: Mapping[str, Any],
    search_filters: Sequence[SearchFilter],
    date_from: Optional[datetime],
    date_to: Optional[datetime],
    max_hits=None,
) -> CursorResult:
    """Run an issue search as a single SnQL query joining events to groups.

    Builds one query that selects group ids with a sort score (ordered
    descending, ``limit + 1`` rows), optionally a second query counting
    distinct groups when ``count_hits`` is set, paginates the rows, and
    finally re-checks the page against ``group_queryset`` in Postgres.

    Raises:
        InvalidQueryForExecutor: if ``validate_cdc_search_filters`` rejects
            ``search_filters``.

    Returns ``self.empty_result`` when the computed time range is entirely
    at the retention date or is empty/inverted.
    """
    if not validate_cdc_search_filters(search_filters):
        raise InvalidQueryForExecutor(
            "Search filters invalid for this query executor")

    start, end, retention_date = self.calculate_start_end(
        retention_window_start, search_filters, date_from, date_to)

    if start == retention_date and end == retention_date:
        # Both `start` and `end` must have been trimmed to `retention_date`,
        # so this entire search was against a time range that is outside of
        # retention. We'll return empty results to maintain backwards compatibility
        # with Django search (for now).
        return self.empty_result

    if start >= end:
        # TODO: This maintains backwards compatibility with Django search, but
        # in the future we should find a way to notify the user that their search
        # is invalid.
        return self.empty_result

    e_event = self.entities["event"]
    e_group = self.entities["group"]

    # Base conditions: the requested projects and the [start, end) window.
    where_conditions = [
        Condition(Column("project_id", e_event), Op.IN, [p.id for p in projects]),
        Condition(Column("timestamp", e_event), Op.GTE, start),
        Condition(Column("timestamp", e_event), Op.LT, end),
    ]
    # TODO: This is still basically only handling status, handle this better once we introduce
    # more conditions.
    for search_filter in search_filters:
        where_conditions.append(
            Condition(Column(search_filter.key.name, e_group), Op.IN,
                      search_filter.value.raw_value))

    if environments:
        # TODO: Should this be handled via filter_keys, once we have a snql compatible version?
        where_conditions.append(
            Condition(Column("environment", e_event), Op.IN,
                      [e.name for e in environments]))

    sort_func = self.aggregation_defs[self.sort_strategies[sort_by]]

    # When a cursor is given, bound the sort score by the cursor value:
    # `>=` for a "previous" cursor, `<=` otherwise.
    having = []
    if cursor is not None:
        op = Op.GTE if cursor.is_prev else Op.LTE
        having.append(Condition(sort_func, op, cursor.value))

    query = Query(
        "events",
        match=Join([Relationship(e_event, "grouped", e_group)]),
        select=[
            Column("id", e_group),
            replace(sort_func, alias="score"),
        ],
        where=where_conditions,
        groupby=[Column("id", e_group)],
        having=having,
        orderby=[OrderBy(sort_func, direction=Direction.DESC)],
        # One row more than the page size is requested.
        limit=Limit(limit + 1),
    )

    data = snuba.raw_snql_query(
        query, referrer="search.snuba.cdc_search.query")["data"]

    # Counts distinct groups matching the same `where` conditions; only
    # executed when `count_hits` is set.
    hits_query = Query(
        "events",
        match=Join([Relationship(e_event, "grouped", e_group)]),
        select=[
            Function("uniq", [Column("id", e_group)], alias="count"),
        ],
        where=where_conditions,
    )
    hits = None
    if count_hits:
        hits = snuba.raw_snql_query(
            hits_query, referrer="search.snuba.cdc_search.hits")["data"][0]["count"]

    # Paginate (score, group_id) pairs, highest score first.
    paginator_results = SequencePaginator(
        [(row["score"], row["g.id"]) for row in data],
        reverse=True,
        **paginator_options,
    ).get_result(limit, cursor, known_hits=hits, max_hits=max_hits)
    # We filter against `group_queryset` here so that we recheck all conditions in Postgres.
    # Since replay between Postgres and Clickhouse can happen, we might get back results that
    # have changed state in Postgres. By rechecking them we guarantee that any returned results
    # have the correct state.
    # TODO: This can result in us returning less than a full page of results, but shouldn't
    # affect cursors. If we want to, we can iterate and query snuba until we manage to get a
    # full page. In practice, this will likely only skip a couple of results at worst, and
    # probably not be noticeable to the user, so holding off for now to reduce complexity.
    groups = group_queryset.in_bulk(paginator_results.results)
    paginator_results.results = [
        groups[k] for k in paginator_results.results if k in groups
    ]
    return paginator_results
def query(
    self,
    projects,
    retention_window_start,
    group_queryset,
    environments,
    sort_by,
    limit,
    cursor,
    count_hits,
    paginator_options,
    search_filters,
    date_from,
    date_to,
    max_hits=None,
):
    """Search groups by combining the Postgres ``group_queryset`` with Snuba.

    Outline:
      1. Resolve ``end``; when no explicit end exists and the request is a
         first-page ``date`` sort with only Postgres-answerable filters,
         answer directly from Postgres with a ``DateTimePaginator``.
      2. Resolve ``start`` (bounded by retention / 90 days) and return
         ``self.empty_result`` for out-of-retention or empty ranges.
      3. Pre-filter: fetch up to ``max_candidates + 1`` group ids from
         Postgres.  None -> empty result; too many -> query Snuba without
         candidates and post-filter its results against Postgres.
      4. Query Snuba in growing chunks until the page is full, results run
         out, or the time budget is exhausted.
      5. Patch the cursors' ``has_results`` flags and swap the ids in the
         page for ``Group`` instances.

    Returns the paginator result whose ``results`` are ``Group`` objects.
    """
    now = timezone.now()
    end = None
    # Candidate upper bounds: the explicit `date_to` and any `date <` filter.
    end_params = [
        _f for _f in
        [date_to, get_search_filter(search_filters, "date", "<")] if _f
    ]
    if end_params:
        end = min(end_params)

    if not end:
        end = now + ALLOWED_FUTURE_DELTA

        # NOTE(review): this counter is incremented before the Postgres-only
        # check below, i.e. also when the shortcut is not taken -- confirm
        # that this is intended.
        metrics.incr("snuba.search.postgres_only")

        # This search is for some time window that ends with "now",
        # so if the requested sort is `date` (`last_seen`) and there
        # are no other Snuba-based search predicates, we can simply
        # return the results from Postgres.
        if (cursor is None and sort_by == "date" and
                # This handles tags and date parameters for search filters.
                not [
                    sf for sf in search_filters
                    if sf.key.name not in self.postgres_only_fields.union(["date"])
                ]):
            group_queryset = group_queryset.order_by("-last_seen")
            paginator = DateTimePaginator(group_queryset, "-last_seen",
                                          **paginator_options)
            # When it's a simple django-only search, we count_hits like normal
            return paginator.get_result(limit,
                                        cursor,
                                        count_hits=count_hits,
                                        max_hits=max_hits)

    # TODO: Presumably we only want to search back to the project's max
    # retention date, which may be closer than 90 days in the past, but
    # apparently `retention_window_start` can be None(?), so we need a
    # fallback.
    retention_date = max(
        _f for _f in [retention_window_start, now - timedelta(days=90)]
        if _f)
    start_params = [
        date_from, retention_date,
        get_search_filter(search_filters, "date", ">")
    ]
    # The latest non-empty lower bound wins; `end` is never allowed to fall
    # before the retention date.
    start = max(_f for _f in start_params if _f)
    end = max([retention_date, end])

    if start == retention_date and end == retention_date:
        # Both `start` and `end` must have been trimmed to `retention_date`,
        # so this entire search was against a time range that is outside of
        # retention. We'll return empty results to maintain backwards compatibility
        # with Django search (for now).
        return self.empty_result

    if start >= end:
        # TODO: This maintains backwards compatibility with Django search, but
        # in the future we should find a way to notify the user that their search
        # is invalid.
        return self.empty_result

    # Here we check if all the django filters reduce the set of groups down
    # to something that we can send down to Snuba in a `group_id IN (...)`
    # clause.
    max_candidates = options.get("snuba.search.max-pre-snuba-candidates")

    with sentry_sdk.start_span(op="snuba_group_query") as span:
        # One id more than the maximum is fetched so that "too many
        # candidates" can be detected below.
        group_ids = list(
            group_queryset.values_list("id", flat=True)[:max_candidates + 1])
        span.set_data("Max Candidates", max_candidates)
        span.set_data("Result Size", len(group_ids))
    metrics.timing("snuba.search.num_candidates", len(group_ids))

    too_many_candidates = False
    if not group_ids:
        # no matches could possibly be found from this point on
        metrics.incr("snuba.search.no_candidates", skip_internal=False)
        return self.empty_result
    elif len(group_ids) > max_candidates:
        # If the pre-filter query didn't include anything to significantly
        # filter down the number of results (from 'first_release', 'status',
        # 'bookmarked_by', 'assigned_to', 'unassigned', or 'subscribed_by')
        # then it might have surpassed the `max_candidates`. In this case,
        # we *don't* want to pass candidates down to Snuba, and instead we
        # want Snuba to do all the filtering/sorting it can and *then* apply
        # this queryset to the results from Snuba, which we call
        # post-filtering.
        metrics.incr("snuba.search.too_many_candidates", skip_internal=False)
        too_many_candidates = True
        group_ids = []

    sort_field = self.sort_strategies[sort_by]
    chunk_growth = options.get("snuba.search.chunk-growth-rate")
    max_chunk_size = options.get("snuba.search.max-chunk-size")
    chunk_limit = limit
    offset = 0
    num_chunks = 0
    hits = self.calculate_hits(
        group_ids,
        too_many_candidates,
        sort_field,
        projects,
        retention_window_start,
        group_queryset,
        environments,
        sort_by,
        limit,
        cursor,
        count_hits,
        paginator_options,
        search_filters,
        start,
        end,
    )
    if count_hits and hits == 0:
        return self.empty_result

    paginator_results = self.empty_result
    # Accumulated (group_id, score) pairs and the ids already accepted.
    result_groups = []
    result_group_ids = set()

    max_time = options.get("snuba.search.max-total-chunk-time-seconds")
    time_start = time.time()

    # Do smaller searches in chunks until we have enough results
    # to answer the query (or hit the end of possible results). We do
    # this because a common case for search is to return 100 groups
    # sorted by `last_seen`, and we want to avoid returning all of
    # a project's groups and then post-sorting them all in Postgres
    # when typically the first N results will do.
    while (time.time() - time_start) < max_time:
        num_chunks += 1

        # grow the chunk size on each iteration to account for huge projects
        # and weird queries, up to a max size
        chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
        # but if we have group_ids always query for at least that many items
        chunk_limit = max(chunk_limit, len(group_ids))

        # {group_id: group_score, ...}
        snuba_groups, total = self.snuba_search(
            start=start,
            end=end,
            project_ids=[p.id for p in projects],
            environment_ids=environments
            and [environment.id for environment in environments],
            organization_id=projects[0].organization_id,
            sort_field=sort_field,
            cursor=cursor,
            group_ids=group_ids,
            limit=chunk_limit,
            offset=offset,
            search_filters=search_filters,
        )
        metrics.timing("snuba.search.num_snuba_results", len(snuba_groups))
        count = len(snuba_groups)
        # Evaluated against the offset *before* this chunk is added to it.
        more_results = count >= limit and (offset + limit) < total
        offset += len(snuba_groups)

        if not snuba_groups:
            break

        if group_ids:
            # pre-filtered candidates were passed down to Snuba, so we're
            # finished with filtering and these are the only results. Note
            # that because we set the chunk size to at least the size of
            # the group_ids, we know we got all of them (ie there are
            # no more chunks after the first)
            result_groups = snuba_groups
            if count_hits and hits is None:
                hits = len(snuba_groups)
        else:
            # pre-filtered candidates were *not* passed down to Snuba,
            # so we need to do post-filtering to verify Sentry DB predicates
            filtered_group_ids = group_queryset.filter(
                id__in=[gid for gid, _ in snuba_groups]).values_list("id", flat=True)

            group_to_score = dict(snuba_groups)
            for group_id in filtered_group_ids:
                if group_id in result_group_ids:
                    # because we're doing multiple Snuba queries, which
                    # happen outside of a transaction, there is a small possibility
                    # of groups moving around in the sort scoring underneath us,
                    # so we at least want to protect against duplicates
                    continue

                group_score = group_to_score[group_id]
                result_group_ids.add(group_id)
                result_groups.append((group_id, group_score))

        # break the query loop for one of three reasons:
        # * we started with Postgres candidates and so only do one Snuba query max
        # * the paginator is returning enough results to satisfy the query (>= the limit)
        # * there are no more groups in Snuba to post-filter
        # TODO do we actually have to rebuild this SequencePaginator every time
        # or can we just make it after we've broken out of the loop?
        paginator_results = SequencePaginator(
            [(score, id) for (id, score) in result_groups],
            reverse=True,
            **paginator_options).get_result(limit,
                                            cursor,
                                            known_hits=hits,
                                            max_hits=max_hits)

        if group_ids or len(
                paginator_results.results) >= limit or not more_results:
            break

    # HACK: We're using the SequencePaginator to mask the complexities of going
    # back and forth between two databases. This causes a problem with pagination
    # because we're 'lying' to the SequencePaginator (it thinks it has the entire
    # result set in memory when it does not). For this reason we need to make some
    # best guesses as to whether the `prev` and `next` cursors have more results.

    if len(paginator_results.results) == limit and more_results:
        # Because we are going back and forth between DBs there is a small
        # chance that we will hand the SequencePaginator exactly `limit`
        # items. In this case the paginator will assume there are no more
        # results, so we need to override the `next` cursor's results.
        paginator_results.next.has_results = True

    if cursor is not None and (not cursor.is_prev
                               or len(paginator_results.results) > 0):
        # If the user passed a cursor, and it isn't already a 0 result `is_prev`
        # cursor, then it's worth allowing them to go back a page to check for
        # more results.
        paginator_results.prev.has_results = True

    metrics.timing("snuba.search.num_chunks", num_chunks)

    # Replace the ids in the page with Group instances, preserving page order
    # and dropping ids that `in_bulk` did not return.
    groups = Group.objects.in_bulk(paginator_results.results)
    paginator_results.results = [
        groups[k] for k in paginator_results.results if k in groups
    ]

    return paginator_results
def test_hits(self):
    """With ``count_hits=True`` the result reports the full sequence length."""
    total = 10
    pairs = [(value, value) for value in range(total)]

    page = SequencePaginator(pairs).get_result(5, count_hits=True)

    assert page.hits == total
def _query(self, projects, retention_window_start, group_queryset, tags, environments,
           sort_by, limit, cursor, count_hits, paginator_options, **parameters):
    """
    Run a group search for exactly one project and at most one environment.

    Raises ``NotImplementedError`` when ``projects`` does not contain exactly
    one project or when more than one environment is passed.

    With an environment:
      * an ``'environment'`` entry in ``tags`` is popped (the caller's dict is
        mutated) and asserted to be ``ANY`` or to name the given environment;
      * if ``date_from``/``date_to`` are in ``parameters``, groups are limited
        to those referenced by matching event tags (first 1000 group IDs);
      * the group queryset is joined against ``GroupEnvironment`` and narrowed
        by conservative group-level conditions, then the first 10000 group IDs
        are used to load per-environment group tag values;
      * the exact per-environment conditions are applied to those tag values,
        producing ``{group_id: sort_value}`` candidates, optionally narrowed by
        ``tags``, and paged with a ``SequencePaginator``.

    Without an environment the group queryset is filtered directly and paged
    with the paginator class registered in ``sort_strategies`` for ``sort_by``.

    Returns the paginator's result object. In the environment branch its
    ``results`` are replaced by ``Group`` instances, dropping IDs that
    ``Group.objects.in_bulk`` does not return.
    """
    from sentry.models import (Group, Environment, Event, GroupEnvironment, Release)

    # this backend only supports search within one project/environment
    if len(projects) != 1 or (environments is not None and len(environments) > 1):
        raise NotImplementedError

    project = projects[0]
    # NOTE(review): an empty (non-None) `environments` list would raise
    # IndexError here -- confirm callers never pass one.
    environment = environments[0] if environments is not None else environments

    if environment is not None:
        if 'environment' in tags:
            # The environment is already fixed by the `environments` argument,
            # so the tag is removed and only sanity-checked.
            environment_name = tags.pop('environment')
            assert environment_name is ANY or Environment.objects.get(
                projects=project,
                name=environment_name,
            ).id == environment.id

        event_queryset_builder = QuerySetBuilder({
            'date_from': ScalarCondition('date_added', 'gt'),
            'date_to': ScalarCondition('date_added', 'lt'),
        })

        # Only hit the event tag table when a date bound was actually supplied.
        if any(key in parameters for key in event_queryset_builder.conditions.keys()):
            event_queryset = event_queryset_builder.build(
                tagstore.get_event_tag_qs(
                    project_id=project.id,
                    environment_id=environment.id,
                    key='environment',
                    value=environment.name,
                ),
                parameters,
            )
            if retention_window_start is not None:
                event_queryset = event_queryset.filter(date_added__gte=retention_window_start)

            # Only the first 1000 distinct group IDs are used for the filter.
            group_queryset = group_queryset.filter(
                id__in=list(event_queryset.distinct().values_list('group_id', flat=True)[:1000])
            )

        # The paginator class from `sort_strategies` is not used in this
        # branch; only the sort clause is.
        _, group_queryset_sort_clause = sort_strategies[sort_by]
        group_queryset = QuerySetBuilder({
            'first_release': CallbackCondition(
                lambda queryset, version: queryset.extra(
                    where=[
                        '{} = {}'.format(
                            get_sql_column(GroupEnvironment, 'first_release_id'),
                            get_sql_column(Release, 'id'),
                        ),
                        '{} = %s'.format(
                            get_sql_column(Release, 'organization'),
                        ),
                        '{} = %s'.format(
                            get_sql_column(Release, 'version'),
                        ),
                    ],
                    params=[project.organization_id, version],
                    tables=[Release._meta.db_table],
                ),
            ),
            'times_seen': CallbackCondition(
                # This condition represents the exact number of times that
                # an issue has been seen in an environment. Since an issue
                # can't be seen in an environment more times than the issue
                # was seen overall, we can safely exclude any groups that
                # don't have at least that many events.
                lambda queryset, times_seen: queryset.exclude(
                    times_seen__lt=times_seen,
                ),
            ),
            'times_seen_lower': CallbackCondition(
                # This condition represents the lower threshold for the
                # number of times an issue has been seen in an environment.
                # Since an issue can't be seen in an environment more times
                # than the issue was seen overall, we can safely exclude
                # any groups that haven't met that threshold.
                lambda queryset, times_seen: queryset.exclude(
                    times_seen__lt=times_seen,
                ),
            ),
            # The following conditions make a few assertions that are
            # correct in an abstract sense but may not accurately reflect
            # the existing implementation (see GH-5289). These assumptions
            # are that 1. The first seen time for a Group is the minimum
            # value of the first seen time for all of its GroupEnvironment
            # relations; 2. The last seen time for a Group is the maximum
            # value of the last seen time for all of its GroupEnvironment
            # relations; 3. The first seen time is always less than or
            # equal to the last seen time.
            'age_from': CallbackCondition(
                # This condition represents the lower threshold for "first
                # seen" time for an environment. Due to assertions #1 and
                # #3, we can exclude any groups where the "last seen" time
                # is prior to this timestamp.
                lambda queryset, first_seen: queryset.exclude(
                    last_seen__lt=first_seen,
                ),
            ),
            'age_to': CallbackCondition(
                # This condition represents the upper threshold for "first
                # seen" time for an environment. Due to assertions #1, we
                # can exclude any values where the group first seen is
                # greater than that threshold.
                lambda queryset, first_seen: queryset.exclude(
                    first_seen__gt=first_seen,
                ),
            ),
            'last_seen_from': CallbackCondition(
                # This condition represents the lower threshold for "last
                # seen" time for an environment. Due to assertion #2, we
                # can exclude any values where the group last seen value is
                # less than that threshold.
                lambda queryset, last_seen: queryset.exclude(
                    last_seen__lt=last_seen,
                ),
            ),
            'last_seen_to': CallbackCondition(
                # This condition represents the upper threshold for "last
                # seen" time for an environment. Due to assertions #2 and
                # #3, we can exclude any values where the group first seen
                # value is greater than that threshold.
                lambda queryset, last_seen: queryset.exclude(
                    first_seen__gt=last_seen,
                ),
            ),
        }).build(
            # Join the group table against GroupEnvironment rows for the
            # requested environment.
            group_queryset.extra(
                where=[
                    '{} = {}'.format(
                        get_sql_column(Group, 'id'),
                        get_sql_column(GroupEnvironment, 'group_id'),
                    ),
                    '{} = %s'.format(
                        get_sql_column(GroupEnvironment, 'environment_id'),
                    ),
                ],
                params=[environment.id],
                tables=[GroupEnvironment._meta.db_table],
            ),
            parameters,
        ).order_by(group_queryset_sort_clause)

        get_sort_expression, sort_value_to_cursor_value = environment_sort_strategies[sort_by]

        # Only the first 10000 group IDs (in sort order) are considered when
        # loading the per-environment tag values.
        group_tag_value_queryset = tagstore.get_group_tag_value_qs(
            project_id=project.id,
            group_id=set(group_queryset.values_list('id', flat=True)[:10000]),
            environment_id=environment.id,
            key='environment',
            value=environment.name,
        )

        if retention_window_start is not None:
            group_tag_value_queryset = group_tag_value_queryset.filter(
                last_seen__gte=retention_window_start
            )

        # {group_id: sort_value}, built from the exact per-environment
        # conditions applied to the group tag value rows.
        candidates = dict(
            QuerySetBuilder({
                'age_from': ScalarCondition('first_seen', 'gt'),
                'age_to': ScalarCondition('first_seen', 'lt'),
                'last_seen_from': ScalarCondition('last_seen', 'gt'),
                'last_seen_to': ScalarCondition('last_seen', 'lt'),
                'times_seen': CallbackCondition(
                    lambda queryset, times_seen: queryset.filter(times_seen=times_seen),
                ),
                'times_seen_lower': ScalarCondition('times_seen', 'gt'),
                'times_seen_upper': ScalarCondition('times_seen', 'lt'),
            }).build(
                group_tag_value_queryset,
                parameters,
            ).extra(
                select={
                    'sort_value': get_sort_expression(group_tag_value_queryset.model),
                },
            ).values_list('group_id', 'sort_value')
        )

        if tags:
            # TODO: `get_group_ids_for_search_filter` should be able to
            # utilize the retention window start parameter for additional
            # optimizations.
            matches = tagstore.get_group_ids_for_search_filter(
                project_id=project.id,
                environment_id=environment.id,
                tags=tags,
                candidates=candidates.keys(),
                limit=len(candidates),
            )

            # Drop every candidate the tag filter did not return; a falsy
            # `matches` removes all candidates.
            for key in set(candidates) - set(matches or []):
                del candidates[key]

        result = SequencePaginator(
            [(sort_value_to_cursor_value(score), id) for (id, score) in candidates.items()],
            reverse=True,
            **paginator_options
        ).get_result(limit, cursor, count_hits=count_hits)

        # Swap the paged group IDs for Group instances, keeping page order.
        groups = Group.objects.in_bulk(result.results)
        result.results = [groups[k] for k in result.results if k in groups]

        return result
    else:
        event_queryset_builder = QuerySetBuilder({
            'date_from': ScalarCondition('datetime', 'gt'),
            'date_to': ScalarCondition('datetime', 'lt'),
        })

        # Only query events when a date bound was supplied; the filter uses
        # the first 1000 distinct group IDs.
        if any(key in parameters for key in event_queryset_builder.conditions.keys()):
            group_queryset = group_queryset.filter(
                id__in=list(
                    event_queryset_builder.build(
                        Event.objects.filter(project_id=project.id),
                        parameters,
                    ).distinct().values_list('group_id', flat=True)[:1000],
                )
            )

        group_queryset = QuerySetBuilder({
            'first_release': CallbackCondition(
                lambda queryset, version: queryset.filter(
                    first_release__organization_id=project.organization_id,
                    first_release__version=version,
                ),
            ),
            'age_from': ScalarCondition('first_seen', 'gt'),
            'age_to': ScalarCondition('first_seen', 'lt'),
            'last_seen_from': ScalarCondition('last_seen', 'gt'),
            'last_seen_to': ScalarCondition('last_seen', 'lt'),
            'times_seen': CallbackCondition(
                lambda queryset, times_seen: queryset.filter(times_seen=times_seen),
            ),
            'times_seen_lower': ScalarCondition('times_seen', 'gt'),
            'times_seen_upper': ScalarCondition('times_seen', 'lt'),
        }).build(
            group_queryset,
            parameters,
        ).extra(
            select={
                'sort_value': get_sort_clause(sort_by),
            },
        )

        if tags:
            group_ids = tagstore.get_group_ids_for_search_filter(
                project_id=project.id,
                environment_id=None,
                tags=tags,
                candidates=None,
            )

            # A falsy result from the tag filter means no group can match.
            if group_ids:
                group_queryset = group_queryset.filter(id__in=group_ids)
            else:
                group_queryset = group_queryset.none()

        paginator_cls, sort_clause = sort_strategies[sort_by]
        group_queryset = group_queryset.order_by(sort_clause)
        paginator = paginator_cls(group_queryset, sort_clause, **paginator_options)
        return paginator.get_result(limit, cursor, count_hits=count_hits)
def _query(self, projects, retention_window_start, group_queryset, tags, environments,
           sort_by, limit, cursor, count_hits, paginator_options, **parameters):
    """
    Search groups across ``projects`` by combining a Postgres queryset with
    chunked Snuba queries.

    Outline:
      1. Narrow ``group_queryset`` by the optional ``first_release`` parameter
         (joined through ``GroupEnvironment`` when ``environments`` is given).
      2. When no ``date_to`` was given and nothing else requires Snuba (no
         cursor, ``sort_by == 'date'``, no tags, no environments, none of the
         age/last_seen/times_seen parameters), answer from Postgres alone with
         a ``DateTimePaginator``.
      3. Clamp the search window to the retention date and return
         ``EMPTY_RESULT`` when the window is empty.
      4. Decide how many Postgres candidate IDs may be sent to Snuba
         (``num_candidates``). If the queryset yields more than that, no
         candidates are sent and Snuba results are post-filtered through
         ``group_queryset`` instead.
      5. Query Snuba in growing chunks until enough results exist for the
         requested page, Snuba is exhausted, or the time budget is used up.

    Returns a paginator result whose ``results`` are ``Group`` instances; IDs
    that ``Group.objects.in_bulk`` does not return are dropped.

    ``count_hits`` is accepted for interface compatibility, but every
    paginator call in this method passes ``count_hits=False``.
    """

    # TODO: Product decision: we currently search Group.message to handle
    # the `query` parameter, because that's what we've always done. We could
    # do that search against every event in Snuba instead, but results may
    # differ.

    # TODO: It's possible `first_release` could be handled by Snuba.
    if environments is not None:
        # NOTE(review): an empty (non-None) `environments` list would render
        # `IN ()` below -- confirm callers never pass one.
        group_queryset = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(
                lambda queryset, version: queryset.extra(
                    where=[
                        '{} = {}'.format(
                            ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                            ds.get_sql_column(Release, 'id'),
                        ),
                        '{} = %s'.format(
                            ds.get_sql_column(Release, 'organization'),
                        ),
                        '{} = %s'.format(
                            ds.get_sql_column(Release, 'version'),
                        ),
                    ],
                    params=[projects[0].organization_id, version],
                    tables=[Release._meta.db_table],
                ),
            ),
        }).build(
            # Join groups against GroupEnvironment rows for the requested
            # environments, one `%s` placeholder per environment.
            group_queryset.extra(
                where=[
                    u'{} = {}'.format(
                        ds.get_sql_column(Group, 'id'),
                        ds.get_sql_column(GroupEnvironment, 'group_id'),
                    ),
                    u'{} IN ({})'.format(
                        ds.get_sql_column(GroupEnvironment, 'environment_id'),
                        ', '.join(['%s' for e in environments])
                    ),
                ],
                params=[environment.id for environment in environments],
                tables=[GroupEnvironment._meta.db_table],
            ),
            parameters,
        )
    else:
        group_queryset = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(
                lambda queryset, version: queryset.filter(
                    first_release__organization_id=projects[0].organization_id,
                    first_release__version=version,
                ),
            ),
        }).build(
            group_queryset,
            parameters,
        )

    now = timezone.now()
    end = parameters.get('date_to')
    if not end:
        end = now + ALLOWED_FUTURE_DELTA

        # This search is for some time window that ends with "now",
        # so if the requested sort is `date` (`last_seen`) and there
        # are no other Snuba-based search predicates, we can simply
        # return the results from Postgres.
        if cursor is None \
                and sort_by == 'date' \
                and not tags \
                and not environments \
                and not any(param in parameters for param in [
                    'age_from', 'age_to', 'last_seen_from',
                    'last_seen_to', 'times_seen', 'times_seen_lower',
                    'times_seen_upper'
                ]):
            group_queryset = group_queryset.order_by('-last_seen')
            paginator = DateTimePaginator(group_queryset, '-last_seen', **paginator_options)
            return paginator.get_result(limit, cursor, count_hits=False)

    # TODO: Presumably we only want to search back to the project's max
    # retention date, which may be closer than 90 days in the past, but
    # apparently `retention_window_start` can be None(?), so we need a
    # fallback.
    retention_date = max(
        filter(None, [
            retention_window_start,
            now - timedelta(days=90)
        ])
    )

    start = max(
        filter(None, [
            retention_date,
            parameters.get('date_from'),
        ])
    )

    end = max([
        retention_date,
        end
    ])

    if start == retention_date and end == retention_date:
        # Both `start` and `end` must have been trimmed to `retention_date`,
        # so this entire search was against a time range that is outside of
        # retention. We'll return empty results to maintain backwards compatibility
        # with Django search (for now).
        return EMPTY_RESULT

    if start >= end:
        # TODO: This maintains backwards compatibility with Django search, but
        # in the future we should find a way to notify the user that their search
        # is invalid.
        return EMPTY_RESULT

    # num_candidates is the number of Group IDs to send down to Snuba, if
    # more Group ID candidates are found, a "bare" Snuba search is performed
    # and the result groups are then post-filtered via queries to the Sentry DB
    optimizer_enabled = options.get('snuba.search.pre-snuba-candidates-optimizer')
    if optimizer_enabled:
        keys = [self._get_project_count_cache_key(p.id) for p in projects]

        # Per-project group counts that are already cached.
        counts_by_projects = {
            self._get_project_id_from_key(key): count
            for key, count in cache.get_many(keys).items()
        }

        missed_projects = {p.id for p in projects} - set(counts_by_projects)

        if missed_projects:
            # Ask Snuba for the counts that were not cached. The lower bound
            # is the retention date computed above (same expression, same
            # `now`), so it is reused rather than recomputed.
            missing_counts = snuba.query(
                start=retention_date,
                end=now,
                groupby=['project_id'],
                filter_keys={
                    'project_id': list(missed_projects),
                },
                aggregations=[['uniq', 'group_id', 'group_count']],
                referrer='search',
            )

            cache.set_many({
                self._get_project_count_cache_key(project_id): count
                for project_id, count in missing_counts.items()
            }, options.get('snuba.search.project-group-count-cache-time'))

            counts_by_projects.update(missing_counts)

        min_candidates = options.get('snuba.search.min-pre-snuba-candidates')
        max_candidates = options.get('snuba.search.max-pre-snuba-candidates')
        candidates_percentage = options.get('snuba.search.pre-snuba-candidates-percentage')

        # A percentage of the total group count, clamped to [min, max].
        num_candidates = max(
            min_candidates,
            min(
                max_candidates,
                sum(counts_by_projects.values()) * candidates_percentage
            )
        )
    else:
        num_candidates = options.get('snuba.search.min-pre-snuba-candidates')

    # pre-filter query
    candidate_ids = None
    if num_candidates and limit <= num_candidates:
        # Fetch one more than allowed so "too many candidates" is detectable.
        candidate_ids = list(
            group_queryset.values_list('id', flat=True)[:num_candidates + 1]
        )
        metrics.timing('snuba.search.num_candidates', len(candidate_ids))

        if not candidate_ids:
            # no matches could possibly be found from this point on
            metrics.incr('snuba.search.no_candidates', skip_internal=False)
            return EMPTY_RESULT
        elif len(candidate_ids) > num_candidates:
            # If the pre-filter query didn't include anything to significantly
            # filter down the number of results (from 'first_release', 'query',
            # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
            # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
            # might have surpassed the `num_candidates`. In this case,
            # we *don't* want to pass candidates down to Snuba, and instead we
            # want Snuba to do all the filtering/sorting it can and *then* apply
            # this queryset to the results from Snuba, which we call
            # post-filtering.
            metrics.incr('snuba.search.too_many_candidates', skip_internal=False)
            candidate_ids = None

    sort_field = sort_strategies[sort_by]
    chunk_growth = options.get('snuba.search.chunk-growth-rate')
    max_chunk_size = options.get('snuba.search.max-chunk-size')
    chunk_limit = limit
    offset = 0
    num_chunks = 0

    paginator_results = EMPTY_RESULT
    result_groups = []
    result_group_ids = set()

    max_time = options.get('snuba.search.max-total-chunk-time-seconds')
    time_start = time.time()

    # Do smaller searches in chunks until we have enough results
    # to answer the query (or hit the end of possible results). We do
    # this because a common case for search is to return 100 groups
    # sorted by `last_seen`, and we want to avoid returning all of
    # a project's groups and then post-sorting them all in Postgres
    # when typically the first N results will do.
    while (time.time() - time_start) < max_time:
        num_chunks += 1

        # grow the chunk size on each iteration to account for huge projects
        # and weird queries, up to a max size
        chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)

        # but if we have candidate_ids always query for at least that many items
        chunk_limit = max(chunk_limit, len(candidate_ids) if candidate_ids else 0)

        # `snuba_groups` is consumed below as (group_id, group_score) pairs
        snuba_groups, more_results = snuba_search(
            start=start,
            end=end,
            project_ids=[p.id for p in projects],
            environment_ids=environments and [environment.id for environment in environments],
            tags=tags,
            sort_field=sort_field,
            cursor=cursor,
            candidate_ids=candidate_ids,
            limit=chunk_limit,
            offset=offset,
            **parameters
        )
        metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))
        offset += len(snuba_groups)

        if not snuba_groups:
            break

        if candidate_ids:
            # pre-filtered candidates were passed down to Snuba,
            # so we're finished with filtering and these are the
            # only results
            result_groups = snuba_groups
        else:
            # pre-filtered candidates were *not* passed down to Snuba,
            # so we need to do post-filtering to verify Sentry DB predicates
            filtered_group_ids = group_queryset.filter(
                id__in=[gid for gid, _ in snuba_groups]
            ).values_list('id', flat=True)

            group_to_score = dict(snuba_groups)
            for group_id in filtered_group_ids:
                if group_id in result_group_ids:
                    # because we're doing multiple Snuba queries, which
                    # happen outside of a transaction, there is a small possibility
                    # of groups moving around in the sort scoring underneath us,
                    # so we at least want to protect against duplicates
                    continue

                group_score = group_to_score[group_id]
                result_group_ids.add(group_id)
                result_groups.append((group_id, group_score))

        paginator_results = SequencePaginator(
            [(score, id) for (id, score) in result_groups],
            reverse=True,
            **paginator_options
        ).get_result(limit, cursor, count_hits=False)

        # break the query loop for one of three reasons:
        # * we started with Postgres candidates and so only do one Snuba query max
        # * the paginator is returning enough results to satisfy the query (>= the limit)
        # * there are no more groups in Snuba to post-filter
        if candidate_ids \
                or len(paginator_results.results) >= limit \
                or not more_results:
            break

    # HACK: We're using the SequencePaginator to mask the complexities of going
    # back and forth between two databases. This causes a problem with pagination
    # because we're 'lying' to the SequencePaginator (it thinks it has the entire
    # result set in memory when it does not). For this reason we need to make some
    # best guesses as to whether the `prev` and `next` cursors have more results.
    #
    # NOTE(review): if the loop ended before any paginator was built,
    # `paginator_results` is still `EMPTY_RESULT`, and the assignments below
    # then mutate that object -- confirm it is not shared between calls.
    if len(paginator_results.results) == limit and more_results:
        # Because we are going back and forth between DBs there is a small
        # chance that we will hand the SequencePaginator exactly `limit`
        # items. In this case the paginator will assume there are no more
        # results, so we need to override the `next` cursor's results.
        paginator_results.next.has_results = True

    if cursor is not None and (not cursor.is_prev or len(paginator_results.results) > 0):
        # If the user passed a cursor, and it isn't already a 0 result `is_prev`
        # cursor, then it's worth allowing them to go back a page to check for
        # more results.
        paginator_results.prev.has_results = True

    metrics.timing('snuba.search.num_chunks', num_chunks)

    # Swap the paged group IDs for Group instances, keeping page order.
    groups = Group.objects.in_bulk(paginator_results.results)
    paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

    return paginator_results
def _query(self, project, retention_window_start, group_queryset, tags, environment,
           sort_by, limit, cursor, count_hits, paginator_options, **parameters):
    """
    Search one project's groups: Postgres selects the candidate group IDs and
    ``do_search`` supplies the per-group data used for ordering.

    ``group_queryset`` is narrowed by the optional ``first_release`` parameter
    (joined through ``GroupEnvironment`` when ``environment`` is given). Every
    matching group ID is handed to ``do_search`` as a candidate, and the data
    returned for each group is turned into a cursor score that a
    ``SequencePaginator`` pages through in reverse order.

    Returns the paginator result with ``results`` replaced by ``Group``
    instances; IDs that ``Group.objects.in_bulk`` does not return are dropped.
    Raises ``AssertionError`` when the computed ``start`` is not before ``end``.
    """

    # TODO: Product decision: we currently search Group.message to handle
    # the `query` parameter, because that's what we've always done. We could
    # do that search against every event in Snuba instead, but results may
    # differ.

    now = timezone.now()
    end = parameters.get('date_to') or (now + ALLOWED_FUTURE_DELTA)

    # TODO: Presumably we want to search back to the project's full retention,
    # which may be more than 90 days in the past, but apparently
    # `retention_window_start` can be None?
    lower_bounds = [
        retention_window_start,
        parameters.get('date_from'),
        now - timedelta(days=90),
    ]
    start = max(filter(None, lower_bounds))
    assert start < end

    # TODO: It's possible `first_release` could be handled by Snuba.
    if environment is None:
        def match_first_release(queryset, version):
            # No environment: the first release lives on the group itself.
            return queryset.filter(
                first_release__organization_id=project.organization_id,
                first_release__version=version,
            )

        base_queryset = group_queryset
    else:
        def match_first_release(queryset, version):
            # With an environment the first release is read from the
            # GroupEnvironment row and joined against the release table.
            release_join = '{} = {}'.format(
                ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                ds.get_sql_column(Release, 'id'),
            )
            organization_match = '{} = %s'.format(
                ds.get_sql_column(Release, 'organization'),
            )
            version_match = '{} = %s'.format(
                ds.get_sql_column(Release, 'version'),
            )
            return queryset.extra(
                where=[release_join, organization_match, version_match],
                params=[project.organization_id, version],
                tables=[Release._meta.db_table],
            )

        # Restrict groups to those with a GroupEnvironment row for the
        # requested environment.
        group_join = '{} = {}'.format(
            ds.get_sql_column(Group, 'id'),
            ds.get_sql_column(GroupEnvironment, 'group_id'),
        )
        environment_match = '{} = %s'.format(
            ds.get_sql_column(GroupEnvironment, 'environment_id'),
        )
        base_queryset = group_queryset.extra(
            where=[group_join, environment_match],
            params=[environment.id],
            tables=[GroupEnvironment._meta.db_table],
        )

    builder = ds.QuerySetBuilder({
        'first_release': ds.CallbackCondition(match_first_release),
    })
    group_queryset = builder.build(base_queryset, parameters)

    # TODO: If the query didn't include anything to significantly filter
    # down the number of groups at this point ('first_release', 'query',
    # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
    # 'subscribed_by', 'active_at_from', or 'active_at_to') then this
    # queryset might return a *huge* number of groups. In this case, we
    # probably *don't* want to pass candidates down to Snuba, and rather we
    # want Snuba to do all the filtering/sorting it can and *then* apply
    # this queryset to the results from Snuba.
    #
    # However, if this did filter down the number of groups significantly,
    # then passing in candidates is, of course, valuable.
    #
    # Should we decide which way to handle it based on the number of
    # group_ids, the number of hashes? Or should we just always start the
    # query with Snuba? Something else?
    candidate_group_ids = list(group_queryset.values_list('id', flat=True))

    sort, extra_aggregations, calculate_cursor_for_group = sort_strategies[sort_by]

    group_data = do_search(
        project_id=project.id,
        environment_id=environment and environment.id,
        tags=tags,
        start=start,
        end=end,
        sort=sort,
        extra_aggregations=extra_aggregations,
        candidates=candidate_group_ids,
        **parameters
    )

    # {group_id: cursor score}
    group_to_score = {
        group_id: calculate_cursor_for_group(data)
        for group_id, data in group_data.items()
    }

    scored_ids = [(score, group_id) for group_id, score in group_to_score.items()]
    paginator = SequencePaginator(scored_ids, reverse=True, **paginator_options)
    paginator_results = paginator.get_result(limit, cursor, count_hits=count_hits)

    # Swap the paged group IDs for Group instances, keeping page order.
    groups_by_id = Group.objects.in_bulk(paginator_results.results)
    paginator_results.results = [
        groups_by_id[group_id]
        for group_id in paginator_results.results
        if group_id in groups_by_id
    ]

    return paginator_results
def _query(self, project, retention_window_start, group_queryset, tags, environment,
           sort_by, limit, cursor, count_hits, paginator_options, **parameters):
    """
    Search one project's groups with a single Snuba query, pre- or
    post-filtered by the Postgres ``group_queryset``.

    Up to ``MAX_PRE_SNUBA_CANDIDATES + 1`` ``(hash, group_id)`` rows are read
    for the groups in ``group_queryset``:
      * none found -> an empty ``Paginator`` result is returned immediately;
      * more than ``MAX_PRE_SNUBA_CANDIDATES`` -> no candidates are sent to
        Snuba and its results are checked against ``group_queryset`` in
        chunks of ``MAX_POST_SNUBA_CHUNK``;
      * otherwise -> the hashes are passed to Snuba and its results are used
        as they are.

    Returns a paginator result whose ``results`` are ``Group`` instances; IDs
    that ``Group.objects.in_bulk`` does not return are dropped.
    Raises ``AssertionError`` when the computed ``start`` is not before ``end``.
    """

    # TODO: Product decision: we currently search Group.message to handle
    # the `query` parameter, because that's what we've always done. We could
    # do that search against every event in Snuba instead, but results may
    # differ.

    now = timezone.now()
    end = parameters.get('date_to') or (now + ALLOWED_FUTURE_DELTA)

    # TODO: Presumably we want to search back to the project's full retention,
    # which may be more than 90 days in the past, but apparently
    # `retention_window_start` can be None?
    lower_bounds = [
        retention_window_start,
        parameters.get('date_from'),
        now - timedelta(days=90),
    ]
    start = max(filter(None, lower_bounds))
    assert start < end

    # TODO: It's possible `first_release` could be handled by Snuba.
    if environment is None:
        def match_first_release(queryset, version):
            # No environment: the first release lives on the group itself.
            return queryset.filter(
                first_release__organization_id=project.organization_id,
                first_release__version=version,
            )

        base_queryset = group_queryset
    else:
        def match_first_release(queryset, version):
            # With an environment the first release is read from the
            # GroupEnvironment row and joined against the release table.
            release_join = '{} = {}'.format(
                ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                ds.get_sql_column(Release, 'id'),
            )
            organization_match = '{} = %s'.format(
                ds.get_sql_column(Release, 'organization'),
            )
            version_match = '{} = %s'.format(
                ds.get_sql_column(Release, 'version'),
            )
            return queryset.extra(
                where=[release_join, organization_match, version_match],
                params=[project.organization_id, version],
                tables=[Release._meta.db_table],
            )

        # Restrict groups to those with a GroupEnvironment row for the
        # requested environment.
        group_join = '{} = {}'.format(
            ds.get_sql_column(Group, 'id'),
            ds.get_sql_column(GroupEnvironment, 'group_id'),
        )
        environment_match = '{} = %s'.format(
            ds.get_sql_column(GroupEnvironment, 'environment_id'),
        )
        base_queryset = group_queryset.extra(
            where=[group_join, environment_match],
            params=[environment.id],
            tables=[GroupEnvironment._meta.db_table],
        )

    builder = ds.QuerySetBuilder({
        'first_release': ds.CallbackCondition(match_first_release),
    })
    group_queryset = builder.build(base_queryset, parameters)

    # pre-filter query: read one row more than the maximum so that an
    # oversized candidate set can be detected
    hash_rows = GroupHash.objects.filter(
        group__in=group_queryset
    ).values_list(
        'hash', 'group_id'
    )
    candidate_hashes = dict(hash_rows[:MAX_PRE_SNUBA_CANDIDATES + 1])
    candidate_count = len(candidate_hashes)
    metrics.timing('snuba.search.num_candidates', candidate_count)

    if candidate_count == 0:
        # no matches could possibly be found from this point on
        metrics.incr('snuba.search.no_candidates')
        return Paginator(Group.objects.none()).get_result()

    if candidate_count > MAX_PRE_SNUBA_CANDIDATES:
        # The pre-filter query was not selective enough (nothing from
        # 'first_release', 'query', 'status', 'bookmarked_by', 'assigned_to',
        # 'unassigned', 'subscribed_by', 'active_at_from', or 'active_at_to'
        # cut the set down), so candidates are *not* sent to Snuba. Snuba
        # does all the filtering/sorting it can and the queryset is applied
        # to its results afterwards (post-filtering).
        metrics.incr('snuba.search.too_many_candidates')
        candidate_hashes = None

    sort, extra_aggregations, score_fn = sort_strategies[sort_by]

    # {group_id: group_score, ...}
    snuba_groups = snuba_search(
        project_id=project.id,
        environment_id=environment and environment.id,
        tags=tags,
        start=start,
        end=end,
        sort=sort,
        extra_aggregations=extra_aggregations,
        score_fn=score_fn,
        candidate_hashes=candidate_hashes,
        **parameters
    )
    metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))

    if candidate_hashes:
        # Snuba already saw the pre-filtered candidates, so its answer is final.
        scored_groups = snuba_groups.items()
    else:
        # Snuba searched without candidates: verify the Sentry DB predicates
        # chunk by chunk.
        scored_groups = []
        post_filter_count = 0
        for chunk in chunked(snuba_groups.items(), MAX_POST_SNUBA_CHUNK):
            post_filter_count += 1
            chunk_ids = [gid for gid, _ in chunk]
            surviving_ids = group_queryset.filter(
                id__in=chunk_ids
            ).values_list('id', flat=True)
            for group_id in surviving_ids:
                scored_groups.append((group_id, snuba_groups[group_id]))

        metrics.timing('snuba.search.num_post_filters', post_filter_count)

    scored_ids = [(score, group_id) for group_id, score in scored_groups]
    paginator = SequencePaginator(scored_ids, reverse=True, **paginator_options)
    paginator_results = paginator.get_result(limit, cursor, count_hits=count_hits)

    # Swap the paged group IDs for Group instances, keeping page order.
    groups_by_id = Group.objects.in_bulk(paginator_results.results)
    paginator_results.results = [
        groups_by_id[group_id]
        for group_id in paginator_results.results
        if group_id in groups_by_id
    ]

    return paginator_results
def _query(self, project, retention_window_start, group_queryset, tags, environment,
           sort_by, limit, cursor, count_hits, paginator_options, **parameters):
    """
    Search one project's groups with chunked Snuba queries, pre- or
    post-filtered by the Postgres ``group_queryset``.

    ``group_queryset`` is narrowed by the optional ``first_release`` parameter
    (joined through ``GroupEnvironment`` when ``environment`` is given). If
    the ``snuba.search.max-pre-snuba-candidates`` option allows it and the
    queryset is small enough, its IDs are sent to Snuba as candidates and a
    single Snuba query answers the search. Otherwise Snuba is queried in
    growing chunks and each chunk is post-filtered through ``group_queryset``
    until the paginator can fill a page, Snuba has no more results, or the
    ``snuba.search.max-total-chunk-time-seconds`` budget is used up.

    Returns a paginator result whose ``results`` are ``Group`` instances; IDs
    that ``Group.objects.in_bulk`` does not return are dropped.
    Raises ``AssertionError`` when the computed ``start`` is not before ``end``.

    ``count_hits`` is accepted but not used: the paginator is always called
    with ``count_hits=False``.
    """

    # TODO: Product decision: we currently search Group.message to handle
    # the `query` parameter, because that's what we've always done. We could
    # do that search against every event in Snuba instead, but results may
    # differ.

    now = timezone.now()
    end = parameters.get('date_to') or (now + ALLOWED_FUTURE_DELTA)
    # TODO: Presumably we want to search back to the project's full retention,
    # which may be higher than 90 days in the past, but apparently
    # `retention_window_start` can be None(?), so we need a fallback.
    start = max(
        filter(None, [
            retention_window_start,
            parameters.get('date_from'),
            now - timedelta(days=90)
        ])
    )
    assert start < end

    # TODO: It's possible `first_release` could be handled by Snuba.
    if environment is not None:
        group_queryset = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(
                lambda queryset, version: queryset.extra(
                    where=[
                        '{} = {}'.format(
                            ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                            ds.get_sql_column(Release, 'id'),
                        ),
                        '{} = %s'.format(
                            ds.get_sql_column(Release, 'organization'),
                        ),
                        '{} = %s'.format(
                            ds.get_sql_column(Release, 'version'),
                        ),
                    ],
                    params=[project.organization_id, version],
                    tables=[Release._meta.db_table],
                ),
            ),
        }).build(
            # Join groups against the GroupEnvironment row for the requested
            # environment.
            group_queryset.extra(
                where=[
                    u'{} = {}'.format(
                        ds.get_sql_column(Group, 'id'),
                        ds.get_sql_column(GroupEnvironment, 'group_id'),
                    ),
                    u'{} = %s'.format(
                        ds.get_sql_column(GroupEnvironment, 'environment_id'),
                    ),
                ],
                params=[environment.id],
                tables=[GroupEnvironment._meta.db_table],
            ),
            parameters,
        )
    else:
        group_queryset = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(
                lambda queryset, version: queryset.filter(
                    first_release__organization_id=project.organization_id,
                    first_release__version=version,
                ),
            ),
        }).build(
            group_queryset,
            parameters,
        )

    # maximum number of Group IDs to send down to Snuba,
    # if more Group ID candidates are found, a "bare" Snuba
    # search is performed and the result groups are then
    # post-filtered via queries to the Sentry DB
    max_pre_snuba_candidates = options.get('snuba.search.max-pre-snuba-candidates')

    # pre-filter query
    candidate_ids = None
    if max_pre_snuba_candidates and limit <= max_pre_snuba_candidates:
        # Fetch one more than allowed so "too many candidates" is detectable.
        candidate_ids = list(
            group_queryset.values_list('id', flat=True)[:max_pre_snuba_candidates + 1]
        )
        metrics.timing('snuba.search.num_candidates', len(candidate_ids))

        if not candidate_ids:
            # no matches could possibly be found from this point on
            metrics.incr('snuba.search.no_candidates')
            return Paginator(Group.objects.none()).get_result()
        elif len(candidate_ids) > max_pre_snuba_candidates:
            # If the pre-filter query didn't include anything to significantly
            # filter down the number of results (from 'first_release', 'query',
            # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
            # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
            # might have surpassed the `max_pre_snuba_candidates`. In this case,
            # we *don't* want to pass candidates down to Snuba, and instead we
            # want Snuba to do all the filtering/sorting it can and *then* apply
            # this queryset to the results from Snuba, which we call
            # post-filtering.
            metrics.incr('snuba.search.too_many_candidates')
            candidate_ids = None

    sort, extra_aggregations, score_fn = sort_strategies[sort_by]

    chunk_growth = options.get('snuba.search.chunk-growth-rate')
    max_chunk_size = options.get('snuba.search.max-chunk-size')
    chunk_limit = limit
    offset = 0
    num_chunks = 0

    # Returned unchanged (apart from the `results` rewrite at the end) when
    # the loop below never builds a SequencePaginator.
    paginator_results = Paginator(Group.objects.none()).get_result()
    result_groups = []
    result_group_ids = set()
    # Running score extremes over post-filtered groups, used by the cursor
    # check inside the loop.
    min_score = float('inf')
    max_score = -1

    max_time = options.get('snuba.search.max-total-chunk-time-seconds')
    time_start = time.time()

    # Do smaller searches in chunks until we have enough results
    # to answer the query (or hit the end of possible results). We do
    # this because a common case for search is to return 100 groups
    # sorted by `last_seen`, and we want to avoid returning all of
    # a project's groups and then post-sorting them all in Postgres
    # when typically the first N results will do.
    while (time.time() - time_start) < max_time:
        num_chunks += 1

        # grow the chunk size on each iteration to account for huge projects
        # and weird queries, up to a max size
        chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)

        # but if we have candidate_ids always query for at least that many items
        chunk_limit = max(chunk_limit, len(candidate_ids) if candidate_ids else 0)

        # `snuba_groups` is consumed below as (group_id, group_score) pairs
        snuba_groups, more_results = snuba_search(
            project_id=project.id,
            environment_id=environment and environment.id,
            tags=tags,
            start=start,
            end=end,
            sort=sort,
            extra_aggregations=extra_aggregations,
            score_fn=score_fn,
            candidate_ids=candidate_ids,
            limit=chunk_limit,
            offset=offset,
            **parameters
        )
        metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))
        offset += len(snuba_groups)

        if not snuba_groups:
            break

        if candidate_ids:
            # pre-filtered candidates were passed down to Snuba,
            # so we're finished with filtering and these are the
            # only results
            result_groups = snuba_groups
        else:
            # pre-filtered candidates were *not* passed down to Snuba,
            # so we need to do post-filtering to verify Sentry DB predicates
            filtered_group_ids = group_queryset.filter(
                id__in=[gid for gid, _ in snuba_groups]
            ).values_list('id', flat=True)

            group_to_score = dict(snuba_groups)
            for group_id in filtered_group_ids:
                if group_id in result_group_ids:
                    # because we're doing multiple Snuba queries, which
                    # happen outside of a transaction, there is a small possibility
                    # of groups moving around in the sort scoring underneath us,
                    # so we at least want to protect against duplicates
                    continue

                group_score = group_to_score[group_id]
                result_group_ids.add(group_id)
                result_groups.append((group_id, group_score))

                # used for cursor logic
                min_score = min(min_score, group_score)
                max_score = max(max_score, group_score)

        # HACK: If a cursor is being used and there may be more results available
        # in Snuba, we need to detect whether the cursor's value will be
        # found in the result groups. If it isn't in the results yet we need to
        # continue querying before we hand off to the paginator to decide whether
        # enough results are found or not, otherwise the paginator will happily
        # return `limit` worth of results that don't take the cursor into account
        # at all, since it can't know there are more results available.
        # TODO: If chunked search works in practice we should probably extend the
        # paginator to throw something if the cursor value is never found, or do
        # something other than partially leak internal paginator logic up to here.
        # Or make separate Paginator implementation just for Snuba search?
        if cursor is not None \
                and not candidate_ids \
                and more_results:
            # NOTE(review): confirm the direction of these two comparisons
            # against the paginator's cursor semantics.
            if cursor.is_prev and min_score < cursor.value:
                continue
            elif not cursor.is_prev and max_score > cursor.value:
                continue

        paginator_results = SequencePaginator(
            [(score, id) for (id, score) in result_groups],
            reverse=True,
            **paginator_options
        ).get_result(limit, cursor, count_hits=False)

        # break the query loop for one of three reasons:
        # * we started with Postgres candidates and so only do one Snuba query max
        # * the paginator is returning enough results to satisfy the query (>= the limit)
        # * there are no more groups in Snuba to post-filter
        if candidate_ids \
                or len(paginator_results.results) >= limit \
                or not more_results:
            break

    metrics.timing('snuba.search.num_chunks', num_chunks)

    # Swap the paged group IDs for Group instances, keeping page order.
    groups = Group.objects.in_bulk(paginator_results.results)
    paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

    return paginator_results
def _get_tag_values_for_semver(
    self,
    projects: Sequence[int],
    environments: Optional[Sequence[str]],
    query: Optional[str],
):
    """Return a ``SequencePaginator`` of release versions for semver autocomplete.

    :param projects: project ids; the first one is used to look up the
        organization, and all of them restrict the matching releases.
    :param environments: optional environment ids used to restrict releases.
    :param query: optional text typed by the user. A query without ``@`` that
        contains anything other than digits, ``.`` and ``*`` is treated as a
        package search; otherwise it is matched against the release version.
    :returns: a ``SequencePaginator`` over ``TagValue`` objects keyed by
        ``SEMVER_ALIAS``, scored by their position in the sorted result.
    """
    from sentry.api.paginator import SequencePaginator

    query = query or ""
    organization_id = Project.objects.filter(id=projects[0]).values_list(
        "organization_id", flat=True
    )[0]

    include_package = True
    if query and "@" not in query and re.search(r"[^\d.\*]", query):
        # The user is searching on the package name only.
        versions = self._get_semver_versions_for_package(projects, organization_id, query)
    else:
        include_package = "@" in query
        query = query.replace("*", "")
        if include_package:
            version_lookup = {"version__startswith": query}
        else:
            version_lookup = {"version__contains": "@" + query}
        versions = Release.objects.filter(**version_lookup)

    if projects:
        project_release_ids = ReleaseProject.objects.filter(
            project_id__in=projects
        ).values_list("release_id", flat=True)
        versions = versions.filter(id__in=project_release_ids)

    if environments:
        environment_release_ids = ReleaseEnvironment.objects.filter(
            environment_id__in=environments
        ).values_list("release_id", flat=True)
        versions = versions.filter(id__in=environment_release_ids)

    sort_fields = map(_flip_field_sort, Release.SEMVER_COLS + ["package"])
    semver_releases = versions.filter_to_semver().annotate_prerelease_column()
    versions = semver_releases.order_by(*sort_fields).values_list("version", flat=True)[:1000]

    def _autocomplete_form(version):
        # Only show the package when the user appears to have typed one, and
        # always drop the build number since it is not used for filtering.
        shown = version if include_package else version.split("@", 1)[1]
        return shown.split("+", 1)[0]

    # Dropping the package can produce duplicate version strings, so de-dupe
    # while keeping first-seen order. This may leave fewer than 1000 values.
    unique_versions = dict.fromkeys(_autocomplete_form(version) for version in versions)

    return SequencePaginator(
        [
            (position, TagValue(SEMVER_ALIAS, value, None, None, None))
            for position, value in enumerate(unique_versions)
        ]
    )
def _query(self, project, retention_window_start, group_queryset, tags, environment,
           sort_by, limit, cursor, count_hits, paginator_options, **parameters):
    """Run a group search by pre-filtering in Postgres and scoring via ``do_search``.

    The Django ``group_queryset`` is narrowed by the ``first_release``
    parameter (environment-aware when ``environment`` is given), every matching
    group id is passed to ``do_search`` as a candidate, and the returned data
    is turned into cursor scores and paginated with ``SequencePaginator``.

    :returns: the paginator result, with ``results`` replaced by the ``Group``
        instances that still exist.
    """
    # TODO: Product decision: we currently search Group.message to handle
    # the `query` parameter, because that's what we've always done. We could
    # do that search against every event in Snuba instead, but results may
    # differ.
    now = timezone.now()
    end = parameters.get('date_to') or (now + ALLOWED_FUTURE_DELTA)

    # TODO: Presumably we want to search back to the project's full retention,
    # which may be longer than 90 days, but apparently
    # `retention_window_start` can be None?
    lower_bounds = (
        retention_window_start,
        parameters.get('date_from'),
        now - timedelta(days=90),
    )
    start = max(bound for bound in lower_bounds if bound)
    assert start < end

    # TODO: It's possible `first_release` could be handled by Snuba.
    if environment is None:
        def first_release_condition(queryset, version):
            return queryset.filter(
                first_release__organization_id=project.organization_id,
                first_release__version=version,
            )
    else:
        def first_release_condition(queryset, version):
            return queryset.extra(
                where=[
                    '{} = {}'.format(
                        ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                        ds.get_sql_column(Release, 'id'),
                    ),
                    '{} = %s'.format(
                        ds.get_sql_column(Release, 'organization'),
                    ),
                    '{} = %s'.format(
                        ds.get_sql_column(Release, 'version'),
                    ),
                ],
                params=[project.organization_id, version],
                tables=[Release._meta.db_table],
            )

    builder = ds.QuerySetBuilder({
        'first_release': ds.CallbackCondition(first_release_condition),
    })
    if environment is not None:
        # Join against GroupEnvironment so only groups seen in this
        # environment are considered.
        group_queryset = group_queryset.extra(
            where=[
                '{} = {}'.format(
                    ds.get_sql_column(Group, 'id'),
                    ds.get_sql_column(GroupEnvironment, 'group_id'),
                ),
                '{} = %s'.format(
                    ds.get_sql_column(GroupEnvironment, 'environment_id'),
                ),
            ],
            params=[environment.id],
            tables=[GroupEnvironment._meta.db_table],
        )
    group_queryset = builder.build(group_queryset, parameters)

    # TODO: If the query didn't include anything to significantly filter
    # down the number of groups at this point ('first_release', 'query',
    # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
    # 'subscribed_by', 'active_at_from', or 'active_at_to') then this
    # queryset might return a *huge* number of groups. In this case, we
    # probably *don't* want to pass candidates down to Snuba, and rather we
    # want Snuba to do all the filtering/sorting it can and *then* apply
    # this queryset to the results from Snuba.
    #
    # However, if this did filter down the number of groups significantly,
    # then passing in candidates is, of course, valuable.
    #
    # Should we decide which way to handle it based on the number of
    # group_ids, the number of hashes? Or should we just always start the
    # query with Snuba? Something else?
    candidate_group_ids = list(group_queryset.values_list('id', flat=True))

    sort, extra_aggregations, calculate_cursor_for_group = sort_strategies[sort_by]

    group_data = do_search(
        project_id=project.id,
        environment_id=environment and environment.id,
        tags=tags,
        start=start,
        end=end,
        sort=sort,
        extra_aggregations=extra_aggregations,
        candidates=candidate_group_ids,
        **parameters
    )

    # The paginator expects (score, item) pairs.
    scored_groups = [
        (calculate_cursor_for_group(data), group_id)
        for group_id, data in group_data.items()
    ]
    paginator = SequencePaginator(scored_groups, reverse=True, **paginator_options)
    paginator_results = paginator.get_result(limit, cursor, count_hits=count_hits)

    # Swap group ids for model instances, dropping ids that no longer exist.
    groups = Group.objects.in_bulk(paginator_results.results)
    paginator_results.results = [
        groups[pk] for pk in paginator_results.results if pk in groups
    ]

    return paginator_results
def _query(self, projects, retention_window_start, group_queryset, environments,
           sort_by, limit, cursor, count_hits, paginator_options, search_filters,
           date_from, date_to):
    """Search groups using Postgres filtering combined with chunked Snuba queries.

    Steps, in order:

    1. Narrow ``group_queryset`` with the ``first_release`` / ``first_seen``
       search filters (joined through ``groupenvironment`` when
       ``environments`` is given).
    2. Work out the ``[start, end)`` time window from ``date_from`` /
       ``date_to``, the ``date`` search filters and the retention window
       (falling back to 90 days).
    3. If no end bound was requested and the search only needs Postgres
       (no cursor, ``date`` sort, no environments, only issue-level filters),
       return a ``DateTimePaginator`` result directly.
    4. Otherwise either pass the pre-filtered candidate ids down to Snuba, or,
       if there are too many, query Snuba in growing chunks and post-filter
       each chunk against ``group_queryset``.

    :param projects: project objects; ``projects[0].organization_id`` scopes
        the ``first_release`` lookup and every ``id`` is sent to Snuba.
    :param retention_window_start: earliest searchable datetime, or ``None``.
    :param environments: environment objects, or ``None``.
    :param count_hits: when true, a hit count (possibly estimated from a
        sample) is handed to the paginator.
    :returns: ``EMPTY_RESULT`` when nothing can match, otherwise a paginator
        result whose ``results`` are ``Group`` instances.
    """
    # TODO: It's possible `first_release` could be handled by Snuba.
    if environments is not None:
        environment_ids = [environment.id for environment in environments]
        group_queryset = group_queryset.filter(
            groupenvironment__environment_id__in=environment_ids
        )
        group_queryset = QuerySetBuilder({
            'first_release': QCallbackCondition(
                lambda version: Q(
                    groupenvironment__first_release__organization_id=projects[0].organization_id,
                    groupenvironment__first_release__version=version,
                    groupenvironment__environment_id__in=environment_ids,
                )
            ),
            'first_seen': ScalarCondition(
                'groupenvironment__first_seen',
                {'groupenvironment__environment_id__in': environment_ids}
            ),
        }).build(group_queryset, search_filters)
    else:
        group_queryset = QuerySetBuilder({
            'first_release': QCallbackCondition(
                lambda version: Q(
                    first_release__organization_id=projects[0].organization_id,
                    first_release__version=version,
                ),
            ),
            'first_seen': ScalarCondition('first_seen'),
        }).build(group_queryset, search_filters)

    now = timezone.now()
    end = None
    # Build a real list: on Python 3 `filter()` returns an iterator that is
    # always truthy, which made `min()` blow up on an empty sequence whenever
    # no end bound was supplied.
    end_params = [
        param
        for param in (date_to, get_search_filter(search_filters, 'date', '<'))
        if param
    ]
    if end_params:
        end = min(end_params)

    if not end:
        end = now + ALLOWED_FUTURE_DELTA

        # This search is for some time window that ends with "now",
        # so if the requested sort is `date` (`last_seen`) and there
        # are no other Snuba-based search predicates, we can simply
        # return the results from Postgres.
        if (
            cursor is None and
            sort_by == 'date' and
            not environments and
            # This handles tags and date parameters for search filters.
            not [
                sf for sf in search_filters
                if sf.key.name not in issue_only_fields.union(['date', 'message'])
            ]
        ):
            group_queryset = group_queryset.order_by('-last_seen')
            paginator = DateTimePaginator(group_queryset, '-last_seen', **paginator_options)
            # When it's a simple django-only search, we count_hits like normal
            return paginator.get_result(limit, cursor, count_hits=count_hits)

    # TODO: Presumably we only want to search back to the project's max
    # retention date, which may be closer than 90 days in the past, but
    # apparently `retention_window_start` can be None(?), so we need a
    # fallback.
    retention_date = max(
        filter(None, [
            retention_window_start,
            now - timedelta(days=90)
        ])
    )

    # TODO: We should try and consolidate all this logic together a little
    # better, maybe outside the backend. Should be easier once we're on
    # just the new search filters
    start_params = [
        date_from,
        retention_date,
        get_search_filter(search_filters, 'date', '>'),
    ]
    start = max(filter(None, start_params))

    end = max([
        retention_date,
        end
    ])

    if start == retention_date and end == retention_date:
        # Both `start` and `end` must have been trimmed to `retention_date`,
        # so this entire search was against a time range that is outside of
        # retention. We'll return empty results to maintain backwards compatibility
        # with Django search (for now).
        return EMPTY_RESULT

    if start >= end:
        # TODO: This maintains backwards compatibility with Django search, but
        # in the future we should find a way to notify the user that their search
        # is invalid.
        return EMPTY_RESULT

    # Here we check if all the django filters reduce the set of groups down
    # to something that we can send down to Snuba in a `group_id IN (...)`
    # clause.
    max_candidates = options.get('snuba.search.max-pre-snuba-candidates')
    too_many_candidates = False
    candidate_ids = list(
        group_queryset.values_list('id', flat=True)[:max_candidates + 1]
    )
    metrics.timing('snuba.search.num_candidates', len(candidate_ids))
    if not candidate_ids:
        # no matches could possibly be found from this point on
        metrics.incr('snuba.search.no_candidates', skip_internal=False)
        return EMPTY_RESULT
    elif len(candidate_ids) > max_candidates:
        # If the pre-filter query didn't include anything to significantly
        # filter down the number of results (from 'first_release', 'query',
        # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
        # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
        # might have surpassed the `max_candidates`. In this case,
        # we *don't* want to pass candidates down to Snuba, and instead we
        # want Snuba to do all the filtering/sorting it can and *then* apply
        # this queryset to the results from Snuba, which we call
        # post-filtering.
        metrics.incr('snuba.search.too_many_candidates', skip_internal=False)
        too_many_candidates = True
        candidate_ids = []

    sort_field = sort_strategies[sort_by]
    chunk_growth = options.get('snuba.search.chunk-growth-rate')
    max_chunk_size = options.get('snuba.search.max-chunk-size')
    chunk_limit = limit
    offset = 0
    num_chunks = 0
    hits = None
    # Read after the loop below, so it must be bound even if the loop body
    # never runs.
    more_results = False

    paginator_results = EMPTY_RESULT
    result_groups = []
    result_group_ids = set()

    max_time = options.get('snuba.search.max-total-chunk-time-seconds')
    time_start = time.time()

    if count_hits and (too_many_candidates or cursor is not None):
        # If we had too many candidates to reasonably pass down to snuba,
        # or if we have a cursor that bisects the overall result set (such
        # that our query only sees results on one side of the cursor) then
        # we need an alternative way to figure out the total hits that this
        # query has.

        # To do this, we get a sample of groups matching the snuba side of
        # the query, and see how many of those pass the post-filter in
        # postgres. This should give us an estimate of the total number of
        # snuba matches that will be overall matches, which we can use to
        # get an estimate for X-Hits.

        # The sampling is not simple random sampling. It will return *all*
        # matching groups if there are less than N groups matching the
        # query, or it will return a random, deterministic subset of N of
        # the groups if there are more than N overall matches. This means
        # that the "estimate" is actually an accurate result when there are
        # less than N matching groups.

        # The number of samples required to achieve a certain error bound
        # with a certain confidence interval can be calculated from a
        # rearrangement of the normal approximation (Wald) confidence
        # interval formula:
        #
        # https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval
        #
        # Effectively if we want the estimate to be within +/- 10% of the
        # real value with 95% confidence, we would need (1.96^2 * p*(1-p))
        # / 0.1^2 samples. With a starting assumption of p=0.5 (this
        # requires the most samples) we would need 96 samples to achieve
        # +/-10% @ 95% confidence.

        sample_size = options.get('snuba.search.hits-sample-size')
        snuba_groups, snuba_total = snuba_search(
            start=start,
            end=end,
            project_ids=[p.id for p in projects],
            environment_ids=environments and [environment.id for environment in environments],
            sort_field=sort_field,
            limit=sample_size,
            offset=0,
            get_sample=True,
            search_filters=search_filters,
        )
        snuba_count = len(snuba_groups)
        if snuba_count == 0:
            return EMPTY_RESULT
        else:
            filtered_count = group_queryset.filter(
                id__in=[gid for gid, _ in snuba_groups]
            ).count()

            hit_ratio = filtered_count / float(snuba_count)
            hits = int(hit_ratio * snuba_total)

    # Do smaller searches in chunks until we have enough results
    # to answer the query (or hit the end of possible results). We do
    # this because a common case for search is to return 100 groups
    # sorted by `last_seen`, and we want to avoid returning all of
    # a project's groups and then post-sorting them all in Postgres
    # when typically the first N results will do.
    while (time.time() - time_start) < max_time:
        num_chunks += 1

        # grow the chunk size on each iteration to account for huge projects
        # and weird queries, up to a max size
        chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
        # but if we have candidate_ids always query for at least that many items
        chunk_limit = max(chunk_limit, len(candidate_ids))

        # {group_id: group_score, ...}
        snuba_groups, total = snuba_search(
            start=start,
            end=end,
            project_ids=[p.id for p in projects],
            environment_ids=environments and [environment.id for environment in environments],
            sort_field=sort_field,
            cursor=cursor,
            candidate_ids=candidate_ids,
            limit=chunk_limit,
            offset=offset,
            search_filters=search_filters,
        )
        metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))
        count = len(snuba_groups)
        more_results = count >= limit and (offset + limit) < total
        offset += len(snuba_groups)

        if not snuba_groups:
            break

        if candidate_ids:
            # pre-filtered candidates were passed down to Snuba, so we're
            # finished with filtering and these are the only results. Note
            # that because we set the chunk size to at least the size of
            # the candidate_ids, we know we got all of them (ie there are
            # no more chunks after the first)
            result_groups = snuba_groups
            if count_hits and hits is None:
                hits = len(snuba_groups)
        else:
            # pre-filtered candidates were *not* passed down to Snuba,
            # so we need to do post-filtering to verify Sentry DB predicates
            filtered_group_ids = group_queryset.filter(
                id__in=[gid for gid, _ in snuba_groups]
            ).values_list('id', flat=True)

            group_to_score = dict(snuba_groups)
            for group_id in filtered_group_ids:
                if group_id in result_group_ids:
                    # because we're doing multiple Snuba queries, which
                    # happen outside of a transaction, there is a small possibility
                    # of groups moving around in the sort scoring underneath us,
                    # so we at least want to protect against duplicates
                    continue

                group_score = group_to_score[group_id]
                result_group_ids.add(group_id)
                result_groups.append((group_id, group_score))

        # TODO do we actually have to rebuild this SequencePaginator every time
        # or can we just make it after we've broken out of the loop?
        paginator_results = SequencePaginator(
            [(score, group_id) for (group_id, score) in result_groups],
            reverse=True,
            **paginator_options
        ).get_result(limit, cursor, known_hits=hits)

        # break the query loop for one of three reasons:
        # * we started with Postgres candidates and so only do one Snuba query max
        # * the paginator is returning enough results to satisfy the query (>= the limit)
        # * there are no more groups in Snuba to post-filter
        if candidate_ids \
                or len(paginator_results.results) >= limit \
                or not more_results:
            break

    # HACK: We're using the SequencePaginator to mask the complexities of going
    # back and forth between two databases. This causes a problem with pagination
    # because we're 'lying' to the SequencePaginator (it thinks it has the entire
    # result set in memory when it does not). For this reason we need to make some
    # best guesses as to whether the `prev` and `next` cursors have more results.
    if len(paginator_results.results) == limit and more_results:
        # Because we are going back and forth between DBs there is a small
        # chance that we will hand the SequencePaginator exactly `limit`
        # items. In this case the paginator will assume there are no more
        # results, so we need to override the `next` cursor's results.
        paginator_results.next.has_results = True

    if cursor is not None and (not cursor.is_prev or len(paginator_results.results) > 0):
        # If the user passed a cursor, and it isn't already a 0 result `is_prev`
        # cursor, then it's worth allowing them to go back a page to check for
        # more results.
        paginator_results.prev.has_results = True

    metrics.timing('snuba.search.num_chunks', num_chunks)

    # Swap group ids for model instances, dropping ids that no longer exist.
    groups = Group.objects.in_bulk(paginator_results.results)
    paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

    return paginator_results
def query(self, project, tags=None, environment=None, sort_by='date', limit=100,
          cursor=None, count_hits=False, paginator_options=None, **parameters):
    """Search a project's groups using Django querysets and the tagstore.

    A base ``Group`` queryset is built from ``parameters`` (``query``,
    ``status``, ``bookmarked_by``, ``assigned_to``, ``unassigned``,
    ``subscribed_by``, ``active_at_from``, ``active_at_to``), excluding groups
    that are pending deletion/merge, and is trimmed to the event retention
    window when one is configured.

    * With an ``environment``, per-environment values are read from the
      tagstore's group tag values and the result is paginated in memory with
      ``SequencePaginator``.
    * Without one, the queryset is filtered directly and paginated with the
      paginator class from ``sort_strategies[sort_by]``.

    NOTE: ``tags`` supplied by the caller is modified in place
    (``sentry:release`` may be rewritten and ``environment`` is popped).

    :param tags: optional mapping of tag key to value to filter on.
    :param sort_by: key into ``sort_strategies`` /
        ``environment_sort_strategies``.
    :param paginator_options: extra keyword arguments for the paginator.
    :returns: a paginator result. In the environment branch ``results`` holds
        ``Group`` instances.
    """
    from sentry.models import (Environment, Event, Group, GroupEnvironment,
                               GroupStatus, GroupSubscription, Release)

    if paginator_options is None:
        paginator_options = {}

    if tags is None:
        tags = {}

    # Resolve the special "latest" release value into a concrete release.
    try:
        if tags.get('sentry:release') == 'latest':
            tags['sentry:release'] = get_latest_release(
                project, environment)

        if parameters.get('first_release') == 'latest':
            parameters['first_release'] = get_latest_release(
                project, environment)
    except Release.DoesNotExist:
        # no matches could possibly be found from this point on
        return Paginator(Group.objects.none()).get_result()

    group_queryset = QuerySetBuilder({
        'query': CallbackCondition(
            lambda queryset, query: queryset.filter(
                Q(message__icontains=query) | Q(culprit__icontains=query),
            ) if query else queryset,
        ),
        'status': CallbackCondition(
            lambda queryset, status: queryset.filter(status=status),
        ),
        'bookmarked_by': CallbackCondition(
            lambda queryset, user: queryset.filter(
                bookmark_set__project=project,
                bookmark_set__user=user,
            ),
        ),
        'assigned_to': CallbackCondition(
            functools.partial(assigned_to_filter, project=project),
        ),
        'unassigned': CallbackCondition(
            lambda queryset, unassigned: queryset.filter(
                assignee_set__isnull=unassigned,
            ),
        ),
        'subscribed_by': CallbackCondition(
            lambda queryset, user: queryset.filter(
                id__in=GroupSubscription.objects.filter(
                    project=project,
                    user=user,
                    is_active=True,
                ).values_list('group'),
            ),
        ),
        'active_at_from': ScalarCondition('active_at', 'gt'),
        'active_at_to': ScalarCondition('active_at', 'lt'),
    }).build(
        Group.objects.filter(project=project).exclude(status__in=[
            GroupStatus.PENDING_DELETION,
            GroupStatus.DELETION_IN_PROGRESS,
            GroupStatus.PENDING_MERGE,
        ]),
        parameters,
    )

    # filter out groups which are beyond the retention period
    retention = quotas.get_event_retention(
        organization=project.organization)
    if retention:
        retention_window_start = timezone.now() - timedelta(days=retention)
        # TODO: This could be optimized when building querysets to identify
        # criteria that are logically impossible (e.g. if the upper bound
        # for last seen is before the retention window starts, no results
        # exist.)
        group_queryset = group_queryset.filter(
            last_seen__gte=retention_window_start)
    else:
        retention_window_start = None

    if environment is not None:
        if 'environment' in tags:
            # TODO: This should probably just overwrite the existing tag,
            # rather than asserting on it, but...?
            assert Environment.objects.get(
                projects=project,
                name=tags.pop('environment'),
            ).id == environment.id

        event_queryset_builder = QuerySetBuilder({
            'date_from': ScalarCondition('date_added', 'gt'),
            'date_to': ScalarCondition('date_added', 'lt'),
        })
        # Only consult the event tag table when a date bound was supplied.
        if any(key in parameters for key in event_queryset_builder.conditions.keys()):
            event_queryset = event_queryset_builder.build(
                tagstore.get_event_tag_qs(
                    project.id,
                    environment.id,
                    'environment',
                    environment.name,
                ),
                parameters,
            )
            if retention_window_start is not None:
                event_queryset = event_queryset.filter(
                    date_added__gte=retention_window_start)

            # At most 1000 distinct group ids are taken from the events.
            group_queryset = group_queryset.filter(
                id__in=list(event_queryset.distinct().values_list(
                    'group_id', flat=True)[:1000]))

        group_queryset = QuerySetBuilder({
            'first_release': CallbackCondition(
                lambda queryset, version: queryset.extra(
                    where=[
                        '{} = {}'.format(
                            get_sql_column(GroupEnvironment, 'first_release_id'),
                            get_sql_column(Release, 'id'),
                        ),
                        '{} = %s'.format(
                            get_sql_column(Release, 'organization'),
                        ),
                        '{} = %s'.format(
                            get_sql_column(Release, 'version'),
                        ),
                    ],
                    params=[project.organization_id, version],
                    tables=[Release._meta.db_table],
                ),
            ),
            'times_seen': CallbackCondition(
                # This condition represents the exact number of times that
                # an issue has been seen in an environment. Since an issue
                # can't be seen in an environment more times than the issue
                # was seen overall, we can safely exclude any groups that
                # don't have at least that many events.
                lambda queryset, times_seen: queryset.exclude(
                    times_seen__lt=times_seen,
                ),
            ),
            'times_seen_lower': CallbackCondition(
                # This condition represents the lower threshold for the
                # number of times an issue has been seen in an environment.
                # Since an issue can't be seen in an environment more times
                # than the issue was seen overall, we can safely exclude
                # any groups that haven't met that threshold.
                lambda queryset, times_seen: queryset.exclude(
                    times_seen__lt=times_seen,
                ),
            ),
            # The following conditions make a few assertions that are
            # correct in an abstract sense but may not accurately reflect
            # the existing implementation (see GH-5289). These assumptions
            # are that 1. The first seen time for a Group is the minimum
            # value of the first seen time for all of its GroupEnvironment
            # relations; 2. The last seen time for a Group is the maximum
            # value of the last seen time for all of its GroupEnvironment
            # relations; 3. The first seen time is always less than or
            # equal to the last seen time.
            'age_from': CallbackCondition(
                # This condition represents the lower threshold for "first
                # seen" time for an environment. Due to assertions #1 and
                # #3, we can exclude any groups where the "last seen" time
                # is prior to this timestamp.
                lambda queryset, first_seen: queryset.exclude(
                    last_seen__lt=first_seen,
                ),
            ),
            'age_to': CallbackCondition(
                # This condition represents the upper threshold for "first
                # seen" time for an environment. Due to assertion #1, we
                # can exclude any values where the group first seen is
                # greater than that threshold.
                lambda queryset, first_seen: queryset.exclude(
                    first_seen__gt=first_seen,
                ),
            ),
            'last_seen_from': CallbackCondition(
                # This condition represents the lower threshold for "last
                # seen" time for an environment. Due to assertion #2, we
                # can exclude any values where the group last seen value is
                # less than that threshold.
                lambda queryset, last_seen: queryset.exclude(last_seen__lt=last_seen, ),
            ),
            'last_seen_to': CallbackCondition(
                # This condition represents the upper threshold for "last
                # seen" time for an environment. Due to assertions #2 and
                # #3, we can exclude any values where the group first seen
                # value is greater than that threshold.
                lambda queryset, last_seen: queryset.exclude(first_seen__gt=last_seen, ),
            ),
        }).build(
            # Join against GroupEnvironment so only groups with a row for
            # this environment are considered.
            group_queryset.extra(
                where=[
                    '{} = {}'.format(
                        get_sql_column(Group, 'id'),
                        get_sql_column(GroupEnvironment, 'group_id'),
                    ),
                    '{} = %s'.format(
                        get_sql_column(GroupEnvironment, 'environment_id'),
                    ),
                ],
                params=[environment.id],
                tables=[GroupEnvironment._meta.db_table],
            ),
            parameters,
        )

        get_sort_expression, sort_value_to_cursor_value = environment_sort_strategies[
            sort_by]

        group_tag_value_queryset = tagstore.get_group_tag_value_qs(
            project.id,
            set(group_queryset.values_list('id', flat=True)),  # TODO: Limit?
            environment.id,
            'environment',
            environment.name,
        )

        if retention_window_start is not None:
            group_tag_value_queryset = group_tag_value_queryset.filter(
                last_seen__gte=retention_window_start)

        # {group_id: sort_value} for every group tag value that passes the
        # per-environment conditions.
        candidates = dict(
            QuerySetBuilder({
                'age_from': ScalarCondition('first_seen', 'gt'),
                'age_to': ScalarCondition('first_seen', 'lt'),
                'last_seen_from': ScalarCondition('last_seen', 'gt'),
                'last_seen_to': ScalarCondition('last_seen', 'lt'),
                'times_seen': CallbackCondition(
                    lambda queryset, times_seen: queryset.filter(
                        times_seen=times_seen),
                ),
                'times_seen_lower': ScalarCondition('times_seen', 'gt'),
                'times_seen_upper': ScalarCondition('times_seen', 'lt'),
            }).build(
                group_tag_value_queryset,
                parameters,
            ).extra(select={
                'sort_value': get_sort_expression(group_tag_value_queryset.model),
            }, ).values_list('group_id', 'sort_value'))

        if tags:
            # TODO: `get_group_ids_for_search_filter` should be able to
            # utilize the retention window start parameter for additional
            # optimizations.
            matches = tagstore.get_group_ids_for_search_filter(
                project.id,
                environment.id,
                tags,
                candidates.keys(),
                limit=len(candidates),
            )
            # Drop every candidate that did not match the tag filter.
            for key in set(candidates) - set(matches or []):
                del candidates[key]

        result = SequencePaginator([(sort_value_to_cursor_value(score), id)
                                    for (id, score) in candidates.items()],
                                   reverse=True,
                                   **paginator_options).get_result(
                                       limit, cursor, count_hits=count_hits)

        # Swap group ids for model instances, dropping ids that no longer exist.
        groups = Group.objects.in_bulk(result.results)
        result.results = [groups[k] for k in result.results if k in groups]

        return result
    else:
        event_queryset_builder = QuerySetBuilder({
            'date_from': ScalarCondition('datetime', 'gt'),
            'date_to': ScalarCondition('datetime', 'lt'),
        })
        # Only consult the event table when a date bound was supplied; at
        # most 1000 distinct group ids are taken from it.
        if any(key in parameters for key in event_queryset_builder.conditions.keys()):
            group_queryset = group_queryset.filter(id__in=list(
                event_queryset_builder.build(
                    Event.objects.filter(project_id=project.id),
                    parameters,
                ).distinct().values_list('group_id', flat=True)[:1000], ))

        group_queryset = QuerySetBuilder({
            'first_release': CallbackCondition(
                lambda queryset, version: queryset.filter(
                    first_release__organization_id=project.organization_id,
                    first_release__version=version,
                ),
            ),
            'age_from': ScalarCondition('first_seen', 'gt'),
            'age_to': ScalarCondition('first_seen', 'lt'),
            'last_seen_from': ScalarCondition('last_seen', 'gt'),
            'last_seen_to': ScalarCondition('last_seen', 'lt'),
            'times_seen': CallbackCondition(
                lambda queryset, times_seen: queryset.filter(times_seen=times_seen),
            ),
            'times_seen_lower': ScalarCondition('times_seen', 'gt'),
            'times_seen_upper': ScalarCondition('times_seen', 'lt'),
        }).build(
            group_queryset,
            parameters,
        ).extra(select={
            'sort_value': get_sort_clause(sort_by),
        }, )

        if tags:
            matches = tagstore.get_group_ids_for_search_filter(
                project.id, None, tags)
            if matches:
                group_queryset = group_queryset.filter(id__in=matches)
            else:
                # A tag filter that matches nothing empties the result.
                group_queryset = group_queryset.none()

        paginator_cls, sort_clause = sort_strategies[sort_by]
        group_queryset = group_queryset.order_by(sort_clause)
        paginator = paginator_cls(group_queryset, sort_clause, **paginator_options)
        return paginator.get_result(limit, cursor, count_hits=count_hits)
def get_tag_value_paginator_for_projects(self, projects, environments, key, start=None,
                                         end=None, query=None, order_by="-last_seen"):
    """Return a ``SequencePaginator`` over the values seen for ``key``.

    Values are aggregated by a Snuba query (count, first seen, last seen),
    limited to 1000 rows, and scored by the millisecond timestamp of the
    ``order_by`` field.

    :param projects: project ids to search in.
    :param environments: optional environments to restrict the search to.
    :param key: tag key (or aliased column) whose values are listed.
    :param start: optional lower time bound, passed through to Snuba.
    :param end: optional upper time bound, passed through to Snuba.
    :param query: optional text used to narrow the values. It is matched
        against status names for ``transaction_status``, treated as a number
        for keys in ``FUZZY_NUMERIC_KEYS``, matched against project slugs for
        ``PROJECT_ALIAS``, and used in a ``LIKE`` condition otherwise.
    :param order_by: only ``"-last_seen"`` is supported.
    :raises ValueError: if ``order_by`` is anything other than
        ``"-last_seen"``.
    :returns: a ``SequencePaginator`` of ``TagValue`` objects. It is empty
        when ``key`` is ``PROJECT_ALIAS`` and no project matches.
    """
    from sentry.api.paginator import SequencePaginator

    if order_by != "-last_seen":
        raise ValueError("Unsupported order_by: %s" % order_by)

    dataset = Dataset.Events
    snuba_key = snuba.get_snuba_column_name(key)
    if snuba_key.startswith("tags["):
        # Resolved to a tag in the events dataset; see whether the discover
        # dataset resolves it to something other than a tag.
        snuba_key = snuba.get_snuba_column_name(key, dataset=Dataset.Discover)
        if not snuba_key.startswith("tags["):
            dataset = Dataset.Discover

    conditions = []
    # transaction status needs a special case so that the user interacts with
    # the names and not codes
    transaction_status = snuba_key == "transaction_status"
    if transaction_status:
        # Here we want to use the status codes during filtering,
        # but want to do this with names that include our query
        matching_codes = [
            span_key
            for span_key, value in six.iteritems(SPAN_STATUS_CODE_TO_NAME)
            if not query or query in value
        ]
        conditions.append([snuba_key, "IN", matching_codes])
    elif key in FUZZY_NUMERIC_KEYS:
        converted_query = int(query) if query is not None and query.isdigit() else None
        if converted_query is not None:
            conditions.append([snuba_key, ">=", converted_query - FUZZY_NUMERIC_DISTANCE])
            conditions.append([snuba_key, "<=", converted_query + FUZZY_NUMERIC_DISTANCE])
    elif key == PROJECT_ALIAS:
        project_filters = {
            "id__in": projects,
        }
        if query:
            project_filters["slug__icontains"] = query
        project_queryset = Project.objects.filter(**project_filters).values("id", "slug")
        project_slugs = {project["id"]: project["slug"] for project in project_queryset}

        if not project_slugs:
            # No project matches. Querying Snuba with the unfiltered project
            # list would return rows whose ids cannot be mapped back to a
            # slug below (KeyError), so there is nothing to return.
            return SequencePaginator([])

        projects = list(project_slugs)
        snuba_key = "project_id"
        dataset = Dataset.Discover
    else:
        if snuba_key in BLACKLISTED_COLUMNS:
            snuba_key = "tags[%s]" % (key, )

        if query:
            conditions.append([snuba_key, "LIKE", u"%{}%".format(query)])
        else:
            conditions.append([snuba_key, "!=", ""])

    filters = {"project_id": projects}
    if environments:
        filters["environment"] = environments

    results = snuba.query(
        dataset=dataset,
        start=start,
        end=end,
        groupby=[snuba_key],
        filter_keys=filters,
        aggregations=[
            ["count()", "", "times_seen"],
            ["min", "timestamp", "first_seen"],
            ["max", "timestamp", "last_seen"],
        ],
        conditions=conditions,
        orderby=order_by,
        # TODO: This means they can't actually paginate all TagValues.
        limit=1000,
        arrayjoin=snuba.get_arrayjoin(snuba_key),
        referrer="tagstore.get_tag_value_paginator_for_projects",
    )

    if transaction_status:
        # With transaction_status we need to map the ids back to their names
        results = OrderedDict([
            (SPAN_STATUS_CODE_TO_NAME[result_key], data)
            for result_key, data in six.iteritems(results)
        ])
    elif key == PROJECT_ALIAS:
        # With project names we map the ids back to the project slugs
        results = OrderedDict([
            (project_slugs[value], data)
            for value, data in six.iteritems(results)
        ])

    tag_values = [
        TagValue(key=key, value=six.text_type(value), **fix_tag_value_data(data))
        for value, data in six.iteritems(results)
    ]

    desc = order_by.startswith("-")
    score_field = order_by.lstrip("-")
    return SequencePaginator(
        [(int(to_timestamp(getattr(tv, score_field)) * 1000), tv) for tv in tag_values],
        reverse=desc,
    )
def _query(self, project, retention_window_start, group_queryset, tags, environment, sort_by, limit, cursor, count_hits, paginator_options, **parameters):
    """Run a group search for one project, optionally scoped to one environment.

    With an ``environment``:
      * ``group_queryset`` is joined to ``GroupEnvironment`` and pre-filtered
        with conservative group-level conditions.
      * The environment's ``environment`` group tag values supply the exact
        per-environment filters and the sort value.
      * The surviving ``group_id -> sort_value`` candidates are paginated in
        memory with ``SequencePaginator``.

    Without an environment, the group queryset is filtered and ordered
    directly and paginated with the paginator class chosen by ``sort_by``.

    :param project: project being searched; ``id`` and ``organization_id`` are read.
    :param retention_window_start: optional lower bound applied to event
        ``date_added`` and group tag value ``last_seen``; only consulted on
        the environment path.
    :param group_queryset: base Group queryset to refine.
    :param tags: tag filters; an ``'environment'`` entry is popped (mutating the
        mapping) when an environment is given.
    :param environment: environment to scope to, or ``None``.
    :param sort_by: key into ``environment_sort_strategies`` / ``sort_strategies``.
    :param limit: page size passed to the paginator.
    :param cursor: pagination cursor passed to the paginator.
    :param count_hits: forwarded to ``get_result``.
    :param paginator_options: extra keyword arguments for the paginator constructor.
    :param parameters: optional search conditions consumed by the
        ``QuerySetBuilder`` instances (``date_from``, ``age_to``, ``times_seen``, ...).
    :returns: the paginator result. On the environment path its ``results``
        are replaced by the matching ``Group`` objects; on the other path the
        paginator's result is returned unchanged.
    """
    from sentry.models import (Group, Environment, Event, GroupEnvironment, Release)

    if environment is not None:
        if 'environment' in tags:
            # The tag filter must agree with the explicit environment argument;
            # it is removed so it is not applied a second time below.
            environment_name = tags.pop('environment')
            assert environment_name is ANY or Environment.objects.get(
                projects=project,
                name=environment_name,
            ).id == environment.id

        event_queryset_builder = QuerySetBuilder({
            'date_from': ScalarCondition('date_added', 'gt'),
            'date_to': ScalarCondition('date_added', 'lt'),
        })
        if any(key in parameters for key in event_queryset_builder.conditions.keys()):
            # A date range was requested: restrict groups to those with an
            # `environment` event tag inside the range (capped at 1000 ids).
            event_queryset = event_queryset_builder.build(
                tagstore.get_event_tag_qs(
                    project_id=project.id,
                    environment_id=environment.id,
                    key='environment',
                    value=environment.name,
                ),
                parameters,
            )
            if retention_window_start is not None:
                event_queryset = event_queryset.filter(
                    date_added__gte=retention_window_start)

            group_queryset = group_queryset.filter(
                id__in=list(event_queryset.distinct().values_list(
                    'group_id', flat=True)[:1000]))

        group_queryset = QuerySetBuilder({
            'first_release': CallbackCondition(
                lambda queryset, version: queryset.extra(
                    where=[
                        '{} = {}'.format(
                            get_sql_column(GroupEnvironment, 'first_release_id'),
                            get_sql_column(Release, 'id'),
                        ),
                        '{} = %s'.format(
                            get_sql_column(Release, 'organization'),
                        ),
                        '{} = %s'.format(
                            get_sql_column(Release, 'version'),
                        ),
                    ],
                    params=[project.organization_id, version],
                    tables=[Release._meta.db_table],
                ),
            ),
            'times_seen': CallbackCondition(
                # This condition represents the exact number of times that
                # an issue has been seen in an environment. Since an issue
                # can't be seen in an environment more times than the issue
                # was seen overall, we can safely exclude any groups that
                # don't have at least that many events.
                lambda queryset, times_seen: queryset.exclude(
                    times_seen__lt=times_seen,
                ),
            ),
            'times_seen_lower': CallbackCondition(
                # This condition represents the lower threshold for the
                # number of times an issue has been seen in an environment.
                # Since an issue can't be seen in an environment more times
                # than the issue was seen overall, we can safely exclude
                # any groups that haven't met that threshold.
                lambda queryset, times_seen: queryset.exclude(
                    times_seen__lt=times_seen,
                ),
            ),
            # The following conditions make a few assertions that are
            # correct in an abstract sense but may not accurately reflect
            # the existing implementation (see GH-5289). These assumptions
            # are that 1. The first seen time for a Group is the minimum
            # value of the first seen time for all of its GroupEnvironment
            # relations; 2. The last seen time for a Group is the maximum
            # value of the last seen time for all of its GroupEnvironment
            # relations; 3. The first seen time is always less than or
            # equal to the last seen time.
            'age_from': CallbackCondition(
                # This condition represents the lower threshold for "first
                # seen" time for an environment. Due to assertions #1 and
                # #3, we can exclude any groups where the "last seen" time
                # is prior to this timestamp.
                lambda queryset, first_seen: queryset.exclude(
                    last_seen__lt=first_seen,
                ),
            ),
            'age_to': CallbackCondition(
                # This condition represents the upper threshold for "first
                # seen" time for an environment. Due to assertion #1, we
                # can exclude any values where the group first seen is
                # greater than that threshold.
                lambda queryset, first_seen: queryset.exclude(
                    first_seen__gt=first_seen,
                ),
            ),
            'last_seen_from': CallbackCondition(
                # This condition represents the lower threshold for "last
                # seen" time for an environment. Due to assertion #2, we
                # can exclude any values where the group last seen value is
                # less than that threshold.
                lambda queryset, last_seen: queryset.exclude(last_seen__lt=last_seen, ),
            ),
            'last_seen_to': CallbackCondition(
                # This condition represents the upper threshold for "last
                # seen" time for an environment. Due to assertions #2 and
                # #3, we can exclude any values where the group first seen
                # value is greater than that threshold.
                lambda queryset, last_seen: queryset.exclude(first_seen__gt=last_seen, ),
            ),
        }).build(
            # Join Group to GroupEnvironment for the requested environment.
            group_queryset.extra(
                where=[
                    '{} = {}'.format(
                        get_sql_column(Group, 'id'),
                        get_sql_column(GroupEnvironment, 'group_id'),
                    ),
                    '{} = %s'.format(
                        get_sql_column(GroupEnvironment, 'environment_id'),
                    ),
                ],
                params=[environment.id],
                tables=[GroupEnvironment._meta.db_table],
            ),
            parameters,
        )

        get_sort_expression, sort_value_to_cursor_value = environment_sort_strategies[
            sort_by]

        # Per-environment statistics come from the `environment` group tag
        # values of (at most 10000) pre-filtered groups.
        group_tag_value_queryset = tagstore.get_group_tag_value_qs(
            project_id=project.id,
            group_id=set(
                group_queryset.values_list('id', flat=True)[:10000]),
            environment_id=environment.id,
            key='environment',
            value=environment.name,
        )

        if retention_window_start is not None:
            group_tag_value_queryset = group_tag_value_queryset.filter(
                last_seen__gte=retention_window_start)

        # {group_id: sort_value} for every group passing the exact
        # per-environment conditions.
        candidates = dict(
            QuerySetBuilder({
                'age_from': ScalarCondition('first_seen', 'gt'),
                'age_to': ScalarCondition('first_seen', 'lt'),
                'last_seen_from': ScalarCondition('last_seen', 'gt'),
                'last_seen_to': ScalarCondition('last_seen', 'lt'),
                'times_seen': CallbackCondition(
                    lambda queryset, times_seen: queryset.filter(
                        times_seen=times_seen),
                ),
                'times_seen_lower': ScalarCondition('times_seen', 'gt'),
                'times_seen_upper': ScalarCondition('times_seen', 'lt'),
            }).build(
                group_tag_value_queryset,
                parameters,
            ).extra(select={
                'sort_value': get_sort_expression(group_tag_value_queryset.model),
            }, ).values_list('group_id', 'sort_value'))

        if tags:
            # TODO: `get_group_ids_for_search_filter` should be able to
            # utilize the retention window start parameter for additional
            # optimizations.
            matches = tagstore.get_group_ids_for_search_filter(
                project_id=project.id,
                environment_id=environment.id,
                tags=tags,
                candidates=candidates.keys(),
                limit=len(candidates),
            )

            # Drop every candidate the tag filter did not return.
            for key in set(candidates) - set(matches or []):
                del candidates[key]

        result = SequencePaginator([(sort_value_to_cursor_value(score), id)
                                    for (id, score) in candidates.items()],
                                   reverse=True,
                                   **paginator_options).get_result(
                                       limit, cursor, count_hits=count_hits)

        # The paginator yields group ids; swap them for Group instances and
        # silently drop ids that no longer resolve.
        groups = Group.objects.in_bulk(result.results)
        result.results = [groups[k] for k in result.results if k in groups]

        return result
    else:
        event_queryset_builder = QuerySetBuilder({
            'date_from': ScalarCondition('datetime', 'gt'),
            'date_to': ScalarCondition('datetime', 'lt'),
        })
        if any(key in parameters for key in event_queryset_builder.conditions.keys()):
            # Restrict to groups having an event in the date range (capped at
            # 1000 ids).
            group_queryset = group_queryset.filter(id__in=list(
                event_queryset_builder.build(
                    Event.objects.filter(project_id=project.id),
                    parameters,
                ).distinct().values_list('group_id', flat=True)[:1000], ))

        group_queryset = QuerySetBuilder({
            'first_release': CallbackCondition(
                lambda queryset, version: queryset.filter(
                    first_release__organization_id=project.organization_id,
                    first_release__version=version,
                ),
            ),
            'age_from': ScalarCondition('first_seen', 'gt'),
            'age_to': ScalarCondition('first_seen', 'lt'),
            'last_seen_from': ScalarCondition('last_seen', 'gt'),
            'last_seen_to': ScalarCondition('last_seen', 'lt'),
            'times_seen': CallbackCondition(
                lambda queryset, times_seen: queryset.filter(times_seen=times_seen),
            ),
            'times_seen_lower': ScalarCondition('times_seen', 'gt'),
            'times_seen_upper': ScalarCondition('times_seen', 'lt'),
        }).build(
            group_queryset,
            parameters,
        ).extra(select={
            'sort_value': get_sort_clause(sort_by),
        }, )

        if tags:
            group_ids = tagstore.get_group_ids_for_search_filter(
                project_id=project.id,
                environment_id=None,
                tags=tags,
                candidates=None,
            )

            if group_ids:
                group_queryset = group_queryset.filter(id__in=group_ids)
            else:
                # No group matches the tag filter, so the result is empty.
                group_queryset = group_queryset.none()

        paginator_cls, sort_clause = sort_strategies[sort_by]
        group_queryset = group_queryset.order_by(sort_clause)
        paginator = paginator_cls(group_queryset, sort_clause, **paginator_options)
        return paginator.get_result(limit, cursor, count_hits=count_hits)
def query(
    self,
    projects,
    retention_window_start,
    group_queryset,
    environments,
    sort_by,
    limit,
    cursor,
    count_hits,
    paginator_options,
    search_filters,
    date_from,
    date_to,
):
    """Search groups by combining the Postgres ``group_queryset`` with Snuba.

    The search proceeds in three steps:

    1. If no upper date bound is given and the request is a plain
       ``date``-sorted, first-page, environment-less search with only
       issue-level filters, the result comes straight from Postgres.
    2. Otherwise the time window is clamped to the retention date. If the
       Postgres queryset yields few enough ids they are passed to Snuba as
       candidates (pre-filtering); if not, Snuba results are checked against
       the queryset afterwards (post-filtering).
    3. Snuba is queried in growing chunks until a page is filled, the results
       are exhausted, or the time budget runs out.

    :param projects: project objects; their ``id`` values are sent to Snuba.
    :param retention_window_start: earliest searchable datetime, or ``None``
        (a 90-day fallback is then used).
    :param group_queryset: Postgres queryset carrying the Django-side predicates.
    :param environments: optional environment objects; their ids go to Snuba.
    :param sort_by: key into ``sort_strategies``.
    :param limit: page size.
    :param cursor: pagination cursor, or ``None`` for the first page.
    :param count_hits: whether a total hit count (possibly estimated) is wanted.
    :param paginator_options: extra keyword arguments for the paginators.
    :param search_filters: parsed search filters; forwarded to ``snuba_search``.
    :param date_from: optional lower bound of the search window.
    :param date_to: optional upper bound of the search window.
    :returns: a paginator result whose ``results`` are ``Group`` instances, or
        ``EMPTY_RESULT`` when nothing can match.
    """
    now = timezone.now()
    end = None
    # Build a real list. On Python 3 `filter()` returns a lazy iterator that
    # is always truthy, so the check below would pass with no bounds at all
    # and `min()` would raise ValueError on the empty iterator.
    end_params = [
        param
        for param in [date_to, get_search_filter(search_filters, "date", "<")]
        if param
    ]
    if end_params:
        end = min(end_params)

    if not end:
        end = now + ALLOWED_FUTURE_DELTA

        # This search is for some time window that ends with "now",
        # so if the requested sort is `date` (`last_seen`) and there
        # are no other Snuba-based search predicates, we can simply
        # return the results from Postgres.
        if (
            cursor is None
            and sort_by == "date"
            and not environments
            # This handles tags and date parameters for search filters.
            and not [
                sf
                for sf in search_filters
                if sf.key.name not in issue_only_fields.union(["date"])
            ]
        ):
            group_queryset = group_queryset.order_by("-last_seen")
            paginator = DateTimePaginator(group_queryset, "-last_seen", **paginator_options)
            # When its a simple django-only search, we count_hits like normal
            return paginator.get_result(limit, cursor, count_hits=count_hits)

    # TODO: Presumably we only want to search back to the project's max
    # retention date, which may be closer than 90 days in the past, but
    # apparently `retention_window_start` can be None(?), so we need a
    # fallback.
    retention_date = max(
        param for param in [retention_window_start, now - timedelta(days=90)] if param
    )

    # TODO: We should try and consolidate all this logic together a little
    # better, maybe outside the backend. Should be easier once we're on
    # just the new search filters
    start_params = [date_from, retention_date, get_search_filter(search_filters, "date", ">")]
    start = max(param for param in start_params if param)

    end = max([retention_date, end])

    if start == retention_date and end == retention_date:
        # Both `start` and `end` must have been trimmed to `retention_date`,
        # so this entire search was against a time range that is outside of
        # retention. We'll return empty results to maintain backwards compatibility
        # with Django search (for now).
        return EMPTY_RESULT

    if start >= end:
        # TODO: This maintains backwards compatibility with Django search, but
        # in the future we should find a way to notify the user that their search
        # is invalid.
        return EMPTY_RESULT

    # Here we check if all the django filters reduce the set of groups down
    # to something that we can send down to Snuba in a `group_id IN (...)`
    # clause.
    max_candidates = options.get("snuba.search.max-pre-snuba-candidates")
    too_many_candidates = False
    # Fetch one more than the maximum so overflow can be detected.
    candidate_ids = list(group_queryset.values_list("id", flat=True)[: max_candidates + 1])
    metrics.timing("snuba.search.num_candidates", len(candidate_ids))
    if not candidate_ids:
        # no matches could possibly be found from this point on
        metrics.incr("snuba.search.no_candidates", skip_internal=False)
        return EMPTY_RESULT
    elif len(candidate_ids) > max_candidates:
        # If the pre-filter query didn't include anything to significantly
        # filter down the number of results (from 'first_release', 'query',
        # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
        # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
        # might have surpassed the `max_candidates`. In this case,
        # we *don't* want to pass candidates down to Snuba, and instead we
        # want Snuba to do all the filtering/sorting it can and *then* apply
        # this queryset to the results from Snuba, which we call
        # post-filtering.
        metrics.incr("snuba.search.too_many_candidates", skip_internal=False)
        too_many_candidates = True
        candidate_ids = []

    sort_field = sort_strategies[sort_by]
    chunk_growth = options.get("snuba.search.chunk-growth-rate")
    max_chunk_size = options.get("snuba.search.max-chunk-size")
    chunk_limit = limit
    offset = 0
    num_chunks = 0
    hits = None
    # Initialised here because it is read after the loop even when the loop
    # body never runs (e.g. a zero time budget).
    more_results = False

    paginator_results = EMPTY_RESULT
    result_groups = []
    result_group_ids = set()

    max_time = options.get("snuba.search.max-total-chunk-time-seconds")
    time_start = time.time()

    if count_hits and (too_many_candidates or cursor is not None):
        # If we had too many candidates to reasonably pass down to snuba,
        # or if we have a cursor that bisects the overall result set (such
        # that our query only sees results on one side of the cursor) then
        # we need an alternative way to figure out the total hits that this
        # query has.

        # To do this, we get a sample of groups matching the snuba side of
        # the query, and see how many of those pass the post-filter in
        # postgres. This should give us an estimate of the total number of
        # snuba matches that will be overall matches, which we can use to
        # get an estimate for X-Hits.

        # The sampling is not simple random sampling. It will return *all*
        # matching groups if there are less than N groups matching the
        # query, or it will return a random, deterministic subset of N of
        # the groups if there are more than N overall matches. This means
        # that the "estimate" is actually an accurate result when there are
        # less than N matching groups.

        # The number of samples required to achieve a certain error bound
        # with a certain confidence interval can be calculated from a
        # rearrangement of the normal approximation (Wald) confidence
        # interval formula:
        #
        # https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval
        #
        # Effectively if we want the estimate to be within +/- 10% of the
        # real value with 95% confidence, we would need (1.96^2 * p*(1-p))
        # / 0.1^2 samples. With a starting assumption of p=0.5 (this
        # requires the most samples) we would need 96 samples to achieve
        # +/-10% @ 95% confidence.

        sample_size = options.get("snuba.search.hits-sample-size")
        snuba_groups, snuba_total = snuba_search(
            start=start,
            end=end,
            project_ids=[p.id for p in projects],
            environment_ids=environments and [environment.id for environment in environments],
            sort_field=sort_field,
            limit=sample_size,
            offset=0,
            get_sample=True,
            search_filters=search_filters,
        )
        snuba_count = len(snuba_groups)
        if snuba_count == 0:
            return EMPTY_RESULT
        else:
            filtered_count = group_queryset.filter(
                id__in=[gid for gid, _ in snuba_groups]
            ).count()

            hit_ratio = filtered_count / float(snuba_count)
            hits = int(hit_ratio * snuba_total)

    # Do smaller searches in chunks until we have enough results
    # to answer the query (or hit the end of possible results). We do
    # this because a common case for search is to return 100 groups
    # sorted by `last_seen`, and we want to avoid returning all of
    # a project's groups and then post-sorting them all in Postgres
    # when typically the first N results will do.
    while (time.time() - time_start) < max_time:
        num_chunks += 1

        # grow the chunk size on each iteration to account for huge projects
        # and weird queries, up to a max size
        chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
        # but if we have candidate_ids always query for at least that many items
        chunk_limit = max(chunk_limit, len(candidate_ids))

        # {group_id: group_score, ...}
        snuba_groups, total = snuba_search(
            start=start,
            end=end,
            project_ids=[p.id for p in projects],
            environment_ids=environments and [environment.id for environment in environments],
            sort_field=sort_field,
            cursor=cursor,
            candidate_ids=candidate_ids,
            limit=chunk_limit,
            offset=offset,
            search_filters=search_filters,
        )
        metrics.timing("snuba.search.num_snuba_results", len(snuba_groups))
        count = len(snuba_groups)
        more_results = count >= limit and (offset + limit) < total
        offset += len(snuba_groups)

        if not snuba_groups:
            break

        if candidate_ids:
            # pre-filtered candidates were passed down to Snuba, so we're
            # finished with filtering and these are the only results. Note
            # that because we set the chunk size to at least the size of
            # the candidate_ids, we know we got all of them (ie there are
            # no more chunks after the first)
            result_groups = snuba_groups
            if count_hits and hits is None:
                hits = len(snuba_groups)
        else:
            # pre-filtered candidates were *not* passed down to Snuba,
            # so we need to do post-filtering to verify Sentry DB predicates
            filtered_group_ids = group_queryset.filter(
                id__in=[gid for gid, _ in snuba_groups]
            ).values_list("id", flat=True)

            group_to_score = dict(snuba_groups)
            for group_id in filtered_group_ids:
                if group_id in result_group_ids:
                    # because we're doing multiple Snuba queries, which
                    # happen outside of a transaction, there is a small possibility
                    # of groups moving around in the sort scoring underneath us,
                    # so we at least want to protect against duplicates
                    continue

                group_score = group_to_score[group_id]
                result_group_ids.add(group_id)
                result_groups.append((group_id, group_score))

        # TODO do we actually have to rebuild this SequencePaginator every time
        # or can we just make it after we've broken out of the loop?
        paginator_results = SequencePaginator(
            [(score, id) for (id, score) in result_groups], reverse=True, **paginator_options
        ).get_result(limit, cursor, known_hits=hits)

        # break the query loop for one of three reasons:
        # * we started with Postgres candidates and so only do one Snuba query max
        # * the paginator is returning enough results to satisfy the query (>= the limit)
        # * there are no more groups in Snuba to post-filter
        if candidate_ids or len(paginator_results.results) >= limit or not more_results:
            break

    # HACK: We're using the SequencePaginator to mask the complexities of going
    # back and forth between two databases. This causes a problem with pagination
    # because we're 'lying' to the SequencePaginator (it thinks it has the entire
    # result set in memory when it does not). For this reason we need to make some
    # best guesses as to whether the `prev` and `next` cursors have more results.

    if len(paginator_results.results) == limit and more_results:
        # Because we are going back and forth between DBs there is a small
        # chance that we will hand the SequencePaginator exactly `limit`
        # items. In this case the paginator will assume there are no more
        # results, so we need to override the `next` cursor's results.
        paginator_results.next.has_results = True

    if cursor is not None and (not cursor.is_prev or len(paginator_results.results) > 0):
        # If the user passed a cursor, and it isn't already a 0 result `is_prev`
        # cursor, then it's worth allowing them to go back a page to check for
        # more results.
        paginator_results.prev.has_results = True

    metrics.timing("snuba.search.num_chunks", num_chunks)

    # The paginator holds group ids; swap them for Group instances and drop
    # ids that no longer resolve.
    groups = Group.objects.in_bulk(paginator_results.results)
    paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

    return paginator_results
def get_tag_value_paginator_for_projects(
    self,
    projects,
    environments,
    key,
    start=None,
    end=None,
    query=None,
    order_by="-last_seen",
    include_transactions=False,
):
    """List the values observed for ``key`` across ``projects`` as a paginator.

    A single Snuba query (at most 1000 rows) groups by the resolved column and
    aggregates ``times_seen`` / ``first_seen`` / ``last_seen``. Each row is
    wrapped in a ``TagValue`` and scored by its ``last_seen`` timestamp.

    :param projects: project ids to filter on.
    :param environments: optional environment filter; ignored when falsy.
    :param key: tag key or column alias to list values for.
    :param start: lower time bound forwarded to ``snuba.query``.
    :param end: upper time bound forwarded to ``snuba.query``.
    :param query: optional substring used to narrow the suggested values.
    :param order_by: must be ``"-last_seen"``.
    :param include_transactions: when true, the Discover-specific handling is
        enabled (Discover column resolution, transaction status names,
        project slugs and the user display alias).
    :returns: a ``SequencePaginator`` of ``(score, TagValue)`` pairs.
    :raises ValueError: for any other ``order_by``.
    """
    from sentry.api.paginator import SequencePaginator

    if order_by != "-last_seen":
        raise ValueError("Unsupported order_by: %s" % order_by)

    dataset = Dataset.Events
    snuba_key = snuba.get_snuba_column_name(key)
    if include_transactions and snuba_key.startswith("tags["):
        # Unknown to Events; see whether Discover resolves it to a real column.
        snuba_key = snuba.get_snuba_column_name(key, dataset=Dataset.Discover)
        if not snuba_key.startswith("tags["):
            dataset = Dataset.Discover

    # These columns cannot be searched like the others because of their types,
    # and suggesting values for them has little use, so they are skipped.
    #
    # event_id:  a FixedString; LIKE is not allowed on it (!= is). It is
    #            skipped entirely for consistency, and ids are not human
    #            readable anyway.
    # timestamp: a DateTime; neither LIKE nor != works on it. A possible
    #            future approach is to turn a partial date such as 2020-07
    #            into a bounded range
    #            (>= 2020-07-01T00:00:00 AND <= 2020-07-31T23:59:59).
    # time:      computed from timestamp, so it has the same limitation.
    if snuba_key in {"event_id", "timestamp", "time"}:
        return SequencePaginator([])

    # These keys only ever take two fixed values, so no query is needed.
    if key in {"error.handled", "error.unhandled"}:
        return SequencePaginator(
            [
                (
                    rank,
                    TagValue(
                        key=key, value=literal, times_seen=None, first_seen=None, last_seen=None
                    ),
                )
                for rank, literal in enumerate(("true", "false"), 1)
            ]
        )

    conditions = []
    # Transaction status is stored as a code but users type the names.
    is_transaction_status = snuba_key == "transaction_status"

    if include_transactions and is_transaction_status:
        # Filter on the codes whose *name* contains the query (all codes when
        # there is no query).
        matching_codes = []
        for code, name in six.iteritems(SPAN_STATUS_CODE_TO_NAME):
            if not query or query in name:
                matching_codes.append(code)
        if not matching_codes:
            return SequencePaginator([])
        conditions.append([snuba_key, "IN", matching_codes])
    elif key in FUZZY_NUMERIC_KEYS:
        # A numeric query becomes a window of +/- FUZZY_NUMERIC_DISTANCE.
        if query is not None and query.isdigit():
            center = int(query)
            conditions.append([snuba_key, ">=", center - FUZZY_NUMERIC_DISTANCE])
            conditions.append([snuba_key, "<=", center + FUZZY_NUMERIC_DISTANCE])
    elif include_transactions and key == PROJECT_ALIAS:
        # Match the query against project slugs, then query Snuba by id.
        slug_filters = {"id__in": projects}
        if query:
            slug_filters["slug__icontains"] = query
        project_rows = Project.objects.filter(**slug_filters).values("id", "slug")
        if not project_rows.exists():
            return SequencePaginator([])

        project_slugs = {row["id"]: row["slug"] for row in project_rows}
        projects = [row["id"] for row in project_rows]
        snuba_key = "project_id"
        dataset = Dataset.Discover
    else:
        condition_column = snuba_key
        if include_transactions and key == USER_DISPLAY_ALIAS:
            # The user display alias is a pseudo column in Discover, built by
            # coalescing several user attributes. Fetch that expression and
            # resolve it for the Discover dataset. The return value of
            # resolve_complex_column is not used here (presumably it resolves
            # the expression in place).
            dataset = Dataset.Discover
            resolver = snuba.resolve_column(dataset)
            condition_column = FIELD_ALIASES[USER_DISPLAY_ALIAS].get_field()
            snuba.resolve_complex_column(condition_column, resolver)
        elif condition_column in BLACKLISTED_COLUMNS:
            condition_column = "tags[%s]" % (key,)

        if query:
            conditions.append([condition_column, "LIKE", u"%{}%".format(query)])
        else:
            conditions.append([condition_column, "!=", ""])

    filter_keys = {"project_id": projects}
    if environments:
        filter_keys["environment"] = environments

    results = snuba.query(
        dataset=dataset,
        start=start,
        end=end,
        groupby=[snuba_key],
        filter_keys=filter_keys,
        aggregations=[
            ["count()", "", "times_seen"],
            ["min", "timestamp", "first_seen"],
            ["max", "timestamp", "last_seen"],
        ],
        conditions=conditions,
        orderby=order_by,
        # TODO: This means they can't actually paginate all TagValues.
        limit=1000,
        arrayjoin=snuba.get_arrayjoin(snuba_key),
        referrer="tagstore.get_tag_value_paginator_for_projects",
    )

    if include_transactions:
        if is_transaction_status:
            # Present status names rather than the stored codes.
            by_name = OrderedDict()
            for code, data in six.iteritems(results):
                by_name[SPAN_STATUS_CODE_TO_NAME[code]] = data
            results = by_name
        elif key == PROJECT_ALIAS:
            # Present project slugs rather than ids; unknown ids are dropped.
            by_slug = OrderedDict()
            for project_id, data in six.iteritems(results):
                if project_id in project_slugs:
                    by_slug[project_slugs[project_id]] = data
            results = by_slug

    tag_values = [
        TagValue(key=key, value=six.text_type(value), **fix_tag_value_data(data))
        for value, data in six.iteritems(results)
    ]

    descending = order_by.startswith("-")
    score_attr = order_by.lstrip("-")
    # Score is the timestamp scaled by 1000 (presumably epoch milliseconds).
    scored = [
        (int(to_timestamp(getattr(tag_value, score_attr)) * 1000), tag_value)
        for tag_value in tag_values
    ]
    return SequencePaginator(scored, reverse=descending)
def test_hits(self):
    """``hits`` reports the full sequence length, not the page size."""
    total = 10
    scored_items = [(value, value) for value in range(total)]
    page = SequencePaginator(scored_items).get_result(5, count_hits=True)
    assert page.hits == total
def _query(self, projects, retention_window_start, group_queryset, tags, environments, sort_by, limit, cursor, count_hits, paginator_options, **parameters):
    """Search groups by combining the Postgres ``group_queryset`` with Snuba.

    The ``first_release`` parameter is applied in Postgres, joined through
    ``GroupEnvironment`` when environments are given. If there is no
    ``date_to`` and the request is a plain first-page ``date`` sort with no
    tags, environments or Snuba-side parameters, the result comes straight
    from Postgres.

    Otherwise the time window is clamped to the retention date and Snuba is
    queried in growing chunks. Postgres candidate ids are passed down when
    there are few enough of them; if not, Snuba results are post-filtered
    against the queryset.

    :param projects: project objects; ``projects[0].organization_id`` scopes
        the release lookup and every ``id`` is sent to Snuba.
    :param retention_window_start: earliest searchable datetime, or ``None``
        (a 90-day fallback is then used).
    :param group_queryset: base Group queryset carrying Django-side predicates.
    :param tags: tag filters forwarded to ``snuba_search``.
    :param environments: environment objects, or ``None`` for no environment join.
    :param sort_by: key into ``sort_strategies``.
    :param limit: page size.
    :param cursor: pagination cursor, or ``None`` for the first page.
    :param count_hits: whether a total hit count (possibly estimated) is wanted.
    :param paginator_options: extra keyword arguments for the paginators.
    :param parameters: remaining search parameters (``date_from``, ``date_to``,
        ``age_from``, ...); also forwarded to ``snuba_search``.
    :returns: a paginator result whose ``results`` are ``Group`` instances, or
        ``EMPTY_RESULT`` when nothing can match.
    """

    # TODO: Product decision: we currently search Group.message to handle
    # the `query` parameter, because that's what we've always done. We could
    # do that search against every event in Snuba instead, but results may
    # differ.

    # TODO: It's possible `first_release` could be handled by Snuba.
    if environments is not None:
        group_queryset = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(
                lambda queryset, version: queryset.extra(
                    where=[
                        '{} = {}'.format(
                            ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                            ds.get_sql_column(Release, 'id'),
                        ),
                        '{} = %s'.format(
                            ds.get_sql_column(Release, 'organization'),
                        ),
                        '{} = %s'.format(
                            ds.get_sql_column(Release, 'version'),
                        ),
                    ],
                    params=[projects[0].organization_id, version],
                    tables=[Release._meta.db_table],
                ),
            ),
        }).build(
            # Join Group to GroupEnvironment for the requested environments;
            # one `%s` placeholder is emitted per environment id.
            group_queryset.extra(
                where=[
                    u'{} = {}'.format(
                        ds.get_sql_column(Group, 'id'),
                        ds.get_sql_column(GroupEnvironment, 'group_id'),
                    ),
                    u'{} IN ({})'.format(
                        ds.get_sql_column(GroupEnvironment, 'environment_id'),
                        ', '.join(['%s' for e in environments])
                    ),
                ],
                params=[environment.id for environment in environments],
                tables=[GroupEnvironment._meta.db_table],
            ),
            parameters,
        )
    else:
        group_queryset = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(
                lambda queryset, version: queryset.filter(
                    first_release__organization_id=projects[0].organization_id,
                    first_release__version=version,
                ),
            ),
        }).build(
            group_queryset,
            parameters,
        )

    now = timezone.now()
    end = parameters.get('date_to')
    if not end:
        end = now + ALLOWED_FUTURE_DELTA

        # This search is for some time window that ends with "now",
        # so if the requested sort is `date` (`last_seen`) and there
        # are no other Snuba-based search predicates, we can simply
        # return the results from Postgres.
        if cursor is None \
                and sort_by == 'date' \
                and not tags \
                and not environments \
                and not any(param in parameters for param in [
                    'age_from', 'age_to', 'last_seen_from',
                    'last_seen_to', 'times_seen', 'times_seen_lower',
                    'times_seen_upper'
                ]):
            group_queryset = group_queryset.order_by('-last_seen')
            paginator = DateTimePaginator(group_queryset, '-last_seen', **paginator_options)
            # When its a simple django-only search, we count_hits like normal
            return paginator.get_result(limit, cursor, count_hits=count_hits)

    # TODO: Presumably we only want to search back to the project's max
    # retention date, which may be closer than 90 days in the past, but
    # apparently `retention_window_start` can be None(?), so we need a
    # fallback.
    retention_date = max(
        filter(None, [
            retention_window_start,
            now - timedelta(days=90)
        ])
    )

    start = max(
        filter(None, [
            retention_date,
            parameters.get('date_from'),
        ])
    )

    end = max([
        retention_date,
        end
    ])

    if start == retention_date and end == retention_date:
        # Both `start` and `end` must have been trimmed to `retention_date`,
        # so this entire search was against a time range that is outside of
        # retention. We'll return empty results to maintain backwards compatibility
        # with Django search (for now).
        return EMPTY_RESULT

    if start >= end:
        # TODO: This maintains backwards compatibility with Django search, but
        # in the future we should find a way to notify the user that their search
        # is invalid.
        return EMPTY_RESULT

    # Here we check if all the django filters reduce the set of groups down
    # to something that we can send down to Snuba in a `group_id IN (...)`
    # clause.
    max_candidates = options.get('snuba.search.max-pre-snuba-candidates')
    # Fetch one more than the maximum so overflow can be detected.
    candidate_ids = list(
        group_queryset.values_list('id', flat=True)[:max_candidates + 1]
    )
    metrics.timing('snuba.search.num_candidates', len(candidate_ids))

    if not candidate_ids:
        # no matches could possibly be found from this point on
        metrics.incr('snuba.search.no_candidates', skip_internal=False)
        return EMPTY_RESULT
    elif len(candidate_ids) > max_candidates:
        # If the pre-filter query didn't include anything to significantly
        # filter down the number of results (from 'first_release', 'query',
        # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
        # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
        # might have surpassed the `max_candidates`. In this case,
        # we *don't* want to pass candidates down to Snuba, and instead we
        # want Snuba to do all the filtering/sorting it can and *then* apply
        # this queryset to the results from Snuba, which we call
        # post-filtering.
        metrics.incr('snuba.search.too_many_candidates', skip_internal=False)
        candidate_ids = None

    sort_field = sort_strategies[sort_by]
    chunk_growth = options.get('snuba.search.chunk-growth-rate')
    max_chunk_size = options.get('snuba.search.max-chunk-size')
    chunk_limit = limit
    offset = 0
    num_chunks = 0
    hits = None

    paginator_results = EMPTY_RESULT
    result_groups = []
    result_group_ids = set()

    max_time = options.get('snuba.search.max-total-chunk-time-seconds')
    time_start = time.time()

    if count_hits and candidate_ids is None:
        # If we have no candidates, get a random sample of groups matching
        # the snuba side of the query, and see how many of those pass the
        # post-filter in postgres. This should give us an estimate of the
        # total number of snuba matches that will be overall matches, which
        # we can use to get an estimate for X-Hits. Note no cursor, so we
        # are always estimating the total hits.

        # The number of samples required to achieve a certain error bound
        # with a certain confidence interval can be calculated from a
        # rearrangement of the normal approximation (Wald) confidence
        # interval formula:
        #
        # https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval
        #
        # Effectively if we want the estimate to be within +/- 10% of the
        # real value with 95% confidence, we would need (1.96^2 * p*(1-p))
        # / 0.1^2 samples. With a starting assumption of p=0.5 (this
        # requires the most samples) we would need 96 samples to achieve
        # +/-10% @ 95% confidence.

        sample_size = options.get('snuba.search.hits-sample-size')
        snuba_groups, snuba_total = snuba_search(
            start=start,
            end=end,
            project_ids=[p.id for p in projects],
            environment_ids=environments and [environment.id for environment in environments],
            tags=tags,
            sort_field=sort_field,
            limit=sample_size,
            offset=0,
            get_sample=True,
            **parameters
        )
        snuba_count = len(snuba_groups)
        if snuba_count == 0:
            return EMPTY_RESULT
        else:
            filtered_count = group_queryset.filter(
                id__in=[gid for gid, _ in snuba_groups]
            ).count()

            hit_ratio = filtered_count / float(snuba_count)
            hits = int(hit_ratio * snuba_total)

    # Do smaller searches in chunks until we have enough results
    # to answer the query (or hit the end of possible results). We do
    # this because a common case for search is to return 100 groups
    # sorted by `last_seen`, and we want to avoid returning all of
    # a project's groups and then post-sorting them all in Postgres
    # when typically the first N results will do.
    while (time.time() - time_start) < max_time:
        num_chunks += 1

        # grow the chunk size on each iteration to account for huge projects
        # and weird queries, up to a max size
        chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
        # but if we have candidate_ids always query for at least that many items
        chunk_limit = max(chunk_limit, len(candidate_ids) if candidate_ids else 0)

        # {group_id: group_score, ...}
        snuba_groups, total = snuba_search(
            start=start,
            end=end,
            project_ids=[p.id for p in projects],
            environment_ids=environments and [environment.id for environment in environments],
            tags=tags,
            sort_field=sort_field,
            cursor=cursor,
            candidate_ids=candidate_ids,
            limit=chunk_limit,
            offset=offset,
            **parameters
        )
        metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))
        count = len(snuba_groups)
        more_results = count >= limit and (offset + limit) < total
        offset += len(snuba_groups)

        if not snuba_groups:
            break

        if candidate_ids:
            # pre-filtered candidates were passed down to Snuba, so we're
            # finished with filtering and these are the only results. Note
            # that because we set the chunk size to at least the size of
            # the candidate_ids, we know we got all of them (ie there are
            # no more chunks after the first)
            result_groups = snuba_groups
            if count_hits:
                hits = len(snuba_groups)
        else:
            # pre-filtered candidates were *not* passed down to Snuba,
            # so we need to do post-filtering to verify Sentry DB predicates
            filtered_group_ids = group_queryset.filter(
                id__in=[gid for gid, _ in snuba_groups]
            ).values_list('id', flat=True)

            group_to_score = dict(snuba_groups)
            for group_id in filtered_group_ids:
                if group_id in result_group_ids:
                    # because we're doing multiple Snuba queries, which
                    # happen outside of a transaction, there is a small possibility
                    # of groups moving around in the sort scoring underneath us,
                    # so we at least want to protect against duplicates
                    continue

                group_score = group_to_score[group_id]
                result_group_ids.add(group_id)
                result_groups.append((group_id, group_score))

            if count_hits:
                if not more_results:
                    # We know we have got all possible groups from snuba and filtered
                    # them all down, so we have all hits.
                    # TODO this probably doesn't work because we could be on page N
                    # and not be including hits from previous pages.
                    hits = len(result_groups)
                else:
                    # We also could have underestimated hits from our sample and have
                    # already seen more hits than the estimate, so make sure hits is
                    # at least as big as what we have seen.
                    hits = max(hits, len(result_groups))

        # TODO do we actually have to rebuild this SequencePaginator every time
        # or can we just make it after we've broken out of the loop?
        paginator_results = SequencePaginator(
            [(score, id) for (id, score) in result_groups],
            reverse=True,
            **paginator_options
        ).get_result(limit, cursor, known_hits=hits)

        # break the query loop for one of three reasons:
        # * we started with Postgres candidates and so only do one Snuba query max
        # * the paginator is returning enough results to satisfy the query (>= the limit)
        # * there are no more groups in Snuba to post-filter
        if candidate_ids \
                or len(paginator_results.results) >= limit \
                or not more_results:
            break

    # HACK: We're using the SequencePaginator to mask the complexities of going
    # back and forth between two databases. This causes a problem with pagination
    # because we're 'lying' to the SequencePaginator (it thinks it has the entire
    # result set in memory when it does not). For this reason we need to make some
    # best guesses as to whether the `prev` and `next` cursors have more results.
    if len(paginator_results.results) == limit and more_results:
        # Because we are going back and forth between DBs there is a small
        # chance that we will hand the SequencePaginator exactly `limit`
        # items. In this case the paginator will assume there are no more
        # results, so we need to override the `next` cursor's results.
        paginator_results.next.has_results = True

    if cursor is not None and (not cursor.is_prev or len(paginator_results.results) > 0):
        # If the user passed a cursor, and it isn't already a 0 result `is_prev`
        # cursor, then it's worth allowing them to go back a page to check for
        # more results.
        paginator_results.prev.has_results = True

    metrics.timing('snuba.search.num_chunks', num_chunks)

    # The paginator holds group ids; swap them for Group instances and drop
    # ids that no longer resolve.
    groups = Group.objects.in_bulk(paginator_results.results)
    paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

    return paginator_results