Example #1
def inbox_search(
    projects: Sequence[Project],
    environments: Optional[Sequence[Environment]] = None,
    limit: int = 100,
    cursor: Optional[Cursor] = None,
    count_hits: bool = False,
    search_filters: Optional[Sequence[SearchFilter]] = None,
    date_from: Optional[datetime] = None,
    date_to: Optional[datetime] = None,
    max_hits: Optional[int] = None,
) -> CursorResult:
    now: datetime = timezone.now()
    end: Optional[datetime] = None
    end_params: List[datetime] = [
        _f for _f in [date_to,
                      get_search_filter(search_filters, "date", "<")] if _f
    ]
    if end_params:
        end = min(end_params)

    end = end if end else now + ALLOWED_FUTURE_DELTA

    # We only want to search back a week at most, since that's the oldest an
    # inbox row can be. (A stand-alone sketch of this window clamping follows
    # the example.)
    earliest_date = now - timedelta(days=7)
    start_params = [
        date_from, earliest_date,
        get_search_filter(search_filters, "date", ">")
    ]
    start = max([_f for _f in start_params if _f])
    end = max([earliest_date, end])

    if start >= end:
        return Paginator(Group.objects.none()).get_result()

    # Make sure search terms are valid
    invalid_search_terms = [
        str(sf) for sf in (search_filters or ())
        if sf.key.name not in allowed_inbox_search_terms
    ]
    if invalid_search_terms:
        raise InvalidSearchQuery(
            f"Invalid search terms for 'inbox' search: {invalid_search_terms}")

    # Make sure this is an inbox search
    if not get_search_filter(search_filters, "for_review", "="):
        raise InvalidSearchQuery(
            "Sort key 'inbox' only supported for inbox search")

    if get_search_filter(search_filters, "status",
                         "=") != GroupStatus.UNRESOLVED:
        raise InvalidSearchQuery(
            "Inbox search only works for 'unresolved' status")

    # We just filter on `GroupInbox.date_added` here, and don't filter by date
    # on the group. This keeps the query simpler and faster in some edge cases,
    # and date_added is a good enough proxy when we're using this sort.
    qs = GroupInbox.objects.filter(
        date_added__gte=start,
        date_added__lte=end,
        project__in=projects,
    )

    if environments is not None:
        environment_ids: List[int] = [
            environment.id for environment in environments
        ]
        qs = qs.filter(group_id__in=GroupEnvironment.objects.filter(
            environment_id__in=environment_ids).values_list(
                "group_id", flat=True).distinct())

    owner_search = get_search_filter(search_filters, "assigned_or_suggested",
                                     "=")
    if owner_search:
        qs = qs.filter(
            assigned_or_suggested_filter(owner_search,
                                         projects,
                                         field_filter="group_id"))

    paginator = DateTimePaginator(qs.order_by("date_added"), "-date_added")
    results = paginator.get_result(limit,
                                   cursor,
                                   count_hits=count_hits,
                                   max_hits=max_hits)

    # We want to return groups from the endpoint, but have the cursor be related to the
    # GroupInbox rows. So we paginate on the GroupInbox results queryset, then fetch
    # the group_ids out and use them to get the actual groups.
    group_qs = Group.objects.filter(
        id__in=[r.group_id for r in results.results],
        project__in=projects,
        status=GroupStatus.UNRESOLVED,
    )
    groups: Mapping[int, Group] = {g.id: g for g in group_qs}
    results.results = [
        groups[r.group_id] for r in results.results if r.group_id in groups
    ]
    return results
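
The window clamping at the top of Example #1 is self-contained logic, so a hedged stand-alone sketch of it follows. The helper name resolve_inbox_window and the ALLOWED_FUTURE_DELTA value used here are illustrative assumptions rather than part of the original module, and the get_search_filter date bounds are omitted for brevity.

from datetime import datetime, timedelta, timezone
from typing import Optional, Tuple

ALLOWED_FUTURE_DELTA = timedelta(minutes=10)  # assumed value, for illustration only


def resolve_inbox_window(
    date_from: Optional[datetime], date_to: Optional[datetime]
) -> Optional[Tuple[datetime, datetime]]:
    """Clamp the requested range to the one-week inbox horizon, as in inbox_search.

    Expects timezone-aware datetimes. Returns None when the clamped window is empty,
    mirroring the early return of an empty paginator result in the example above.
    """
    now = datetime.now(timezone.utc)
    earliest_date = now - timedelta(days=7)  # inbox rows are at most a week old
    end = date_to or (now + ALLOWED_FUTURE_DELTA)
    end = max(earliest_date, end)            # never let the window end before the horizon
    start = max(d for d in (date_from, earliest_date) if d)
    return None if start >= end else (start, end)
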
Example #2
from __future__ import absolute_import
import time
from datetime import timedelta
from hashlib import md5

from django.utils import timezone

from sentry import options
from sentry.api.event_search import convert_search_filter_to_snuba_query
from sentry.api.paginator import DateTimePaginator, SequencePaginator, Paginator
from sentry.constants import ALLOWED_FUTURE_DELTA
from sentry.models import Group
from sentry.utils import snuba, metrics
from sentry.snuba.dataset import Dataset

EMPTY_RESULT = Paginator(Group.objects.none()).get_result()

# mapping from query parameter sort name to underlying scoring aggregation name
sort_strategies = {
    "date": "last_seen",
    "freq": "times_seen",
    "new": "first_seen",
    "priority": "priority",
}

dependency_aggregations = {"priority": ["last_seen", "times_seen"]}

aggregation_defs = {
    "times_seen": ["count()", ""],
    "first_seen": ["multiply(toUInt64(min(timestamp)), 1000)", ""],
    "last_seen": ["multiply(toUInt64(max(timestamp)), 1000)", ""],
Example #3
File: backend.py Project: mnoble/sentry
    def _query(self, project, retention_window_start, group_queryset, tags, environment,
               sort_by, limit, cursor, count_hits, paginator_options, **parameters):

        # TODO: Product decision: we currently search Group.message to handle
        # the `query` parameter, because that's what we've always done. We could
        # do that search against every event in Snuba instead, but results may
        # differ.

        now = timezone.now()
        end = parameters.get('date_to') or (now + ALLOWED_FUTURE_DELTA)
        # TODO: Presumably we want to search back to the project's full retention,
        #       which may be higher than 90 days in the past, but apparently
        #       `retention_window_start` can be None(?), so we need a fallback.
        start = max(
            filter(None, [
                retention_window_start,
                parameters.get('date_from'),
                now - timedelta(days=90)
            ])
        )
        assert start < end

        # TODO: It's possible `first_release` could be handled by Snuba.
        if environment is not None:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.extra(
                        where=[
                            '{} = {}'.format(
                                ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                                ds.get_sql_column(Release, 'id'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'organization'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'version'),
                            ),
                        ],
                        params=[project.organization_id, version],
                        tables=[Release._meta.db_table],
                    ),
                ),
            }).build(
                group_queryset.extra(
                    where=[
                        u'{} = {}'.format(
                            ds.get_sql_column(Group, 'id'),
                            ds.get_sql_column(GroupEnvironment, 'group_id'),
                        ),
                        u'{} = %s'.format(
                            ds.get_sql_column(GroupEnvironment, 'environment_id'),
                        ),
                    ],
                    params=[environment.id],
                    tables=[GroupEnvironment._meta.db_table],
                ),
                parameters,
            )
        else:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.filter(
                        first_release__organization_id=project.organization_id,
                        first_release__version=version,
                    ),
                ),
            }).build(
                group_queryset,
                parameters,
            )

        # pre-filter query
        candidate_hashes = dict(
            GroupHash.objects.filter(
                group__in=group_queryset
            ).values_list(
                'hash', 'group_id'
            )[:MAX_PRE_SNUBA_CANDIDATES + 1]
        )
        metrics.timing('snuba.search.num_candidates', len(candidate_hashes))

        if not candidate_hashes:
            # no matches could possibly be found from this point on
            metrics.incr('snuba.search.no_candidates')
            return Paginator(Group.objects.none()).get_result()
        elif len(candidate_hashes) > MAX_PRE_SNUBA_CANDIDATES:
            # If the pre-filter query didn't include anything to significantly
            # filter down the number of results (from 'first_release', 'query',
            # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
            # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
            # might have surpassed the MAX_PRE_SNUBA_CANDIDATES. In this case,
            # we *don't* want to pass candidates down to Snuba, and instead we
            # want Snuba to do all the filtering/sorting it can and *then* apply
            # this queryset to the results from Snuba, which we call
            # post-filtering.
            metrics.incr('snuba.search.too_many_candidates')
            candidate_hashes = None

        sort, extra_aggregations, score_fn = sort_strategies[sort_by]

        # {group_id: group_score, ...}
        snuba_groups = snuba_search(
            project_id=project.id,
            environment_id=environment and environment.id,
            tags=tags,
            start=start,
            end=end,
            sort=sort,
            extra_aggregations=extra_aggregations,
            score_fn=score_fn,
            candidate_hashes=candidate_hashes,
            **parameters
        )
        metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))

        if candidate_hashes:
            # pre-filtered candidates were passed down to Snuba,
            # so we're finished with filtering
            result_groups = snuba_groups.items()
        else:
            # pre-filtered candidates were *not* passed down to Snuba,
            # so we need to do post-filtering to verify Sentry DB predicates
            result_groups = []
            i = 0
            for i, chunk in enumerate(chunked(snuba_groups.items(), MAX_POST_SNUBA_CHUNK), 1):
                filtered_group_ids = group_queryset.filter(
                    id__in=[gid for gid, _ in chunk]
                ).values_list('id', flat=True)

                result_groups.extend(
                    (group_id, snuba_groups[group_id])
                    for group_id in filtered_group_ids
                )

            metrics.timing('snuba.search.num_post_filters', i)

        paginator_results = SequencePaginator(
            [(score, id) for (id, score) in result_groups],
            reverse=True,
            **paginator_options
        ).get_result(limit, cursor, count_hits=count_hits)

        groups = Group.objects.in_bulk(paginator_results.results)
        paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

        return paginator_results
Example #4
    def empty_result(self):
        return Paginator(Group.objects.none()).get_result()
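
The `Paginator(Group.objects.none()).get_result()` idiom recurs in nearly every example as the canonical "no matches" return value. Below is a self-contained stand-in (not Sentry code) illustrating why backends return this shape rather than a bare empty list: callers always receive the same cursor-result fields, so serialization and pagination need no special casing. The CursorResultStub name and fields are assumptions modeled on how `.results` is used in the surrounding examples.

from dataclasses import dataclass, field
from typing import Any, List, Optional


@dataclass
class CursorResultStub:
    # assumed shape: the real CursorResult also carries cursor objects and hit counts
    results: List[Any] = field(default_factory=list)
    prev: Optional[str] = None
    next: Optional[str] = None
    hits: Optional[int] = None


def empty_result() -> CursorResultStub:
    # plays the role of Paginator(Group.objects.none()).get_result(): zero rows, no cursors
    return CursorResultStub()


page = empty_result()
assert page.results == [] and page.next is None
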
Example #5
    def query(self,
              project,
              query=None,
              status=None,
              tags=None,
              bookmarked_by=None,
              assigned_to=None,
              first_release=None,
              sort_by='date',
              date_filter='last_seen',
              date_from=None,
              date_to=None,
              cursor=None,
              limit=100):
        from sentry.models import Group

        queryset = Group.objects.filter(project=project)
        if query:
            # TODO(dcramer): if we want to continue to support search on SQL
            # we should at least optimize this in Postgres so that it does
            # the query filter **after** the index filters, and restricts the
            # result set
            queryset = queryset.filter(
                Q(message__icontains=query) | Q(culprit__icontains=query))

        if status is not None:
            queryset = queryset.filter(status=status)

        if bookmarked_by:
            queryset = queryset.filter(
                bookmark_set__project=project,
                bookmark_set__user=bookmarked_by,
            )

        if assigned_to:
            queryset = queryset.filter(
                assignee_set__project=project,
                assignee_set__user=assigned_to,
            )

        if first_release:
            queryset = queryset.filter(
                first_release__project=project,
                first_release__version=first_release,
            )

        if tags:
            for k, v in tags.iteritems():
                queryset = queryset.filter(**dict(
                    grouptag__key=k,
                    grouptag__value=v,
                ))

        if date_filter == 'first_seen':
            if date_from and date_to:
                queryset = queryset.filter(
                    first_seen__gte=date_from,
                    first_seen__lte=date_to,
                )
            elif date_from:
                queryset = queryset.filter(first_seen__gte=date_from)
            elif date_to:
                queryset = queryset.filter(first_seen__lte=date_to)
        elif date_filter == 'last_seen':
            if date_from and date_to:
                queryset = queryset.filter(
                    last_seen__gte=date_from,
                    last_seen__lte=date_to,
                )
            elif date_from:
                queryset = queryset.filter(last_seen__gte=date_from)
            elif date_to:
                queryset = queryset.filter(last_seen__lte=date_to)

        engine = get_db_engine('default')
        if engine.startswith('sqlite'):
            score_clause = SQLITE_SORT_CLAUSES[sort_by]
        elif engine.startswith('mysql'):
            score_clause = MYSQL_SORT_CLAUSES[sort_by]
        elif engine.startswith('oracle'):
            score_clause = ORACLE_SORT_CLAUSES[sort_by]
        elif engine in MSSQL_ENGINES:
            score_clause = MSSQL_SORT_CLAUSES[sort_by]
        else:
            score_clause = SORT_CLAUSES[sort_by]

        if sort_by == 'tottime':
            queryset = queryset.filter(time_spent_count__gt=0)
        elif sort_by == 'avgtime':
            queryset = queryset.filter(time_spent_count__gt=0)

        queryset = queryset.extra(select={'sort_value': score_clause}, )

        # HACK: don't sort by the same column twice
        if sort_by == 'date':
            queryset = queryset.order_by('-sort_value')
        else:
            queryset = queryset.order_by('-sort_value', '-last_seen')

        paginator = Paginator(queryset, '-sort_value')
        return paginator.get_result(limit, cursor)
Example #6
    def query(self,
              project,
              tags=None,
              environment=None,
              sort_by='date',
              limit=100,
              cursor=None,
              count_hits=False,
              paginator_options=None,
              **parameters):

        from sentry.models import Group, GroupStatus, GroupSubscription, Release

        if paginator_options is None:
            paginator_options = {}

        if tags is None:
            tags = {}

        try:
            if tags.get('sentry:release') == 'latest':
                tags['sentry:release'] = get_latest_release(
                    project, environment)

            if parameters.get('first_release') == 'latest':
                parameters['first_release'] = get_latest_release(
                    project, environment)
        except Release.DoesNotExist:
            # no matches could possibly be found from this point on
            return Paginator(Group.objects.none()).get_result()

        group_queryset = QuerySetBuilder({
            'query':
            CallbackCondition(
                lambda queryset, query: queryset.filter(
                    Q(message__icontains=query) | Q(culprit__icontains=query),
                ) if query else queryset,
            ),
            'status':
            CallbackCondition(
                lambda queryset, status: queryset.filter(status=status),
            ),
            'bookmarked_by':
            CallbackCondition(
                lambda queryset, user: queryset.filter(
                    bookmark_set__project=project,
                    bookmark_set__user=user,
                ),
            ),
            'assigned_to':
            CallbackCondition(
                functools.partial(assigned_to_filter, project=project), ),
            'unassigned':
            CallbackCondition(
                lambda queryset, unassigned: queryset.filter(
                    assignee_set__isnull=unassigned, ),
            ),
            'subscribed_by':
            CallbackCondition(
                lambda queryset, user: queryset.filter(
                    id__in=GroupSubscription.objects.filter(
                        project=project,
                        user=user,
                        is_active=True,
                    ).values_list('group'), ),
            ),
            'active_at_from':
            ScalarCondition('active_at', 'gt'),
            'active_at_to':
            ScalarCondition('active_at', 'lt'),
        }).build(
            Group.objects.filter(project=project).exclude(status__in=[
                GroupStatus.PENDING_DELETION,
                GroupStatus.DELETION_IN_PROGRESS,
                GroupStatus.PENDING_MERGE,
            ]),
            parameters,
        )

        # filter out groups which are beyond the retention period
        retention = quotas.get_event_retention(
            organization=project.organization)
        if retention:
            retention_window_start = timezone.now() - timedelta(days=retention)
        else:
            retention_window_start = None
        # TODO: This could be optimized when building querysets to identify
        # criteria that are logically impossible (e.g. if the upper bound
        # for last seen is before the retention window starts, no results
        # exist.)
        if retention_window_start:
            group_queryset = group_queryset.filter(
                last_seen__gte=retention_window_start)

        # This is a punt because the SnubaSearchBackend (a subclass) shares so much that it
        # seemed better to handle all the shared initialization and then hand off to the
        # actual backend.
        return self._query(project, retention_window_start, group_queryset,
                           tags, environment, sort_by, limit, cursor,
                           count_hits, paginator_options, **parameters)
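
Example #6 (and several later examples) lean heavily on the QuerySetBuilder / CallbackCondition / ScalarCondition trio. The sketch below is a simplified re-implementation of that pattern for illustration only, not the real sentry.search.django helpers: each search parameter maps to a condition object that narrows the queryset only when the parameter is actually present.

from typing import Any, Callable, Dict, Mapping


class CallbackCondition:
    """Wraps an arbitrary (queryset, value) -> queryset callable."""

    def __init__(self, callback: Callable[[Any, Any], Any]) -> None:
        self.callback = callback

    def apply(self, queryset: Any, value: Any) -> Any:
        return self.callback(queryset, value)


class ScalarCondition:
    """Builds a single field lookup, e.g. ScalarCondition('active_at', 'gt') -> active_at__gt."""

    def __init__(self, field: str, operator: str) -> None:
        self.field = field
        self.operator = operator

    def apply(self, queryset: Any, value: Any) -> Any:
        return queryset.filter(**{f"{self.field}__{self.operator}": value})


class QuerySetBuilder:
    """Applies only the conditions whose parameter names appear in `parameters`."""

    def __init__(self, conditions: Dict[str, Any]) -> None:
        self.conditions = conditions

    def build(self, queryset: Any, parameters: Mapping[str, Any]) -> Any:
        for name, condition in self.conditions.items():
            if name in parameters:
                queryset = condition.apply(queryset, parameters[name])
        return queryset
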
Example #7
    def _query(self, project, retention_window_start, group_queryset, tags, environment,
               sort_by, limit, cursor, count_hits, paginator_options, **parameters):

        # TODO: Product decision: we currently search Group.message to handle
        # the `query` parameter, because that's what we've always done. We could
        # do that search against every event in Snuba instead, but results may
        # differ.

        now = timezone.now()
        end = parameters.get('date_to') or (now + ALLOWED_FUTURE_DELTA)
        # TODO: Presumably we want to search back to the project's full retention,
        #       which may be higher than 90 days in the past, but apparently
        #       `retention_window_start` can be None(?), so we need a fallback.
        start = max(
            filter(None, [
                retention_window_start,
                parameters.get('date_from'),
                now - timedelta(days=90)
            ])
        )
        assert start < end

        # TODO: It's possible `first_release` could be handled by Snuba.
        if environment is not None:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.extra(
                        where=[
                            '{} = {}'.format(
                                ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                                ds.get_sql_column(Release, 'id'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'organization'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'version'),
                            ),
                        ],
                        params=[project.organization_id, version],
                        tables=[Release._meta.db_table],
                    ),
                ),
            }).build(
                group_queryset.extra(
                    where=[
                        u'{} = {}'.format(
                            ds.get_sql_column(Group, 'id'),
                            ds.get_sql_column(GroupEnvironment, 'group_id'),
                        ),
                        u'{} = %s'.format(
                            ds.get_sql_column(GroupEnvironment, 'environment_id'),
                        ),
                    ],
                    params=[environment.id],
                    tables=[GroupEnvironment._meta.db_table],
                ),
                parameters,
            )
        else:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.filter(
                        first_release__organization_id=project.organization_id,
                        first_release__version=version,
                    ),
                ),
            }).build(
                group_queryset,
                parameters,
            )

        # maximum number of Group IDs to send down to Snuba,
        # if more Group ID candidates are found, a "bare" Snuba
        # search is performed and the result groups are then
        # post-filtered via queries to the Sentry DB
        max_pre_snuba_candidates = options.get('snuba.search.max-pre-snuba-candidates')

        # pre-filter query
        candidate_ids = None
        if max_pre_snuba_candidates and limit <= max_pre_snuba_candidates:
            candidate_ids = list(
                group_queryset.values_list('id', flat=True)[:max_pre_snuba_candidates + 1]
            )
            metrics.timing('snuba.search.num_candidates', len(candidate_ids))

            if not candidate_ids:
                # no matches could possibly be found from this point on
                metrics.incr('snuba.search.no_candidates')
                return Paginator(Group.objects.none()).get_result()
            elif len(candidate_ids) > max_pre_snuba_candidates:
                # If the pre-filter query didn't include anything to significantly
                # filter down the number of results (from 'first_release', 'query',
                # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
                # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
                # might have surpassed the `max_pre_snuba_candidates`. In this case,
                # we *don't* want to pass candidates down to Snuba, and instead we
                # want Snuba to do all the filtering/sorting it can and *then* apply
                # this queryset to the results from Snuba, which we call
                # post-filtering.
                metrics.incr('snuba.search.too_many_candidates')
                candidate_ids = None

        sort, extra_aggregations, score_fn = sort_strategies[sort_by]

        chunk_growth = options.get('snuba.search.chunk-growth-rate')
        max_chunk_size = options.get('snuba.search.max-chunk-size')
        chunk_limit = limit
        offset = 0
        num_chunks = 0

        paginator_results = Paginator(Group.objects.none()).get_result()
        result_groups = []
        result_group_ids = set()
        min_score = float('inf')
        max_score = -1

        max_time = options.get('snuba.search.max-total-chunk-time-seconds')
        time_start = time.time()

        # Do smaller searches in chunks until we have enough results
        # to answer the query (or hit the end of possible results). We do
        # this because a common case for search is to return 100 groups
        # sorted by `last_seen`, and we want to avoid returning all of
        # a project's groups and then post-sorting them all in Postgres
        # when typically the first N results will do.
        while (time.time() - time_start) < max_time:
            num_chunks += 1

            # grow the chunk size on each iteration to account for huge projects
            # and weird queries, up to a max size
            chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
            # but if we have candidate_ids always query for at least that many items
            chunk_limit = max(chunk_limit, len(candidate_ids) if candidate_ids else 0)

            # {group_id: group_score, ...}
            snuba_groups, more_results = snuba_search(
                project_id=project.id,
                environment_id=environment and environment.id,
                tags=tags,
                start=start,
                end=end,
                sort=sort,
                extra_aggregations=extra_aggregations,
                score_fn=score_fn,
                candidate_ids=candidate_ids,
                limit=chunk_limit,
                offset=offset,
                **parameters
            )
            metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))
            offset += len(snuba_groups)

            if not snuba_groups:
                break

            if candidate_ids:
                # pre-filtered candidates were passed down to Snuba,
                # so we're finished with filtering and these are the
                # only results
                result_groups = snuba_groups
            else:
                # pre-filtered candidates were *not* passed down to Snuba,
                # so we need to do post-filtering to verify Sentry DB predicates
                filtered_group_ids = group_queryset.filter(
                    id__in=[gid for gid, _ in snuba_groups]
                ).values_list('id', flat=True)

                group_to_score = dict(snuba_groups)
                for group_id in filtered_group_ids:
                    if group_id in result_group_ids:
                        # because we're doing multiple Snuba queries, which
                        # happen outside of a transaction, there is a small possibility
                        # of groups moving around in the sort scoring underneath us,
                        # so we at least want to protect against duplicates
                        continue

                    group_score = group_to_score[group_id]
                    result_group_ids.add(group_id)
                    result_groups.append((group_id, group_score))

                    # used for cursor logic
                    min_score = min(min_score, group_score)
                    max_score = max(max_score, group_score)

            # HACK: If a cursor is being used and there may be more results available
            # in Snuba, we need to detect whether the cursor's value will be
            # found in the result groups. If it isn't in the results yet we need to
            # continue querying before we hand off to the paginator to decide whether
            # enough results are found or not, otherwise the paginator will happily
            # return `limit` worth of results that don't take the cursor into account
            # at all, since it can't know there are more results available.
            # TODO: If chunked search works in practice we should probably extend the
            # paginator to throw something if the cursor value is never found, or do
            # something other than partially leak internal paginator logic up to here.
            # Or make separate Paginator implementation just for Snuba search?
            if cursor is not None \
                    and not candidate_ids \
                    and more_results:
                if cursor.is_prev and min_score < cursor.value:
                    continue
                elif not cursor.is_prev and max_score > cursor.value:
                    continue

            paginator_results = SequencePaginator(
                [(score, id) for (id, score) in result_groups],
                reverse=True,
                **paginator_options
            ).get_result(limit, cursor, count_hits=False)

            # break the query loop for one of three reasons:
            # * we started with Postgres candidates and so only do one Snuba query max
            # * the paginator is returning enough results to satisfy the query (>= the limit)
            # * there are no more groups in Snuba to post-filter
            if candidate_ids \
                    or len(paginator_results.results) >= limit \
                    or not more_results:
                break

        metrics.timing('snuba.search.num_chunks', num_chunks)

        groups = Group.objects.in_bulk(paginator_results.results)
        paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

        return paginator_results
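
The chunked loop in Example #7 grows its Snuba page size each round so that small queries stay cheap while pathological ones still terminate. A minimal sketch of just that growth schedule follows; the growth rate and cap are illustrative stand-ins for the 'snuba.search.chunk-growth-rate' and 'snuba.search.max-chunk-size' options.

from typing import Iterator


def chunk_sizes(limit: int, growth_rate: float = 1.5, max_chunk_size: int = 2000,
                num_chunks: int = 5) -> Iterator[int]:
    """Yield successive Snuba query limits, growing each round up to a cap."""
    chunk_limit = limit
    for _ in range(num_chunks):
        chunk_limit = min(int(chunk_limit * growth_rate), max_chunk_size)
        yield chunk_limit


# e.g. list(chunk_sizes(100)) == [150, 225, 337, 505, 757]
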
Example #8
    def query(self,
              project,
              tags=None,
              environment=None,
              sort_by='date',
              limit=100,
              cursor=None,
              count_hits=False,
              paginator_options=None,
              **parameters):
        from sentry.models import (Environment, Event, Group, GroupEnvironment,
                                   GroupStatus, GroupSubscription, Release)

        if paginator_options is None:
            paginator_options = {}

        if tags is None:
            tags = {}

        try:
            if tags.get('sentry:release') == 'latest':
                tags['sentry:release'] = get_latest_release(
                    project, environment)

            if parameters.get('first_release') == 'latest':
                parameters['first_release'] = get_latest_release(
                    project, environment)
        except Release.DoesNotExist:
            # no matches could possibly be found from this point on
            return Paginator(Group.objects.none()).get_result()

        group_queryset = QuerySetBuilder({
            'query':
            CallbackCondition(
                lambda queryset, query: queryset.filter(
                    Q(message__icontains=query) | Q(culprit__icontains=query),
                ) if query else queryset,
            ),
            'status':
            CallbackCondition(
                lambda queryset, status: queryset.filter(status=status),
            ),
            'bookmarked_by':
            CallbackCondition(
                lambda queryset, user: queryset.filter(
                    bookmark_set__project=project,
                    bookmark_set__user=user,
                ),
            ),
            'assigned_to':
            CallbackCondition(
                functools.partial(assigned_to_filter, project=project), ),
            'unassigned':
            CallbackCondition(
                lambda queryset, unassigned: queryset.filter(
                    assignee_set__isnull=unassigned, ),
            ),
            'subscribed_by':
            CallbackCondition(
                lambda queryset, user: queryset.filter(
                    id__in=GroupSubscription.objects.filter(
                        project=project,
                        user=user,
                        is_active=True,
                    ).values_list('group'), ),
            ),
            'active_at_from':
            ScalarCondition('active_at', 'gt'),
            'active_at_to':
            ScalarCondition('active_at', 'lt'),
        }).build(
            Group.objects.filter(project=project).exclude(status__in=[
                GroupStatus.PENDING_DELETION,
                GroupStatus.DELETION_IN_PROGRESS,
                GroupStatus.PENDING_MERGE,
            ]),
            parameters,
        )

        # filter out groups which are beyond the retention period
        retention = quotas.get_event_retention(
            organization=project.organization)
        if retention:
            retention_window_start = timezone.now() - timedelta(days=retention)
            # TODO: This could be optimized when building querysets to identify
            # criteria that are logically impossible (e.g. if the upper bound
            # for last seen is before the retention window starts, no results
            # exist.)
            group_queryset = group_queryset.filter(
                last_seen__gte=retention_window_start)
        else:
            retention_window_start = None

        if environment is not None:
            if 'environment' in tags:
                # TODO: This should probably just overwrite the existing tag,
                # rather than asserting on it, but...?
                assert Environment.objects.get(
                    projects=project,
                    name=tags.pop('environment'),
                ).id == environment.id

            event_queryset_builder = QuerySetBuilder({
                'date_from':
                ScalarCondition('date_added', 'gt'),
                'date_to':
                ScalarCondition('date_added', 'lt'),
            })
            if any(key in parameters
                   for key in event_queryset_builder.conditions.keys()):
                event_queryset = event_queryset_builder.build(
                    tagstore.get_event_tag_qs(
                        project.id,
                        environment.id,
                        'environment',
                        environment.name,
                    ),
                    parameters,
                )
                if retention_window_start is not None:
                    event_queryset = event_queryset.filter(
                        date_added__gte=retention_window_start)

                group_queryset = group_queryset.filter(
                    id__in=list(event_queryset.distinct().values_list(
                        'group_id', flat=True)[:1000]))

            group_queryset = QuerySetBuilder({
                'first_release':
                CallbackCondition(
                    lambda queryset, version: queryset.extra(
                        where=[
                            '{} = {}'.format(
                                get_sql_column(GroupEnvironment,
                                               'first_release_id'),
                                get_sql_column(Release, 'id'),
                            ),
                            '{} = %s'.format(
                                get_sql_column(Release, 'organization'), ),
                            '{} = %s'.format(
                                get_sql_column(Release, 'version'), ),
                        ],
                        params=[project.organization_id, version],
                        tables=[Release._meta.db_table],
                    ),
                ),
                'times_seen':
                CallbackCondition(
                    # This condition represents the exact number of times that
                    # an issue has been seen in an environment. Since an issue
                    # can't be seen in an environment more times than the issue
                    # was seen overall, we can safely exclude any groups that
                    # don't have at least that many events.
                    lambda queryset, times_seen: queryset.exclude(
                        times_seen__lt=times_seen, ),
                ),
                'times_seen_lower':
                CallbackCondition(
                    # This condition represents the lower threshold for the
                    # number of times an issue has been seen in an environment.
                    # Since an issue can't be seen in an environment more times
                    # than the issue was seen overall, we can safely exclude
                    # any groups that haven't met that threshold.
                    lambda queryset, times_seen: queryset.exclude(
                        times_seen__lt=times_seen, ),
                ),
                # The following conditions make a few assertions that are
                # correct in an abstract sense but may not accurately reflect
                # the existing implementation (see GH-5289). These assumptions
                # are that 1. The first seen time for a Group is the minimum
                # value of the first seen time for all of its GroupEnvironment
                # relations; 2. The last seen time for a Group is the maximum
                # value of the last seen time for all of its GroupEnvironment
                # relations; 3. The first seen time is always less than or
                # equal to the last seen time. (A plain-Python restatement of
                # this reasoning appears after this example.)
                'age_from':
                CallbackCondition(
                    # This condition represents the lower threshold for "first
                    # seen" time for an environment. Due to assertions #1 and
                    # #3, we can exclude any groups where the "last seen" time
                    # is prior to this timestamp.
                    lambda queryset, first_seen: queryset.exclude(
                        last_seen__lt=first_seen, ),
                ),
                'age_to':
                CallbackCondition(
                    # This condition represents the upper threshold for "first
                    # seen" time for an environment. Due to assertions #1, we
                    # can exclude any values where the group first seen is
                    # greater than that threshold.
                    lambda queryset, first_seen: queryset.exclude(
                        first_seen__gt=first_seen, ),
                ),
                'last_seen_from':
                CallbackCondition(
                    # This condition represents the lower threshold for "last
                    # seen" time for an environment. Due to assertion #2, we
                    # can exclude any values where the group last seen value is
                    # less than that threshold.
                    lambda queryset, last_seen: queryset.exclude(
                        last_seen__lt=last_seen, ),
                ),
                'last_seen_to':
                CallbackCondition(
                    # This condition represents the upper threshold for "last
                    # seen" time for an environment. Due to assertions #2 and
                    # #3, we can exclude any values where the group first seen
                    # value is greater than that threshold.
                    lambda queryset, last_seen: queryset.exclude(
                        first_seen__gt=last_seen, ),
                ),
            }).build(
                group_queryset.extra(
                    where=[
                        '{} = {}'.format(
                            get_sql_column(Group, 'id'),
                            get_sql_column(GroupEnvironment, 'group_id'),
                        ),
                        '{} = %s'.format(
                            get_sql_column(GroupEnvironment,
                                           'environment_id'), ),
                    ],
                    params=[environment.id],
                    tables=[GroupEnvironment._meta.db_table],
                ),
                parameters,
            )

            get_sort_expression, sort_value_to_cursor_value = environment_sort_strategies[
                sort_by]

            group_tag_value_queryset = tagstore.get_group_tag_value_qs(
                project.id,
                set(group_queryset.values_list('id',
                                               flat=True)),  # TODO: Limit?,
                environment.id,
                'environment',
                environment.name,
            )

            if retention_window_start is not None:
                group_tag_value_queryset = group_tag_value_queryset.filter(
                    last_seen__gte=retention_window_start)

            candidates = dict(
                QuerySetBuilder({
                    'age_from':
                    ScalarCondition('first_seen', 'gt'),
                    'age_to':
                    ScalarCondition('first_seen', 'lt'),
                    'last_seen_from':
                    ScalarCondition('last_seen', 'gt'),
                    'last_seen_to':
                    ScalarCondition('last_seen', 'lt'),
                    'times_seen':
                    CallbackCondition(
                        lambda queryset, times_seen: queryset.filter(
                            times_seen=times_seen),
                    ),
                    'times_seen_lower':
                    ScalarCondition('times_seen', 'gt'),
                    'times_seen_upper':
                    ScalarCondition('times_seen', 'lt'),
                }).build(
                    group_tag_value_queryset,
                    parameters,
                ).extra(select={
                    'sort_value':
                    get_sort_expression(group_tag_value_queryset.model),
                }, ).values_list('group_id', 'sort_value'))

            if tags:
                # TODO: `get_group_ids_for_search_filter` should be able to
                # utilize the retention window start parameter for additional
                # optimizations.
                matches = tagstore.get_group_ids_for_search_filter(
                    project.id,
                    environment.id,
                    tags,
                    candidates.keys(),
                    limit=len(candidates),
                )
                for key in set(candidates) - set(matches or []):
                    del candidates[key]

            result = SequencePaginator([(sort_value_to_cursor_value(score), id)
                                        for (id, score) in candidates.items()],
                                       reverse=True,
                                       **paginator_options).get_result(
                                           limit,
                                           cursor,
                                           count_hits=count_hits)

            groups = Group.objects.in_bulk(result.results)
            result.results = [groups[k] for k in result.results if k in groups]

            return result
        else:
            event_queryset_builder = QuerySetBuilder({
                'date_from':
                ScalarCondition('datetime', 'gt'),
                'date_to':
                ScalarCondition('datetime', 'lt'),
            })
            if any(key in parameters
                   for key in event_queryset_builder.conditions.keys()):
                group_queryset = group_queryset.filter(id__in=list(
                    event_queryset_builder.build(
                        Event.objects.filter(project_id=project.id),
                        parameters,
                    ).distinct().values_list('group_id', flat=True)[:1000], ))

            group_queryset = QuerySetBuilder({
                'first_release':
                CallbackCondition(
                    lambda queryset, version: queryset.filter(
                        first_release__organization_id=project.organization_id,
                        first_release__version=version,
                    ),
                ),
                'age_from':
                ScalarCondition('first_seen', 'gt'),
                'age_to':
                ScalarCondition('first_seen', 'lt'),
                'last_seen_from':
                ScalarCondition('last_seen', 'gt'),
                'last_seen_to':
                ScalarCondition('last_seen', 'lt'),
                'times_seen':
                CallbackCondition(
                    lambda queryset, times_seen: queryset.filter(
                        times_seen=times_seen),
                ),
                'times_seen_lower':
                ScalarCondition('times_seen', 'gt'),
                'times_seen_upper':
                ScalarCondition('times_seen', 'lt'),
            }).build(
                group_queryset,
                parameters,
            ).extra(select={
                'sort_value': get_sort_clause(sort_by),
            }, )

            if tags:
                matches = tagstore.get_group_ids_for_search_filter(
                    project.id, None, tags)
                if matches:
                    group_queryset = group_queryset.filter(id__in=matches)
                else:
                    group_queryset = group_queryset.none()

            paginator_cls, sort_clause = sort_strategies[sort_by]
            group_queryset = group_queryset.order_by(sort_clause)
            paginator = paginator_cls(group_queryset, sort_clause,
                                      **paginator_options)
            return paginator.get_result(limit, cursor, count_hits=count_hits)
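
The age_from / age_to / last_seen_from / last_seen_to conditions in Example #8 rely on the three ordering assertions spelled out in the comments above them. The function below is a hedged restatement of that reasoning in plain Python (the name may_match_env_window is an illustration, not Sentry code): group-level first_seen / last_seen bounds are enough to rule a group out before touching per-environment data.

from datetime import datetime
from typing import Optional


def may_match_env_window(group_first_seen: datetime, group_last_seen: datetime,
                         age_from: Optional[datetime] = None,
                         age_to: Optional[datetime] = None) -> bool:
    """Return False only when the group-level bounds already rule the group out."""
    # env_last_seen <= group_last_seen (assertion 2) and env_first_seen <= env_last_seen
    # (assertion 3), so an environment first seen after age_from is impossible when the
    # whole group was last seen before age_from.
    if age_from is not None and group_last_seen < age_from:
        return False
    # env_first_seen >= group_first_seen (assertion 1), so an environment first seen
    # before age_to is impossible when even the group-level first_seen is after it.
    if age_to is not None and group_first_seen > age_to:
        return False
    return True
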
Example #9
    def query(self, project, query=None, status=None, tags=None,
              bookmarked_by=None, assigned_to=None, sort_by='date',
              date_filter='last_seen', date_from=None, date_to=None,
              cursor=None, limit=100):
        from sentry.models import Group

        queryset = Group.objects.filter(project=project)
        if query:
            # TODO(dcramer): if we want to continue to support search on SQL
            # we should at least optimize this in Postgres so that it does
            # the query filter **after** the index filters, and restricts the
            # result set
            queryset = queryset.filter(
                Q(message__icontains=query) |
                Q(culprit__icontains=query)
            )

        if status is not None:
            queryset = queryset.filter(status=status)

        if bookmarked_by:
            queryset = queryset.filter(
                bookmark_set__project=project,
                bookmark_set__user=bookmarked_by,
            )

        if assigned_to:
            queryset = queryset.filter(
                assignee_set__project=project,
                assignee_set__user=assigned_to,
            )

        if tags:
            for k, v in tags.iteritems():
                queryset = queryset.filter(**dict(
                    grouptag__key=k,
                    grouptag__value=v,
                ))

        if date_filter == 'first_seen':
            if date_from and date_to:
                queryset = queryset.filter(
                    first_seen__gte=date_from,
                    first_seen__lte=date_to,
                )
            elif date_from:
                queryset = queryset.filter(first_seen__gte=date_from)
            elif date_to:
                queryset = queryset.filter(first_seen__lte=date_to)
        elif date_filter == 'last_seen':
            if date_from and date_to:
                queryset = queryset.filter(
                    last_seen__gte=date_from,
                    last_seen__lte=date_to,
                )
            elif date_from:
                queryset = queryset.filter(last_seen__gte=date_from)
            elif date_to:
                queryset = queryset.filter(last_seen__lte=date_to)

        engine = get_db_engine('default')
        if engine.startswith('sqlite'):
            score_clause = SQLITE_SORT_CLAUSES[sort_by]
        elif engine.startswith('mysql'):
            score_clause = MYSQL_SORT_CLAUSES[sort_by]
        elif engine.startswith('oracle'):
            score_clause = ORACLE_SORT_CLAUSES[sort_by]
        elif engine in MSSQL_ENGINES:
            score_clause = MSSQL_SORT_CLAUSES[sort_by]
        else:
            score_clause = SORT_CLAUSES[sort_by]

        if sort_by == 'tottime':
            queryset = queryset.filter(time_spent_count__gt=0)
        elif sort_by == 'avgtime':
            queryset = queryset.filter(time_spent_count__gt=0)

        queryset = queryset.extra(
            select={'sort_value': score_clause},
        )

        # HACK: don't sort by the same column twice
        if sort_by == 'date':
            queryset = queryset.order_by('-sort_value')
        else:
            queryset = queryset.order_by('-sort_value', '-last_seen')

        paginator = Paginator(queryset, '-sort_value')
        return paginator.get_result(limit, cursor)
Example #10
    def _query(self, project, retention_window_start, group_queryset, tags, environment,
               sort_by, limit, cursor, count_hits, paginator_options, **parameters):

        # TODO: Product decision: we currently search Group.message to handle
        # the `query` parameter, because that's what we've always done. We could
        # do that search against every event in Snuba instead, but results may
        # differ.

        now = timezone.now()
        end = parameters.get('date_to') or (now + ALLOWED_FUTURE_DELTA)
        # TODO: Presumably we want to search back to the project's full retention,
        #       which may be higher than 90 days in the past, but apparently
        #       `retention_window_start` can be None(?), so we need a fallback.
        start = max(
            filter(None, [
                retention_window_start,
                parameters.get('date_from'),
                now - timedelta(days=90)
            ])
        )
        assert start < end

        # TODO: It's possible `first_release` could be handled by Snuba.
        if environment is not None:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.extra(
                        where=[
                            '{} = {}'.format(
                                ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                                ds.get_sql_column(Release, 'id'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'organization'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'version'),
                            ),
                        ],
                        params=[project.organization_id, version],
                        tables=[Release._meta.db_table],
                    ),
                ),
            }).build(
                group_queryset.extra(
                    where=[
                        u'{} = {}'.format(
                            ds.get_sql_column(Group, 'id'),
                            ds.get_sql_column(GroupEnvironment, 'group_id'),
                        ),
                        u'{} = %s'.format(
                            ds.get_sql_column(GroupEnvironment, 'environment_id'),
                        ),
                    ],
                    params=[environment.id],
                    tables=[GroupEnvironment._meta.db_table],
                ),
                parameters,
            )
        else:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.filter(
                        first_release__organization_id=project.organization_id,
                        first_release__version=version,
                    ),
                ),
            }).build(
                group_queryset,
                parameters,
            )

        # maximum number of GroupHashes to send down to Snuba,
        # if more GroupHash candidates are found, a "bare" Snuba
        # search is performed and the result groups are then
        # post-filtered via queries to the Sentry DB
        max_pre_snuba_candidates = options.get('snuba.search.max-pre-snuba-candidates')

        # pre-filter query
        candidate_hashes = None
        if max_pre_snuba_candidates and limit <= max_pre_snuba_candidates:
            candidate_hashes = dict(
                GroupHash.objects.filter(
                    group__in=group_queryset
                ).values_list(
                    'hash', 'group_id'
                )[:max_pre_snuba_candidates + 1]
            )
            metrics.timing('snuba.search.num_candidates', len(candidate_hashes))

            if not candidate_hashes:
                # no matches could possibly be found from this point on
                metrics.incr('snuba.search.no_candidates')
                return Paginator(Group.objects.none()).get_result()
            elif len(candidate_hashes) > max_pre_snuba_candidates:
                # If the pre-filter query didn't include anything to significantly
                # filter down the number of results (from 'first_release', 'query',
                # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
                # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
                # might have surpassed the `max_pre_snuba_candidates`. In this case,
                # we *don't* want to pass candidates down to Snuba, and instead we
                # want Snuba to do all the filtering/sorting it can and *then* apply
                # this queryset to the results from Snuba, which we call
                # post-filtering.
                metrics.incr('snuba.search.too_many_candidates')
                candidate_hashes = None

        sort, extra_aggregations, score_fn = sort_strategies[sort_by]

        chunk_growth = options.get('snuba.search.chunk-growth-rate')
        max_chunk_size = options.get('snuba.search.max-chunk-size')
        chunk_limit = limit
        offset = 0
        num_chunks = 0

        paginator_results = Paginator(Group.objects.none()).get_result()
        result_groups = []
        result_group_ids = set()
        min_score = float('inf')
        max_score = -1

        max_time = options.get('snuba.search.max-total-chunk-time-seconds')
        time_start = time.time()

        # Do smaller searches in chunks until we have enough results
        # to answer the query (or hit the end of possible results). We do
        # this because a common case for search is to return 100 groups
        # sorted by `last_seen`, and we want to avoid returning all of
        # a project's hashes and then post-sorting them all in Postgres
        # when typically the first N results will do.
        while (time.time() - time_start) < max_time:
            num_chunks += 1

            # grow the chunk size on each iteration to account for huge projects
            # and weird queries, up to a max size
            chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
            # but if we have candidate_hashes always query for at least that many items
            chunk_limit = max(chunk_limit, len(candidate_hashes) if candidate_hashes else 0)

            # {group_id: group_score, ...}
            snuba_groups, more_results = snuba_search(
                project_id=project.id,
                environment_id=environment and environment.id,
                tags=tags,
                start=start,
                end=end,
                sort=sort,
                extra_aggregations=extra_aggregations,
                score_fn=score_fn,
                candidate_hashes=candidate_hashes,
                limit=chunk_limit,
                offset=offset,
                **parameters
            )
            metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))
            offset += len(snuba_groups)

            if not snuba_groups:
                break

            if candidate_hashes:
                # pre-filtered candidates were passed down to Snuba,
                # so we're finished with filtering and these are the
                # only results
                result_groups = snuba_groups
            else:
                # pre-filtered candidates were *not* passed down to Snuba,
                # so we need to do post-filtering to verify Sentry DB predicates
                filtered_group_ids = group_queryset.filter(
                    id__in=[gid for gid, _ in snuba_groups]
                ).values_list('id', flat=True)

                group_to_score = dict(snuba_groups)
                for group_id in filtered_group_ids:
                    if group_id in result_group_ids:
                        # because we're doing multiple Snuba queries, which
                        # happen outside of a transaction, there is a small possibility
                        # of groups moving around in the sort scoring underneath us,
                        # so we at least want to protect against duplicates
                        continue

                    group_score = group_to_score[group_id]
                    result_group_ids.add(group_id)
                    result_groups.append((group_id, group_score))

                    # used for cursor logic
                    min_score = min(min_score, group_score)
                    max_score = max(max_score, group_score)

            # HACK: If a cursor is being used and there may be more results available
            # in Snuba, we need to detect whether the cursor's value will be
            # found in the result groups. If it isn't in the results yet we need to
            # continue querying before we hand off to the paginator to decide whether
            # enough results are found or not, otherwise the paginator will happily
            # return `limit` worth of results that don't take the cursor into account
            # at all, since it can't know there are more results available.
            # TODO: If chunked search works in practice we should probably extend the
            # paginator to throw something if the cursor value is never found, or do
            # something other than partially leak internal paginator logic up to here.
            # Or make separate Paginator implementation just for Snuba search?
            if cursor is not None \
                    and not candidate_hashes \
                    and more_results:
                if cursor.is_prev and min_score < cursor.value:
                    continue
                elif not cursor.is_prev and max_score > cursor.value:
                    continue

            paginator_results = SequencePaginator(
                [(score, id) for (id, score) in result_groups],
                reverse=True,
                **paginator_options
            ).get_result(limit, cursor, count_hits=False)

            # break the query loop for one of three reasons:
            # * we started with Postgres candidates and so only do one Snuba query max
            # * the paginator is returning enough results to satisfy the query (>= the limit)
            # * there are no more groups in Snuba to post-filter
            if candidate_hashes \
                    or len(paginator_results.results) >= limit \
                    or not more_results:
                break

        metrics.timing('snuba.search.num_chunks', num_chunks)

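        # Hydrate the paginated ids into Group objects, keeping the paginator's
        # ordering and dropping any ids that no longer resolve to a group.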
        groups = Group.objects.in_bulk(paginator_results.results)
        paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

        return paginator_results
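
The loop above combines two ideas: grow the Snuba page size geometrically so large projects converge in a few round trips, and keep querying while a cursor may not yet be satisfiable from the scores collected so far. The sketch below isolates that control flow under the assumption of a fake data source; `fetch_chunk`, `chunked_search`, and the constants are illustrative stand-ins for the Snuba query and option lookups, not part of the code above.

import time

# Illustrative stand-ins for the option lookups and the Snuba query used above.
CHUNK_GROWTH = 1.5
MAX_CHUNK_SIZE = 10000
MAX_TOTAL_SECONDS = 10.0
TOTAL_ROWS = 1000


def fetch_chunk(offset, limit):
    # Pretend backend: (group_id, score) pairs plus a "more rows" flag.
    count = max(0, min(limit, TOTAL_ROWS - offset))
    chunk = [(offset + i, float(TOTAL_ROWS - offset - i)) for i in range(count)]
    return chunk, offset + count < TOTAL_ROWS


def chunked_search(limit, cursor_value=None, cursor_is_prev=False):
    chunk_limit, offset = limit, 0
    results, seen = [], set()
    min_score, max_score = float('inf'), float('-inf')
    started = time.time()

    while time.time() - started < MAX_TOTAL_SECONDS:
        # Grow the chunk size on each iteration, capped at a maximum.
        chunk_limit = min(int(chunk_limit * CHUNK_GROWTH), MAX_CHUNK_SIZE)
        chunk, more = fetch_chunk(offset, chunk_limit)
        offset += len(chunk)
        if not chunk:
            break

        for group_id, score in chunk:
            if group_id in seen:
                continue  # guard against rows shifting between chunks
            seen.add(group_id)
            results.append((group_id, score))
            min_score, max_score = min(min_score, score), max(max_score, score)

        # Mirror the continuation check from the loop above: if a cursor is in
        # play and more rows may exist, keep querying until the cursor's value
        # can show up among the collected scores, otherwise pagination would
        # ignore it.
        if cursor_value is not None and more:
            if cursor_is_prev and min_score < cursor_value:
                continue
            if not cursor_is_prev and max_score > cursor_value:
                continue

        if len(results) >= limit or not more:
            break

    return sorted(results, key=lambda pair: pair[1], reverse=True)[:limit]

Against the stand-in data, chunked_search(limit=100) finishes after a chunk or two; the real backend differs in that every chunk is a Snuba query and the final ordering and cursor handling are delegated to SequencePaginator.
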
Example #11
0
    def query(self,
              projects,
              tags=None,
              environments=None,
              sort_by='date',
              limit=100,
              cursor=None,
              count_hits=False,
              paginator_options=None,
              search_filters=None,
              use_new_filters=False,
              **parameters):
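        """Shared search entry point: validate arguments, resolve 'latest'
        release placeholders, build the Django-side group queryset, and hand
        off to the backend-specific _query implementation.
        """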

        from sentry.models import Group, GroupStatus, GroupSubscription, Release

        search_filters = search_filters if search_filters is not None else []

        # ensure projects are from same org
        if len({p.organization_id for p in projects}) != 1:
            raise RuntimeError('Cross organization search not supported')

        if paginator_options is None:
            paginator_options = {}

        if tags is None:
            tags = {}

        try:
            if tags.get('sentry:release') == 'latest':
                tags['sentry:release'] = get_latest_release(
                    projects, environments)

            if parameters.get('first_release') == 'latest':
                parameters['first_release'] = get_latest_release(
                    projects, environments)
        except Release.DoesNotExist:
            # no matches could possibly be found from this point on
            return Paginator(Group.objects.none()).get_result()

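        # Base queryset: groups in the requested projects, excluding groups that
        # are mid-deletion or pending a merge.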
        group_queryset = Group.objects.filter(project__in=projects).exclude(
            status__in=[
                GroupStatus.PENDING_DELETION,
                GroupStatus.DELETION_IN_PROGRESS,
                GroupStatus.PENDING_MERGE,
            ])

        if use_new_filters:
            query_set_builder_class = SearchFilterQuerySetBuilder
            query_set_builder_params = search_filters
        else:
            query_set_builder_class = NewQuerySetBuilder
            query_set_builder_params = parameters

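        # Each entry below maps a search parameter name to a condition object
        # that knows how to narrow the queryset (usually via a Q object).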
        group_queryset = query_set_builder_class({
            'message': QCallbackCondition(
                lambda query: Q(
                    Q(message__icontains=query) | Q(culprit__icontains=query)),
                skip_if_falsey=True,
            ),
            # TODO: Remove this once we've stopped using old params
            'query': QCallbackCondition(
                lambda query: Q(
                    Q(message__icontains=query) | Q(culprit__icontains=query)),
                skip_if_falsey=True,
            ),
            'status': QCallbackCondition(lambda status: Q(status=status)),
            'bookmarked_by': QCallbackCondition(
                lambda user: Q(
                    bookmark_set__project__in=projects,
                    bookmark_set__user=user,
                )),
            'assigned_to': QCallbackCondition(
                functools.partial(assigned_to_filter, projects=projects)),
            'unassigned': QCallbackCondition(
                functools.partial(unassigned_filter, projects=projects)),
            'subscribed_by': QCallbackCondition(
                lambda user: Q(id__in=GroupSubscription.objects.filter(
                    project__in=projects,
                    user=user,
                    is_active=True,
                ).values_list('group'))),
            'active_at': SearchFilterScalarCondition('active_at'),
            # TODO: These are legacy params. Once we've moved entirely to
            # SearchFilter they can be removed, since the `'active_at'`
            # condition will handle both.
            'active_at_from': ScalarCondition('active_at', 'gt'),
            'active_at_to': ScalarCondition('active_at', 'lt'),
        }).build(group_queryset, query_set_builder_params)

        # filter out groups which are beyond the retention period
        retention = quotas.get_event_retention(
            organization=projects[0].organization)
        if retention:
            retention_window_start = timezone.now() - timedelta(days=retention)
        else:
            retention_window_start = None
        # TODO: This could be optimized when building querysets to identify
        # criteria that are logically impossible (e.g. if the upper bound
        # for last seen is before the retention window starts, no results
        # exist.)
        if retention_window_start:
            group_queryset = group_queryset.filter(
                last_seen__gte=retention_window_start)

        # This is a punt: the SnubaSearchBackend (a subclass) shares so much of
        # this setup that it seemed better to handle the shared initialization
        # here and then hand off to the actual backend implementation.
        return self._query(projects, retention_window_start, group_queryset,
                           tags, environments, sort_by, limit, cursor,
                           count_hits, paginator_options, search_filters,
                           use_new_filters, **parameters)
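
Example #11 leans on a small family of condition classes (QCallbackCondition, ScalarCondition, SearchFilterScalarCondition) and a builder whose build() applies whichever conditions have a matching parameter. Those classes are not shown in these examples; the sketch below is a minimal, hypothetical version of the pattern, assuming the simplest possible interface, and is not the actual Sentry implementation.

class QCallbackCondition:
    # Wraps a callable that turns a parameter value into a Django Q object.
    def __init__(self, callback, skip_if_falsey=False):
        self.callback = callback
        self.skip_if_falsey = skip_if_falsey

    def apply(self, queryset, value):
        if self.skip_if_falsey and not value:
            return queryset
        return queryset.filter(self.callback(value))


class ScalarCondition:
    # Applies a single range lookup (e.g. 'gt' or 'lt') against one field.
    def __init__(self, field, operator):
        self.field = field
        self.operator = operator

    def apply(self, queryset, value):
        return queryset.filter(**{f'{self.field}__{self.operator}': value})


class QuerySetBuilder:
    # Maps parameter names to conditions and applies the ones that are present.
    def __init__(self, conditions):
        self.conditions = conditions

    def build(self, queryset, parameters):
        for name, condition in self.conditions.items():
            if name in parameters:
                queryset = condition.apply(queryset, parameters[name])
        return queryset


# Usage, mirroring the mapping in the example above (group_queryset and Q come
# from the surrounding Django code):
#   QuerySetBuilder({
#       'status': QCallbackCondition(lambda status: Q(status=status)),
#       'active_at_from': ScalarCondition('active_at', 'gt'),
#   }).build(group_queryset, {'status': 0})

A SearchFilter-aware variant of the same idea would differ mainly in where the value and operator come from (a SearchFilter object rather than a plain keyword dict), which appears to be the distinction the use_new_filters branch is switching on.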