Example #1
File: store.py Project: rlugojr/sentry
def save_event(cache_key=None, data=None, start_time=None, event_id=None, **kwargs):
    """
    Saves an event to the database.
    """
    from sentry.event_manager import EventManager

    if cache_key:
        data = default_cache.get(cache_key)

    if event_id is None and data is not None:
        event_id = data['event_id']

    if data is None:
        metrics.incr('events.failed', tags={'reason': 'cache', 'stage': 'post'})
        return

    project = data.pop('project')

    delete_raw_event(project, event_id)

    Raven.tags_context({
        'project': project,
    })

    try:
        manager = EventManager(data)
        manager.save(project)
    finally:
        if cache_key:
            default_cache.delete(cache_key)
        if start_time:
            metrics.timing('events.time-to-process', time() - start_time,
                           instance=data['platform'])
Example #2
File: redis.py Project: faulkner/sentry
    def process_pending(self):
        client = self.cluster.get_routing_client()
        lock_key = self._make_lock_key(self.pending_key)
        # prevent a stampede due to celerybeat + periodic task
        if not client.set(lock_key, '1', nx=True, ex=60):
            return

        try:
            keycount = 0
            with self.cluster.all() as conn:
                results = conn.zrange(self.pending_key, 0, -1)

            with self.cluster.all() as conn:
                for host_id, keys in six.iteritems(results.value):
                    if not keys:
                        continue
                    keycount += len(keys)
                    for key in keys:
                        process_incr.apply_async(kwargs={
                            'key': key,
                        })
                    conn.target([host_id]).zrem(self.pending_key, *keys)
            metrics.timing('buffer.pending-size', keycount)
        finally:
            client.delete(lock_key)
Example #3
File: retries.py Project: Kayle009/sentry
    def __call__(self, function):
        start = self.clock.time()
        try:
            for i in itertools.count(1):
                try:
                    return function()
                except self.exceptions as error:
                    delay = self.delay(i)
                    now = self.clock.time()
                    if (now + delay) > (start + self.timeout):
                        raise RetryException(
                            'Could not successfully execute %r within %.3f seconds (%s attempts.)' %
                            (function, now - start, i),
                            error,
                        )
                    else:
                        logger.debug(
                            'Failed to execute %r due to %r on attempt #%s, retrying in %s seconds...',
                            function,
                            error,
                            i,
                            delay,
                        )
                        self.clock.sleep(delay)
        finally:
            if self.metric_instance:
                metrics.timing(
                    'timedretrypolicy.duration',
                    self.clock.time() - start,
                    instance=self.metric_instance,
                    tags=self.metric_tags,
                )
Example #4
File: file.py Project: alexandrul/sentry
    def putfile(self, fileobj, blob_size=DEFAULT_BLOB_SIZE, commit=True):
        """
        Save a fileobj into a number of chunks.

        Returns a list of `FileBlobIndex` items.

        >>> indexes = file.putfile(fileobj)
        """
        results = []
        offset = 0
        checksum = sha1(b'')

        while True:
            contents = fileobj.read(blob_size)
            if not contents:
                break
            checksum.update(contents)

            blob_fileobj = ContentFile(contents)
            blob = FileBlob.from_file(blob_fileobj)

            results.append(FileBlobIndex.objects.create(
                file=self,
                blob=blob,
                offset=offset,
            ))
            offset += blob.size
        self.size = offset
        self.checksum = checksum.hexdigest()
        metrics.timing('filestore.file-size', offset)
        if commit:
            self.save()
        return results
Example #5
File: gcs.py Project: Kayle009/sentry
def try_repeated(func):
    """
    Runs a function a few times ignoring errors we see from GCS
    due to what appears to be network issues.  This is a temporary workaround
    until we can find the root cause.
    """
    if hasattr(func, '__name__'):
        func_name = func.__name__
    elif hasattr(func, 'func'):
        # Partials
        func_name = getattr(func.func, '__name__', '__unknown__')
    else:
        func_name = '__unknown__'

    metrics_key = 'filestore.gcs.retry'
    metrics_tags = {'function': func_name}
    idx = 0
    while True:
        try:
            result = func()
            metrics_tags.update({'success': '1'})
            metrics.timing(metrics_key, idx, tags=metrics_tags)
            return result
        except (DataCorruption, TransportError, RefreshError, RequestException, OpenSSLError) as e:
            if idx >= GCS_RETRIES:
                metrics_tags.update({'success': '0', 'exception_class': e.__class__.__name__})
                metrics.timing(metrics_key, idx, tags=metrics_tags)
                raise
        idx += 1
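A caller of the helper above is not shown on this page; assuming `storage`, `blob`, and `fileobj` already exist in the caller's scope, a minimal usage sketch would wrap the storage call in functools.partial, which is why try_repeated falls back to func.func.__name__ for its metrics tag:

import functools

def upload_with_retries(storage, blob, fileobj):
    # A partial exposes the wrapped callable on `.func`, so try_repeated()
    # can still tag the metric with a readable function name ("save").
    return try_repeated(functools.partial(storage.save, blob.path, fileobj))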
Example #6
    def putfile(self, fileobj, blob_size=DEFAULT_BLOB_SIZE):
        """
        Save a fileobj into a number of chunks.

        Returns a list of `FileBlobIndex` items.

        >>> indexes = file.putfile(fileobj)
        """
        results = []
        offset = 0
        while True:
            contents = fileobj.read(blob_size)
            if not contents:
                break

            blob_fileobj = ContentFile(contents)
            blob = FileBlob.from_file(blob_fileobj)

            results.append(
                FileBlobIndex.objects.create(
                    file=self,
                    blob=blob,
                    offset=offset,
                )
            )
            offset += blob.size

        metrics.timing('filestore.file-size', offset)
        return results
Example #7
def index_event_tags(organization_id, project_id, event_id, tags,
                     group_id, environment_id, date_added=None, **kwargs):
    from sentry import tagstore

    with configure_scope() as scope:
        scope.set_tag("project", project_id)

    create_event_tags_kwargs = {}
    if date_added is not None:
        create_event_tags_kwargs['date_added'] = date_added

    metrics.timing(
        'tagstore.tags_per_event',
        len(tags),
        tags={
            'organization_id': organization_id,
        }
    )

    tagstore.create_event_tags(
        project_id=project_id,
        group_id=group_id,
        environment_id=environment_id,
        event_id=event_id,
        tags=tags,
        **create_event_tags_kwargs
    )
Example #8
File: redis.py Project: daevaorn/sentry
    def process_pending(self):
        client = self.cluster.get_routing_client()
        lock_key = self._make_lock_key(self.pending_key)
        # prevent a stampede due to celerybeat + periodic task
        if not client.set(lock_key, '1', nx=True, ex=60):
            return

        try:
            for host_id in self.cluster.hosts.iterkeys():
                conn = self.cluster.get_local_client(host_id)
                keys = conn.zrange(self.pending_key, 0, -1)
                if not keys:
                    continue
                keycount = 0
                for key in keys:
                    keycount += 1
                    process_incr.apply_async(kwargs={
                        'key': key,
                    })
                pipe = conn.pipeline()
                pipe.zrem(self.pending_key, *keys)
                pipe.execute()
                metrics.timing('buffer.pending-size', keycount)
        finally:
            client.delete(lock_key)
Example #9
def index_event_tags(organization_id, project_id, event_id, tags,
                     group_id, environment_id, date_added=None, **kwargs):
    from sentry import tagstore

    Raven.tags_context({
        'project': project_id,
    })

    create_event_tags_kwargs = {}
    if date_added is not None:
        create_event_tags_kwargs['date_added'] = date_added

    metrics.timing(
        'tagstore.tags_per_event',
        len(tags),
        tags={
            'organization_id': organization_id,
        }
    )

    tagstore.create_event_tags(
        project_id=project_id,
        group_id=group_id,
        environment_id=environment_id,
        event_id=event_id,
        tags=tags,
        **create_event_tags_kwargs
    )
Example #10
    def normalize(self):
        with metrics.timer('events.store.normalize.duration'):
            self._normalize_impl()

        metrics.timing(
            'events.store.normalize.errors',
            len(self._data.get("errors") or ()),
        )
Example #11
File: base.py Project: Kayle009/sentry
    def set(self, key, attachments, timeout=None):
        key = self.make_key(key)
        for index, attachment in enumerate(attachments):
            compressed = zlib.compress(attachment.data)
            self.inner.set(u'{}:{}'.format(key, index), compressed, timeout, raw=True)

            metrics_tags = {'type': attachment.type}
            metrics.incr('attachments.received', tags=metrics_tags, skip_internal=False)
            metrics.timing('attachments.blob-size.raw', len(attachment.data), tags=metrics_tags)
            metrics.timing('attachments.blob-size.compressed', len(compressed), tags=metrics_tags)

        meta = [attachment.meta() for attachment in attachments]
        self.inner.set(key, meta, timeout, raw=False)
Example #12
    def _record_time(self, request, status_code):
        if not hasattr(request, "_view_path"):
            return

        metrics.incr(
            "view.response", instance=request._view_path, tags={"method": request.method, "status_code": status_code}
        )

        if not hasattr(request, "_start_time"):
            return

        ms = int((time.time() - request._start_time) * 1000)
        metrics.timing("view.duration", ms, instance=request._view_path, tags={"method": request.method})
Example #13
def _capture_stats(event, is_new):
    # TODO(dcramer): limit platforms to... something?
    group = event.group
    platform = group.platform
    if not platform:
        return
    platform = platform.split('-', 1)[0].split('_', 1)[0]

    if is_new:
        metrics.incr('events.unique')

    metrics.incr('events.processed')
    metrics.incr('events.processed.{platform}'.format(platform=platform))
    metrics.timing('events.size.data', event.size)
Example #14
def _capture_stats(event, is_new):
    group = event.group
    platform = group.platform or group.project.platform
    if not platform:
        return
    platform = PLATFORM_ROOTS.get(platform, platform)
    if platform not in PLATFORM_LIST:
        return

    if is_new:
        metrics.incr("events.unique", 1)

    metrics.incr("events.processed", 1)
    metrics.incr("events.processed.{platform}".format(platform=platform), 1)
    metrics.timing("events.size.data", len(unicode(event.data)))
Example #15
File: api.py Project: yaoqi/sentry
    def process(self, request, project, key, auth, helper, data, attachments=None, **kwargs):
        metrics.incr('events.total', skip_internal=False)

        if not data:
            track_outcome(project.organization_id, project.id, key.id, Outcome.INVALID, "no_data")
            raise APIError('No JSON data was found')

        remote_addr = request.META['REMOTE_ADDR']

        event_manager = EventManager(
            data,
            project=project,
            key=key,
            auth=auth,
            client_ip=remote_addr,
            user_agent=helper.context.agent,
            version=auth.version,
            content_encoding=request.META.get('HTTP_CONTENT_ENCODING', ''),
        )
        del data

        self.pre_normalize(event_manager, helper)
        event_manager.normalize()

        data = event_manager.get_data()
        dict_data = dict(data)
        data_size = len(json.dumps(dict_data))

        if data_size > 10000000:
            metrics.timing('events.size.rejected', data_size)
            track_outcome(
                project.organization_id,
                project.id,
                key.id,
                Outcome.INVALID,
                'too_large',
                event_id=dict_data.get('event_id')
            )
            raise APIForbidden("Event size exceeded 10MB after normalization.")

        metrics.timing(
            'events.size.data.post_storeendpoint',
            data_size,
            tags={'project_id': project.id}
        )

        return process_event(event_manager, project,
                             key, remote_addr, helper, attachments)
Example #16
File: redis.py Project: alexandrul/sentry
    def process_pending(self, partition=None):
        if partition is None and self.pending_partitions > 1:
            # If we're using partitions, this one task fans out into
            # N subtasks instead.
            for i in range(self.pending_partitions):
                process_pending.apply_async(kwargs={'partition': i})
            # Explicitly also run over the unpartitioned buffer as well
            # to ease in transition. In practice, this should just be
            # super fast and is fine to do redundantly.

        pending_key = self._make_pending_key(partition)
        client = self.cluster.get_routing_client()
        lock_key = self._make_lock_key(pending_key)
        # prevent a stampede due to celerybeat + periodic task
        if not client.set(lock_key, '1', nx=True, ex=60):
            return

        pending_buffer = PendingBuffer(self.incr_batch_size)

        try:
            keycount = 0
            with self.cluster.all() as conn:
                results = conn.zrange(pending_key, 0, -1)

            with self.cluster.all() as conn:
                for host_id, keys in six.iteritems(results.value):
                    if not keys:
                        continue
                    keycount += len(keys)
                    for key in keys:
                        pending_buffer.append(key)
                        if pending_buffer.full():
                            process_incr.apply_async(
                                kwargs={
                                    'batch_keys': pending_buffer.flush(),
                                }
                            )
                    conn.target([host_id]).zrem(pending_key, *keys)

            # queue up remainder of pending keys
            if not pending_buffer.empty():
                process_incr.apply_async(kwargs={
                    'batch_keys': pending_buffer.flush(),
                })

            metrics.timing('buffer.pending-size', keycount)
        finally:
            client.delete(lock_key)
Example #17
def _capture_stats(event, is_new):
    # TODO(dcramer): limit platforms to... something?
    group = event.group
    platform = group.platform
    if not platform:
        return
    platform = platform.split('-', 1)[0].split('_', 1)[0]
    tags = {
        'platform': platform,
    }

    if is_new:
        metrics.incr('events.unique', tags=tags, skip_internal=False)

    metrics.incr('events.processed', tags=tags, skip_internal=False)
    metrics.incr(u'events.processed.{platform}'.format(platform=platform), skip_internal=False)
    metrics.timing('events.size.data', event.size, tags=tags)
Example #18
File: store.py Project: alshopov/sentry
def save_event(cache_key=None, data=None, start_time=None, event_id=None, **kwargs):
    """
    Saves an event to the database.
    """
    from sentry.event_manager import HashDiscarded, EventManager
    from sentry import tsdb

    if cache_key:
        data = default_cache.get(cache_key)

    if event_id is None and data is not None:
        event_id = data['event_id']

    if data is None:
        metrics.incr('events.failed', tags={'reason': 'cache', 'stage': 'post'})
        return

    project = data.pop('project')

    delete_raw_event(project, event_id, allow_hint_clear=True)

    Raven.tags_context({
        'project': project,
    })

    try:
        manager = EventManager(data)
        manager.save(project)
    except HashDiscarded as exc:
        # TODO(jess): remove this before it goes out to a wider audience
        info_logger.info(
            'discarded.hash', extra={
                'project_id': project,
                'description': exc.message,
            }
        )
        tsdb.incr(tsdb.models.project_total_received_discarded, project, timestamp=start_time)
    finally:
        if cache_key:
            default_cache.delete(cache_key)
        if start_time:
            metrics.timing(
                'events.time-to-process',
                time() - start_time,
                instance=data['platform'])
Example #19
File: file.py Project: haojiang1/sentry
    def from_file(cls, fileobj):
        """
        Retrieve a FileBlob instance for the given file.

        If not already present, this will cause it to be stored.

        >>> blob = FileBlob.from_file(fileobj)
        """
        size = 0
        checksum = sha1('')
        for chunk in fileobj:
            size += len(chunk)
            checksum.update(chunk)
        checksum = checksum.hexdigest()

        lock_key = 'fileblob:upload:{}'.format(checksum)
        # TODO(dcramer): the database here is safe, but if this lock expires
        # and duplicate files are uploaded then we need to prune one
        with Lock(lock_key, timeout=600):
            # test for presence
            try:
                existing = FileBlob.objects.get(checksum=checksum)
            except FileBlob.DoesNotExist:
                pass
            else:
                return existing

            blob = cls(
                size=size,
                checksum=checksum,
                storage=settings.SENTRY_FILESTORE,
                storage_options=settings.SENTRY_FILESTORE_OPTIONS,
            )

            blob.path = cls.generate_unique_path(blob.timestamp)

            storage = blob.get_storage()
            storage.save(blob.path, fileobj)
            blob.save()

        metrics.timing('filestore.blob-size', blob.size)
        return blob
Example #20
File: file.py Project: alexandrul/sentry
    def from_file(cls, fileobj):
        """
        Retrieve a list of FileBlobIndex instances for the given file.

        If not already present, this will cause it to be stored.

        >>> blobs = FileBlob.from_file(fileobj)
        """
        size = 0

        checksum = sha1(b'')
        for chunk in fileobj:
            size += len(chunk)
            checksum.update(chunk)
        checksum = checksum.hexdigest()

        # TODO(dcramer): the database here is safe, but if this lock expires
        # and duplicate files are uploaded then we need to prune one
        lock = locks.get(u'fileblob:upload:{}'.format(checksum), duration=60 * 10)
        with TimedRetryPolicy(60)(lock.acquire):
            # test for presence
            try:
                existing = FileBlob.objects.get(checksum=checksum)
            except FileBlob.DoesNotExist:
                pass
            else:
                return existing

            blob = cls(
                size=size,
                checksum=checksum,
            )

            blob.path = cls.generate_unique_path(blob.timestamp)

            storage = get_storage()
            storage.save(blob.path, fileobj)
            blob.save()

        metrics.timing('filestore.blob-size', size)
        return blob
Example #21
    def _record_time(self, request, status_code):
        if not hasattr(request, '_view_path'):
            return

        metrics.incr(
            'view.response',
            instance=request._view_path,
            tags={
                'method': request.method,
                'status_code': status_code,
            }
        )

        if not hasattr(request, '_start_time'):
            return

        ms = int((time.time() - request._start_time) * 1000)
        metrics.timing(
            'view.duration', ms, instance=request._view_path, tags={
                'method': request.method,
            }
        )
Example #22
def save_event(cache_key=None, data=None, start_time=None, **kwargs):
    """
    Saves an event to the database.
    """
    from sentry.event_manager import EventManager

    if cache_key:
        data = default_cache.get(cache_key)

    if data is None:
        return

    project = data.pop('project')

    try:
        manager = EventManager(data)
        manager.save(project)
    finally:
        if cache_key:
            default_cache.delete(cache_key)
        if start_time:
            metrics.timing('events.time-to-process', time() - start_time)
Example #23
def get_task_kwargs_for_message(value):
    """
    Decodes a message body, returning a dictionary of keyword arguments that
    can be applied to a post-processing task, or ``None`` if no task should be
    dispatched.
    """

    metrics.timing('evenstream.events.size.data', len(value))
    payload = json.loads(value)

    try:
        version = payload[0]
    except Exception:
        raise InvalidPayload('Received event payload with unexpected structure')

    try:
        handler = version_handlers[int(version)]
    except (ValueError, KeyError):
        raise InvalidVersion(
            'Received event payload with unexpected version identifier: {}'.format(version))

    return handler(*payload[1:])
Example #24
    def get(self, request, public_key, minified):
        """Returns a js file that can be integrated into a website"""
        start_time = time.time()
        key = None

        try:
            key = ProjectKey.objects.get_from_cache(
                public_key=public_key
            )
        except ProjectKey.DoesNotExist:
            pass

        context, sdk_version, sdk_url = self._get_context(key)

        instance = "default"
        if not sdk_url:
            instance = "noop"
            tmpl = 'sentry/js-sdk-loader-noop.js.tmpl'
        elif minified is not None:
            instance = "minified"
            tmpl = 'sentry/js-sdk-loader.min.js.tmpl'
        else:
            tmpl = 'sentry/js-sdk-loader.js.tmpl'

        metrics.incr('js-sdk-loader.rendered', instance=instance, skip_internal=False)

        response = render_to_response(tmpl, context, content_type="text/javascript")

        response['Access-Control-Allow-Origin'] = '*'
        response['Cache-Control'] = CACHE_CONTROL
        if sdk_version and key:
            response['Surrogate-Key'] = 'project/%s sdk/%s sdk-loader' % (
                key.project_id, sdk_version)

        ms = int((time.time() - start_time) * 1000)
        metrics.timing('js-sdk-loader.duration', ms, instance=instance)

        return response
Example #25
def scrub_data(project, event):
    for config in get_all_pii_configs(project):
        metrics.timing(
            "datascrubbing.config.num_applications", len(config.get("applications") or ())
        )
        total_rules = 0
        for selector, rules in (config.get("applications") or {}).items():
            metrics.timing("datascrubbing.config.selectors.size", len(selector))
            metrics.timing("datascrubbing.config.rules_per_selector.size", len(rules))
            total_rules += len(rules)

        metrics.timing("datascrubbing.config.rules.size", total_rules)

        event = sentry_relay.pii_strip_event(config, event)

    return event
Example #26
    def from_file(cls, fileobj, logger=nooplogger):
        """
        Retrieve a single FileBlob instance for the given file.
        """
        logger.debug("FileBlob.from_file.start")

        size, checksum = _get_size_and_checksum(fileobj)

        # TODO(dcramer): the database here is safe, but if this lock expires
        # and duplicate files are uploaded then we need to prune one
        with _locked_blob(checksum, logger=logger) as existing:
            if existing is not None:
                return existing

            blob = cls(size=size, checksum=checksum)
            blob.path = cls.generate_unique_path()
            storage = get_storage()
            storage.save(blob.path, fileobj)
            blob.save()

        metrics.timing("filestore.blob-size", size)
        logger.debug("FileBlob.from_file.end")
        return blob
Example #27
def _capture_stats(event, is_new):
    # TODO(dcramer): limit platforms to... something?
    platform = event.group.platform if event.group else event.platform
    if not platform:
        return
    platform = platform.split("-", 1)[0].split("_", 1)[0]
    tags = {"platform": platform}

    if is_new:
        metrics.incr("events.unique", tags=tags, skip_internal=False)

    metrics.incr("events.processed", tags=tags, skip_internal=False)
    metrics.incr(u"events.processed.{platform}".format(platform=platform),
                 skip_internal=False)
    metrics.timing("events.size.data", event.size, tags=tags)

    # This is an experiment to understand whether we have, in production,
    # mismatches between event and group before we permanently rely on events
    # for the platform. Before adding some more verbose logging on this
    # case, using a stat will give us a sense of the magnitude of the problem.
    if event.group:
        if event.group.platform != event.platform:
            metrics.incr("events.platform_mismatch", tags=tags)
Example #28
File: file.py Project: liang0/sentry-1
    def _upload_and_pend_chunk(fileobj, size, checksum, lock):
        logger.debug(
            "FileBlob.from_files._upload_and_pend_chunk.start",
            extra={
                "checksum": checksum,
                "size": size
            },
        )
        blob = cls(size=size, checksum=checksum)
        blob.path = cls.generate_unique_path()
        storage = get_storage()
        storage.save(blob.path, fileobj)
        blobs_to_save.append((blob, lock))
        metrics.timing("filestore.blob-size",
                       size,
                       tags={"function": "from_files"})
        logger.debug(
            "FileBlob.from_files._upload_and_pend_chunk.end",
            extra={
                "checksum": checksum,
                "path": blob.path
            },
        )
Example #29
    def process_pending(self):
        client = self.cluster.get_routing_client()
        lock_key = self._make_lock_key(self.pending_key)
        # prevent a stampede due to celerybeat + periodic task
        if not client.set(lock_key, '1', nx=True, ex=60):
            return

        pending_buffer = PendingBuffer(self.incr_batch_size)

        try:
            keycount = 0
            with self.cluster.all() as conn:
                results = conn.zrange(self.pending_key, 0, -1)

            with self.cluster.all() as conn:
                for host_id, keys in six.iteritems(results.value):
                    if not keys:
                        continue
                    keycount += len(keys)
                    for key in keys:
                        pending_buffer.append(key)
                        if pending_buffer.full():
                            process_incr.apply_async(
                                kwargs={
                                    'batch_keys': pending_buffer.flush(),
                                })
                    conn.target([host_id]).zrem(self.pending_key, *keys)

            # queue up remainder of pending keys
            if not pending_buffer.empty():
                process_incr.apply_async(kwargs={
                    'batch_keys': pending_buffer.flush(),
                })

            metrics.timing('buffer.pending-size', keycount)
        finally:
            client.delete(lock_key)
Example #30
    def get(self, request, public_key, minified):
        """Returns a js file that can be integrated into a website"""
        start_time = time.time()
        key = None

        try:
            key = ProjectKey.objects.get_from_cache(public_key=public_key)
        except ProjectKey.DoesNotExist:
            pass
        else:
            key.project = Project.objects.get_from_cache(id=key.project_id)

        context, sdk_version, sdk_url = self._get_context(key)

        instance = "default"
        if not sdk_url:
            instance = "noop"
            tmpl = "sentry/js-sdk-loader-noop.js.tmpl"
        elif minified is not None:
            instance = "minified"
            tmpl = "sentry/js-sdk-loader.min.js.tmpl"
        else:
            tmpl = "sentry/js-sdk-loader.js.tmpl"

        metrics.incr("js-sdk-loader.rendered", instance=instance, skip_internal=False)

        response = render_to_response(tmpl, context, content_type="text/javascript")

        response["Access-Control-Allow-Origin"] = "*"
        response["Cache-Control"] = CACHE_CONTROL
        if sdk_version and key:
            response["Surrogate-Key"] = f"project/{key.project_id} sdk/{sdk_version} sdk-loader"

        ms = int((time.time() - start_time) * 1000)
        metrics.timing("js-sdk-loader.duration", ms, instance=instance)

        return response
Example #31
    def _record_time(self, request, status_code):
        if not hasattr(request, '_view_path'):
            return

        metrics.incr(
            'view.response',
            instance=request._view_path,
            tags={
                'method': request.method,
                'status_code': status_code,
            },
            skip_internal=False,
        )

        if not hasattr(request, '_start_time'):
            return

        ms = int((time.time() - request._start_time) * 1000)
        metrics.timing('view.duration',
                       ms,
                       instance=request._view_path,
                       tags={
                           'method': request.method,
                       })
Example #32
File: file.py Project: alexandrul/sentry
    def assemble_from_file_blob_ids(self, file_blob_ids, checksum, commit=True):
        """
        This creates a file from file blobs and returns a temp file with the
        contents.
        """
        tf = tempfile.NamedTemporaryFile()
        with transaction.atomic():
            file_blobs = FileBlob.objects.filter(id__in=file_blob_ids).all()
            # Make sure the blobs are sorted with the order provided
            file_blobs = sorted(file_blobs, key=lambda blob: file_blob_ids.index(blob.id))

            new_checksum = sha1(b'')
            offset = 0
            for blob in file_blobs:
                FileBlobIndex.objects.create(
                    file=self,
                    blob=blob,
                    offset=offset,
                )
                for chunk in blob.getfile().chunks():
                    new_checksum.update(chunk)
                    tf.write(chunk)
                offset += blob.size

            self.size = offset
            self.checksum = new_checksum.hexdigest()

            if checksum != self.checksum:
                raise AssembleChecksumMismatch('Checksum mismatch')

        metrics.timing('filestore.file-size', offset)
        if commit:
            self.save()
        tf.flush()
        tf.seek(0)
        return tf
Example #33
    def __call__(self, function):
        start = self.clock.time()
        try:
            for i in itertools.count(1):
                try:
                    return function()
                except self.exceptions as error:
                    if self.log_original_error:
                        logger.info(error)
                    delay = self.delay(i)
                    now = self.clock.time()
                    if (now + delay) > (start + self.timeout):
                        raise RetryException(
                            "Could not successfully execute %r within %.3f seconds (%s attempts.)"
                            % (function, now - start, i),
                            error,
                        )
                    else:
                        logger.debug(
                            "Failed to execute %r due to %r on attempt #%s, retrying in %s seconds...",
                            function,
                            error,
                            i,
                            delay,
                        )
                        self.clock.sleep(delay)
        finally:
            if self.metric_instance:
                from sentry.utils import metrics

                metrics.timing(
                    "timedretrypolicy.duration",
                    self.clock.time() - start,
                    instance=self.metric_instance,
                    tags=self.metric_tags,
                )
Example #34
File: file.py Project: yakatz/sentry
    def assemble_from_file_blob_ids(self, file_blob_ids, checksum, commit=True):
        """
        This creates a file from file blobs and returns a temp file with the
        contents.
        """
        tf = tempfile.NamedTemporaryFile()
        with transaction.atomic():
            file_blobs = FileBlob.objects.filter(id__in=file_blob_ids).all()
            # Make sure the blobs are sorted with the order provided
            file_blobs = sorted(file_blobs, key=lambda blob: file_blob_ids.index(blob.id))

            new_checksum = sha1(b'')
            offset = 0
            for blob in file_blobs:
                FileBlobIndex.objects.create(
                    file=self,
                    blob=blob,
                    offset=offset,
                )
                for chunk in blob.getfile().chunks():
                    new_checksum.update(chunk)
                    tf.write(chunk)
                offset += blob.size

            self.size = offset
            self.checksum = new_checksum.hexdigest()

            if checksum != self.checksum:
                raise AssembleChecksumMismatch('Checksum mismatch')

        metrics.timing('filestore.file-size', offset)
        if commit:
            self.save()
        tf.flush()
        tf.seek(0)
        return tf
Example #35
File: protocol.py Project: yzdann/sentry
def get_task_kwargs_for_message(value):
    """
    Decodes a message body, returning a dictionary of keyword arguments that
    can be applied to a post-processing task, or ``None`` if no task should be
    dispatched.
    """

    metrics.timing("eventstream.events.size.data", len(value))
    payload = json.loads(value)

    try:
        version = payload[0]
    except Exception:
        raise InvalidPayload(
            "Received event payload with unexpected structure")

    try:
        handler = version_handlers[int(version)]
    except (ValueError, KeyError):
        raise InvalidVersion(
            "Received event payload with unexpected version identifier: {}".
            format(version))

    return handler(*payload[1:])
Example #36
File: file.py Project: yzdann/sentry
    def putfile(self,
                fileobj,
                blob_size=DEFAULT_BLOB_SIZE,
                commit=True,
                logger=nooplogger):
        """
        Save a fileobj into a number of chunks.

        Returns a list of `FileBlobIndex` items.

        >>> indexes = file.putfile(fileobj)
        """
        results = []
        offset = 0
        checksum = sha1(b"")

        while True:
            contents = fileobj.read(blob_size)
            if not contents:
                break
            checksum.update(contents)

            blob_fileobj = ContentFile(contents)
            blob = FileBlob.from_file(blob_fileobj, logger=logger)

            results.append(
                FileBlobIndex.objects.create(file=self,
                                             blob=blob,
                                             offset=offset))
            offset += blob.size
        self.size = offset
        self.checksum = checksum.hexdigest()
        metrics.timing("filestore.file-size", offset)
        if commit:
            self.save()
        return results
Example #37
def scrub_data(project_config, event, in_processing=False, old_event=None):
    for config in get_all_pii_configs(project_config):
        metrics.timing(
            "datascrubbing.config.num_applications", len(config.get("applications") or ())
        )
        total_rules = 0
        for selector, rules in six.iteritems(config.get("applications") or {}):
            metrics.timing("datascrubbing.config.selectors.size", len(selector))
            metrics.timing("datascrubbing.config.rules_per_selector.size", len(rules))
            total_rules += len(rules)

        metrics.timing("datascrubbing.config.rules.size", total_rules)

        if in_processing:
            assert old_event is not None
            config = _narrow_pii_config_for_processing(config, old_event, event)

        event = sentry_relay.pii_strip_event(config, event)

    return event
Example #38
File: cleanup.py Project: getsentry/sentry
def cleanup(days, project, concurrency, silent, model, router, timed):
    """Delete a portion of trailing data based on creation date.

    All data that is older than `--days` will be deleted.  The default for
    this is 30 days.  In the default setting all projects will be truncated
    but if you have a specific project you want to limit this to this can be
    done with the `--project` flag which accepts a project ID or a string
    with the form `org/project` where both are slugs.
    """
    if concurrency < 1:
        click.echo('Error: Minimum concurrency is 1', err=True)
        raise click.Abort()

    os.environ['_SENTRY_CLEANUP'] = '1'

    # Make sure we fork off multiprocessing pool
    # before we import or configure the app
    from multiprocessing import Process, JoinableQueue as Queue

    pool = []
    task_queue = Queue(1000)
    for _ in xrange(concurrency):
        p = Process(target=multiprocess_worker, args=(task_queue,))
        p.daemon = True
        p.start()
        pool.append(p)

    from sentry.runner import configure
    configure()

    from django.db import router as db_router
    from sentry.app import nodestore
    from sentry.db.deletion import BulkDeleteQuery
    from sentry import models

    if timed:
        import time
        from sentry.utils import metrics
        start_time = time.time()

    # list of models which this query is restricted to
    model_list = {m.lower() for m in model}

    def is_filtered(model):
        if router is not None and db_router.db_for_write(model) != router:
            return True
        if not model_list:
            return False
        return model.__name__.lower() not in model_list

    # Deletions that use `BulkDeleteQuery` (and don't need to worry about child relations)
    # (model, datetime_field, order_by)
    BULK_QUERY_DELETES = [
        (models.EventMapping, 'date_added', '-date_added'),
        (models.EventAttachment, 'date_added', None),
        (models.UserReport, 'date_added', None),
        (models.GroupEmailThread, 'date', None),
        (models.GroupRuleStatus, 'date_added', None),
    ] + EXTRA_BULK_QUERY_DELETES

    # Deletions that use the `deletions` code path (which handles their child relations)
    # (model, datetime_field, order_by)
    DELETES = (
        (models.Event, 'datetime', 'datetime'),
        (models.Group, 'last_seen', 'last_seen'),
    )

    if not silent:
        click.echo('Removing expired values for LostPasswordHash')

    if is_filtered(models.LostPasswordHash):
        if not silent:
            click.echo('>> Skipping LostPasswordHash')
    else:
        models.LostPasswordHash.objects.filter(
            date_added__lte=timezone.now() - timedelta(hours=48)
        ).delete()

    if is_filtered(models.OrganizationMember) and not silent:
        click.echo('>> Skipping OrganizationMember')
    else:
        click.echo('Removing expired values for OrganizationMember')
        expired_threshold = timezone.now() - timedelta(days=days)
        models.OrganizationMember.delete_expired(expired_threshold)

    for model in [models.ApiGrant, models.ApiToken]:
        if not silent:
            click.echo(u'Removing expired values for {}'.format(model.__name__))

        if is_filtered(model):
            if not silent:
                click.echo(u'>> Skipping {}'.format(model.__name__))
        else:
            model.objects.filter(
                expires_at__lt=(timezone.now() - timedelta(days=days)),
            ).delete()

    project_id = None
    if project:
        click.echo(
            "Bulk NodeStore deletion not available for project selection", err=True)
        project_id = get_project(project)
        if project_id is None:
            click.echo('Error: Project not found', err=True)
            raise click.Abort()
    else:
        if not silent:
            click.echo("Removing old NodeStore values")

        cutoff = timezone.now() - timedelta(days=days)
        try:
            nodestore.cleanup(cutoff)
        except NotImplementedError:
            click.echo(
                "NodeStore backend does not support cleanup operation", err=True)

    for bqd in BULK_QUERY_DELETES:
        if len(bqd) == 4:
            model, dtfield, order_by, chunk_size = bqd
        else:
            chunk_size = 10000
            model, dtfield, order_by = bqd

        if not silent:
            click.echo(
                u"Removing {model} for days={days} project={project}".format(
                    model=model.__name__,
                    days=days,
                    project=project or '*',
                )
            )
        if is_filtered(model):
            if not silent:
                click.echo('>> Skipping %s' % model.__name__)
        else:
            BulkDeleteQuery(
                model=model,
                dtfield=dtfield,
                days=days,
                project_id=project_id,
                order_by=order_by,
            ).execute(chunk_size=chunk_size)

    for model, dtfield, order_by in DELETES:
        if not silent:
            click.echo(
                u"Removing {model} for days={days} project={project}".format(
                    model=model.__name__,
                    days=days,
                    project=project or '*',
                )
            )

        if is_filtered(model):
            if not silent:
                click.echo('>> Skipping %s' % model.__name__)
        else:
            imp = '.'.join((model.__module__, model.__name__))

            q = BulkDeleteQuery(
                model=model,
                dtfield=dtfield,
                days=days,
                project_id=project_id,
                order_by=order_by,
            )

            for chunk in q.iterator(chunk_size=100):
                task_queue.put((imp, chunk))

            task_queue.join()

    # Clean up FileBlob instances which are no longer used and aren't super
    # recent (as there could be a race between blob creation and reference)
    if not silent:
        click.echo("Cleaning up unused FileBlob references")
    if is_filtered(models.FileBlob):
        if not silent:
            click.echo('>> Skipping FileBlob')
    else:
        cleanup_unused_files(silent)

    # Shut down our pool
    for _ in pool:
        task_queue.put(_STOP_WORKER)

    # And wait for it to drain
    for p in pool:
        p.join()

    if timed:
        duration = int(time.time() - start_time)
        metrics.timing('cleanup.duration', duration, instance=router)
        click.echo("Clean up took %s second(s)." % duration)
Example #39
def callback_timing(
    context: Context,
    method_name: str,
    callargs: Mapping[str, Any],
    backend_names: Sequence[str],
    results: Sequence[TimedFuture],
    metric_name: str,
    result_comparator: Optional[Callable[[str, str, str, Any, Any], Mapping[str, str]]] = None,
    sample_rate: Optional[float] = None,
) -> None:
    """
    Collects timing stats on results returned to the callback method of a `ServiceDelegator`. Either
    partial this and pass it directly as the `callback_func` or
    :param metric_name: Prefix to use when writing these timing metrics to Datadog
    :param method_name: method_name passed to callback
    :param backend_names: backend_names passed to callback
    :param results: results passed to callback
    :param result_comparator: An optional comparator to compare the primary result to each secondary
    result. Should return a dict represents the result of the comparison. This will be merged into
    tags to be stored in the metrics backend.
    :return:
    """
    if not len(backend_names) > 1:
        return
    primary_backend_name = backend_names[0]
    primary_future = results[0]
    primary_status = get_future_status(primary_future)
    primary_timing = primary_future.get_timing()

    # If either endpoint of the timing data is not set, just ignore this call.
    # This really shouldn't happen on the primary backend, but playing it safe
    # here out of an abundance of caution.
    if not all(primary_timing):
        logger.warning(
            "Received timing with unexpected endpoint: %r, primary_backend_name: %r, future_status: %r",
            primary_timing,
            primary_backend_name,
            primary_status,
        )
        return

    primary_duration_ms = (primary_timing[1] - primary_timing[0]) * 1000

    metric_kwargs = {}
    if sample_rate is not None:
        metric_kwargs["sample_rate"] = sample_rate

    metrics.timing(
        f"{metric_name}.timing_ms",
        primary_duration_ms,
        tags={
            "method": method_name,
            "backend": primary_backend_name,
            "status": primary_status,
            "primary": "true",
        },
        **metric_kwargs,
    )

    for i, secondary_backend_name in enumerate(backend_names[1:], 1):
        secondary_future = results[i]
        secondary_timing = secondary_future.get_timing()
        secondary_status = get_future_status(secondary_future)

        tags = {
            "method": method_name,
            "primary_backend": primary_backend_name,
            "primary_status": primary_status,
            "secondary_backend": secondary_backend_name,
            "secondary_status": secondary_status,
        }

        if result_comparator:
            comparator_result = result_comparator(
                method_name,
                primary_status,
                secondary_status,
                primary_future.result(),
                secondary_future.result(),
            )
            tags.update(comparator_result)

        # If either endpoint of the timing data is not set, this means
        # something weird happened (more than likely a cancellation.)
        if not all(secondary_timing):
            metrics.incr(
                f"{metric_name}.timing_invalid",
                tags={**tags, "reason": get_invalid_timing_reason(secondary_timing)},
            )
        else:
            secondary_duration_ms = (secondary_timing[1] - secondary_timing[0]) * 1000
            metrics.timing(
                f"{metric_name}.timing_ms",
                secondary_duration_ms,
                tags={
                    "method": method_name,
                    "backend": secondary_backend_name,
                    "status": secondary_status,
                    "primary": "false",
                },
                **metric_kwargs,
            )
            metrics.timing(
                f"{metric_name}.timing_delta_ms",
                secondary_duration_ms - primary_duration_ms,
                tags=tags,
                **metric_kwargs,
            )
            metrics.timing(
                f"{metric_name}.timing_relative_delta",
                secondary_duration_ms / primary_duration_ms,
                tags=tags,
                **metric_kwargs,
            )
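The docstring above suggests partialing this function and passing it to a ServiceDelegator as its callback. A minimal sketch under that assumption (the metric name and sample rate are placeholders, and the delegator wiring is not shown on this page):

from functools import partial

# The delegator later supplies context, method_name, callargs, backend_names
# and results when it invokes the callback.
callback_func = partial(callback_timing, metric_name="tagstore", sample_rate=1.0)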
Example #40
def _do_save_event(cache_key=None,
                   data=None,
                   start_time=None,
                   event_id=None,
                   project_id=None,
                   **kwargs):
    """
    Saves an event to the database.
    """
    from sentry.event_manager import HashDiscarded, EventManager, track_outcome
    from sentry import quotas
    from sentry.models import ProjectKey

    if cache_key and data is None:
        data = default_cache.get(cache_key)

    if data is not None:
        data = CanonicalKeyDict(data)

    if event_id is None and data is not None:
        event_id = data['event_id']

    # only when we come from reprocessing we get a project_id sent into
    # the task.
    if project_id is None:
        project_id = data.pop('project')

    key_id = None if data is None else data.get('key_id')
    timestamp = to_datetime(start_time) if start_time is not None else None

    delete_raw_event(project_id, event_id, allow_hint_clear=True)

    # This covers two cases: where data is None because we did not manage
    # to fetch it from the default cache or the empty dictionary was
    # stored in the default cache.  The former happens if the event
    # expired while being on the queue, the second happens on reprocessing
    # if the raw event was deleted concurrently while we held on to
    # it.  This causes the node store to delete the data and we end up
    # fetching an empty dict.  We could in theory not invoke `save_event`
    # in those cases but it's important that we always clean up the
    # reprocessing reports correctly or they will screw up the UI.  So
    # to future proof this correctly we just handle this case here.
    if not data:
        metrics.incr('events.failed',
                     tags={
                         'reason': 'cache',
                         'stage': 'post'
                     },
                     skip_internal=False)
        return

    with configure_scope() as scope:
        scope.set_tag("project", project_id)

    event = None
    try:
        manager = EventManager(data)
        event = manager.save(project_id, assume_normalized=True)

        # Always load attachments from the cache so we can later prune them.
        # Only save them if the event-attachments feature is active, though.
        if features.has('organizations:event-attachments',
                        event.project.organization,
                        actor=None):
            attachments = attachment_cache.get(cache_key) or []
            for attachment in attachments:
                save_attachment(event, attachment)

        # This is where we can finally say that we have accepted the event.
        track_outcome(event.project.organization_id, event.project.id, key_id,
                      'accepted', None, timestamp)

    except HashDiscarded:
        project = Project.objects.get_from_cache(id=project_id)
        reason = FilterStatKeys.DISCARDED_HASH
        project_key = None
        try:
            if key_id is not None:
                project_key = ProjectKey.objects.get_from_cache(id=key_id)
        except ProjectKey.DoesNotExist:
            pass

        quotas.refund(project, key=project_key, timestamp=start_time)
        track_outcome(project.organization_id, project_id, key_id, 'filtered',
                      reason, timestamp)

    finally:
        if cache_key:
            default_cache.delete(cache_key)

            # For the unlikely case that we did not manage to persist the
            # event we also delete the key always.
            if event is None or \
               features.has('organizations:event-attachments', event.project.organization, actor=None):
                attachment_cache.delete(cache_key)

        if start_time:
            metrics.timing('events.time-to-process',
                           time() - start_time,
                           instance=data['platform'])
Example #41
    def save(self,
             project_id,
             raw=False,
             assume_normalized=False,
             cache_key=None):
        """
        After normalizing and processing an event, save adjacent models such as
        releases and environments to postgres and write the event into
        eventstream. From there it will be picked up by Snuba and
        post-processing.

        We re-insert events with duplicate IDs into Snuba, which is responsible
        for deduplicating events. Since deduplication in Snuba is on the primary
        key (based on event ID, project ID and day), events with same IDs are only
        deduplicated if their timestamps fall on the same day. The latest event
        always wins and overwrites the value of events received earlier in that day.

        Since we increment counters and frequencies here before events get inserted
        to eventstream these numbers may be larger than the total number of
        events if we receive duplicate event IDs that fall on the same day
        (that do not hit cache first).
        """

        # Normalize if needed
        if not self._normalized:
            if not assume_normalized:
                self.normalize()
            self._normalized = True

        with metrics.timer("event_manager.save.project.get_from_cache"):
            project = Project.objects.get_from_cache(id=project_id)

        with metrics.timer("event_manager.save.organization.get_from_cache"):
            project._organization_cache = Organization.objects.get_from_cache(
                id=project.organization_id)

        job = {"data": self._data, "project_id": project_id, "raw": raw}
        jobs = [job]
        projects = {project.id: project}

        _pull_out_data(jobs, projects)

        # Right now the event type is the signal to skip the group. This
        # is going to change a lot.
        if job["event"].get_event_type() == "transaction":
            issueless_event = True
        else:
            issueless_event = False

        _get_or_create_release_many(jobs, projects)

        # XXX: remove
        if job["dist"] and job["release"]:
            job["dist"] = job["release"].add_dist(job["dist"],
                                                  job["event"].datetime)
            # dont allow a conflicting 'dist' tag
            pop_tag(job["data"], "dist")
            set_tag(job["data"], "sentry:dist", job["dist"].name)
        else:
            job["dist"] = None

        _get_event_user_many(jobs, projects)

        with metrics.timer("event_manager.load_grouping_config"):
            # At this point we want to normalize the in_app values in case the
            # clients did not set this appropriately so far.
            grouping_config = load_grouping_config(
                get_grouping_config_dict_for_event_data(job["data"], project))

        with metrics.timer("event_manager.normalize_stacktraces_for_grouping"):
            normalize_stacktraces_for_grouping(job["data"], grouping_config)

        _derive_plugin_tags_many(jobs, projects)
        _derive_interface_tags_many(jobs)

        with metrics.timer("event_manager.apply_server_fingerprinting"):
            # The active grouping config was put into the event in the
            # normalize step before.  We now also make sure that the
            # fingerprint was set to `'{{ default }}' just in case someone
            # removed it from the payload.  The call to get_hashes will then
            # look at `grouping_config` to pick the right parameters.
            job["data"]["fingerprint"] = job["data"].get("fingerprint") or [
                "{{ default }}"
            ]
            apply_server_fingerprinting(
                job["data"], get_fingerprinting_config_for_project(project))

        with metrics.timer("event_manager.event.get_hashes"):
            # Here we try to use the grouping config that was requested in the
            # event.  If that config has since been deleted (because it was an
            # experimental grouping config) we fall back to the default.
            try:
                hashes = job["event"].get_hashes()
            except GroupingConfigNotFound:
                job["data"][
                    "grouping_config"] = get_grouping_config_dict_for_project(
                        project)
                hashes = job["event"].get_hashes()

        job["data"]["hashes"] = hashes

        _materialize_metadata_many(jobs)

        job["received_timestamp"] = received_timestamp = job["event"].data.get(
            "received") or float(job["event"].datetime.strftime("%s"))

        if not issueless_event:
            # The group gets the same metadata as the event when it's flushed but
            # additionally the `last_received` key is set.  This key is used by
            # _save_aggregate.
            group_metadata = dict(job["materialized_metadata"])
            group_metadata["last_received"] = received_timestamp
            kwargs = {
                "platform": job["platform"],
                "message": job["event"].search_message,
                "culprit": job["culprit"],
                "logger": job["logger_name"],
                "level": LOG_LEVELS_MAP.get(job["level"]),
                "last_seen": job["event"].datetime,
                "first_seen": job["event"].datetime,
                "active_at": job["event"].datetime,
                "data": group_metadata,
            }

            if job["release"]:
                kwargs["first_release"] = job["release"]

            try:
                job["group"], job["is_new"], job[
                    "is_regression"] = _save_aggregate(event=job["event"],
                                                       hashes=hashes,
                                                       release=job["release"],
                                                       **kwargs)
            except HashDiscarded:
                event_discarded.send_robust(project=project,
                                            sender=EventManager)

                metrics.incr(
                    "events.discarded",
                    skip_internal=True,
                    tags={
                        "organization_id": project.organization_id,
                        "platform": job["platform"]
                    },
                )
                raise
            job["event"].group = job["group"]
        else:
            job["group"] = None
            job["is_new"] = False
            job["is_regression"] = False

        _send_event_saved_signal_many(jobs, projects)

        # store a reference to the group id to guarantee validation of isolation
        # XXX(markus): No clue what this does
        job["event"].data.bind_ref(job["event"])

        _get_or_create_environment_many(jobs, projects)

        if job["group"]:
            group_environment, job[
                "is_new_group_environment"] = GroupEnvironment.get_or_create(
                    group_id=job["group"].id,
                    environment_id=job["environment"].id,
                    defaults={"first_release": job["release"] or None},
                )
        else:
            job["is_new_group_environment"] = False

        _get_or_create_release_associated_models(jobs, projects)

        if job["release"] and job["group"]:
            job["grouprelease"] = GroupRelease.get_or_create(
                group=job["group"],
                release=job["release"],
                environment=job["environment"],
                datetime=job["event"].datetime,
            )

        _tsdb_record_all_metrics(jobs)

        if job["group"]:
            UserReport.objects.filter(project=project,
                                      event_id=job["event"].event_id).update(
                                          group=job["group"],
                                          environment=job["environment"])

        # Ensure the _metrics key exists. This is usually created during
        # ingestion and prefilled with ingestion sizes.
        event_metrics = job["event"].data.get("_metrics") or {}
        job["event"].data["_metrics"] = event_metrics

        # Capture the actual size that goes into node store.
        event_metrics["bytes.stored.event"] = len(
            json.dumps(dict(job["event"].data.items())))

        if not issueless_event:
            # Load attachments first, but persist them only at the very end,
            # after posting to the eventstream, so that all counters and the
            # eventstream are guaranteed to be updated.
            attachments = get_attachments(cache_key, job["event"])
            for attachment in attachments:
                key = "bytes.stored.%s" % (attachment.type, )
                event_metrics[key] = (event_metrics.get(key) or 0) + len(
                    attachment.data)

        _nodestore_save_many(jobs)

        if job["release"] and not issueless_event:
            if job["is_new"]:
                buffer.incr(
                    ReleaseProject,
                    {"new_groups": 1},
                    {
                        "release_id": job["release"].id,
                        "project_id": project.id
                    },
                )
            if job["is_new_group_environment"]:
                buffer.incr(
                    ReleaseProjectEnvironment,
                    {"new_issues_count": 1},
                    {
                        "project_id": project.id,
                        "release_id": job["release"].id,
                        "environment_id": job["environment"].id,
                    },
                )

        if not raw:
            if not project.first_event:
                project.update(first_event=job["event"].datetime)
                first_event_received.send_robust(project=project,
                                                 event=job["event"],
                                                 sender=Project)

        _eventstream_insert_many(jobs)

        if not issueless_event:
            # Do this last to ensure signals get emitted even if connection to the
            # file store breaks temporarily.
            save_attachments(attachments, job["event"])

        metric_tags = {"from_relay": "_relay_processed" in job["data"]}

        metrics.timing("events.latency",
                       received_timestamp - job["recorded_timestamp"],
                       tags=metric_tags)
        metrics.timing("events.size.data.post_save",
                       job["event"].size,
                       tags=metric_tags)
        metrics.incr(
            "events.post_save.normalize.errors",
            amount=len(job["data"].get("errors") or ()),
            tags=metric_tags,
        )

        self._data = job["event"].data.data
        return job["event"]
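The events.latency timing above reports ingest latency as the server-side received timestamp minus the client-recorded timestamp. A minimal, self-contained sketch of that arithmetic (report_ingest_latency and emit_timing are illustrative stand-ins, not Sentry helpers):

import time

def report_ingest_latency(recorded_timestamp, received_timestamp=None, emit_timing=print):
    # recorded_timestamp: client-side event time (unix seconds)
    # received_timestamp: server-side receive time; defaults to "now"
    if received_timestamp is None:
        received_timestamp = time.time()
    latency = received_timestamp - recorded_timestamp
    # emit_timing stands in for a metrics client such as metrics.timing()
    emit_timing("events.latency", latency)
    return latency

# usage: an event whose client timestamp is 2.5 seconds in the past
report_ingest_latency(time.time() - 2.5)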
Example #42
File: backend.py Project: mnoble/sentry
    def _query(self, project, retention_window_start, group_queryset, tags, environment,
               sort_by, limit, cursor, count_hits, paginator_options, **parameters):

        # TODO: Product decision: we currently search Group.message to handle
        # the `query` parameter, because that's what we've always done. We could
        # do that search against every event in Snuba instead, but results may
        # differ.

        now = timezone.now()
        end = parameters.get('date_to') or (now + ALLOWED_FUTURE_DELTA)
        # TODO: Presumably we want to search back to the project's full retention,
        #       which may be higher than 90 days in the past, but apparently
        #       `retention_window_start` can be None?
        start = max(
            filter(None, [
                retention_window_start,
                parameters.get('date_from'),
                now - timedelta(days=90)
            ])
        )
        assert start < end

        # TODO: It's possible `first_release` could be handled by Snuba.
        if environment is not None:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.extra(
                        where=[
                            '{} = {}'.format(
                                ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                                ds.get_sql_column(Release, 'id'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'organization'),
                            ),
                            '{} = %s'.format(
                                ds.get_sql_column(Release, 'version'),
                            ),
                        ],
                        params=[project.organization_id, version],
                        tables=[Release._meta.db_table],
                    ),
                ),
            }).build(
                group_queryset.extra(
                    where=[
                        u'{} = {}'.format(
                            ds.get_sql_column(Group, 'id'),
                            ds.get_sql_column(GroupEnvironment, 'group_id'),
                        ),
                        u'{} = %s'.format(
                            ds.get_sql_column(GroupEnvironment, 'environment_id'),
                        ),
                    ],
                    params=[environment.id],
                    tables=[GroupEnvironment._meta.db_table],
                ),
                parameters,
            )
        else:
            group_queryset = ds.QuerySetBuilder({
                'first_release': ds.CallbackCondition(
                    lambda queryset, version: queryset.filter(
                        first_release__organization_id=project.organization_id,
                        first_release__version=version,
                    ),
                ),
            }).build(
                group_queryset,
                parameters,
            )

        # pre-filter query
        candidate_hashes = dict(
            GroupHash.objects.filter(
                group__in=group_queryset
            ).values_list(
                'hash', 'group_id'
            )[:MAX_PRE_SNUBA_CANDIDATES + 1]
        )
        metrics.timing('snuba.search.num_candidates', len(candidate_hashes))

        if not candidate_hashes:
            # no matches could possibly be found from this point on
            metrics.incr('snuba.search.no_candidates')
            return Paginator(Group.objects.none()).get_result()
        elif len(candidate_hashes) > MAX_PRE_SNUBA_CANDIDATES:
            # If the pre-filter query didn't include anything to significantly
            # filter down the number of results (from 'first_release', 'query',
            # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
            # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
            # might have surpassed the MAX_PRE_SNUBA_CANDIDATES. In this case,
            # we *don't* want to pass candidates down to Snuba, and instead we
            # want Snuba to do all the filtering/sorting it can and *then* apply
            # this queryset to the results from Snuba, which we call
            # post-filtering.
            metrics.incr('snuba.search.too_many_candidates')
            candidate_hashes = None

        sort, extra_aggregations, score_fn = sort_strategies[sort_by]

        # {group_id: group_score, ...}
        snuba_groups = snuba_search(
            project_id=project.id,
            environment_id=environment and environment.id,
            tags=tags,
            start=start,
            end=end,
            sort=sort,
            extra_aggregations=extra_aggregations,
            score_fn=score_fn,
            candidate_hashes=candidate_hashes,
            **parameters
        )
        metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))

        if candidate_hashes:
            # pre-filtered candidates were passed down to Snuba,
            # so we're finished with filtering
            result_groups = snuba_groups.items()
        else:
            # pre-filtered candidates were *not* passed down to Snuba,
            # so we need to do post-filtering to verify Sentry DB predicates
            result_groups = []
            i = 0
            for i, chunk in enumerate(chunked(snuba_groups.items(), MAX_POST_SNUBA_CHUNK), 1):
                filtered_group_ids = group_queryset.filter(
                    id__in=[gid for gid, _ in chunk]
                ).values_list('id', flat=True)

                result_groups.extend(
                    (group_id, snuba_groups[group_id])
                    for group_id in filtered_group_ids
                )

            metrics.timing('snuba.search.num_post_filters', i)

        paginator_results = SequencePaginator(
            [(score, id) for (id, score) in result_groups],
            reverse=True,
            **paginator_options
        ).get_result(limit, cursor, count_hits=count_hits)

        groups = Group.objects.in_bulk(paginator_results.results)
        paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

        return paginator_results
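The query above switches between two modes: when the pre-filtered candidate set is small enough it is pushed down into the Snuba query, otherwise Snuba filters and sorts freely and the Postgres queryset is applied afterwards (post-filtering). A toy sketch of that decision, with MAX_CANDIDATES and the callables as illustrative stand-ins:

MAX_CANDIDATES = 1000  # stand-in for MAX_PRE_SNUBA_CANDIDATES

def search(candidate_ids, snuba_query, post_filter):
    # Small candidate set: let Snuba restrict to exactly these ids.
    if candidate_ids is not None and len(candidate_ids) <= MAX_CANDIDATES:
        return snuba_query(candidate_ids)
    # Too many (or no) candidates: query without them, then verify each hit.
    return [row for row in snuba_query(None) if post_filter(row)]

# usage with toy callables standing in for Snuba and the Django queryset
hits = search(
    candidate_ids={1, 2, 3},
    snuba_query=lambda ids: [(1, 10.0), (3, 7.5)],
    post_filter=lambda row: True,
)
print(hits)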
Example #43
def snuba_search(
    start,
    end,
    project_ids,
    environment_ids,
    sort_field,
    cursor=None,
    candidate_ids=None,
    limit=None,
    offset=0,
    get_sample=False,
    search_filters=None,
):
    """
    This function doesn't strictly benefit from or require being pulled out of the main
    query method above, but the query method is already large and this function at least
    extracts most of the Snuba-specific logic.

    Returns a tuple of:
     * a sorted list of (group_id, group_score) tuples sorted descending by score,
     * the count of total results (rows) available for this query.
    """
    filters = {"project_id": project_ids}

    if environment_ids is not None:
        filters["environment"] = environment_ids

    if candidate_ids:
        filters["group_id"] = sorted(candidate_ids)

    conditions = []
    having = []
    for search_filter in search_filters:
        if (
                # Don't filter on issue fields here, they're not available
                search_filter.key.name in issue_only_fields or
                # We special case date
                search_filter.key.name == "date"):
            continue
        converted_filter = convert_search_filter_to_snuba_query(search_filter)

        # Ensure that no user-generated tag that clashes with aggregation_defs is added to having
        if search_filter.key.name in aggregation_defs and not search_filter.key.is_tag:
            having.append(converted_filter)
        else:
            conditions.append(converted_filter)

    extra_aggregations = dependency_aggregations.get(sort_field, [])
    required_aggregations = set([sort_field, "total"] + extra_aggregations)
    for h in having:
        alias = h[0]
        required_aggregations.add(alias)

    aggregations = []
    for alias in required_aggregations:
        aggregations.append(aggregation_defs[alias] + [alias])

    if cursor is not None:
        having.append(
            (sort_field, ">=" if cursor.is_prev else "<=", cursor.value))

    selected_columns = []
    if get_sample:
        query_hash = md5(repr(conditions)).hexdigest()[:8]
        selected_columns.append(
            ("cityHash64", ("'{}'".format(query_hash), "group_id"), "sample"))
        sort_field = "sample"
        orderby = [sort_field]
        referrer = "search_sample"
    else:
        # Get the top matching groups by score, i.e. the actual search results
        # in the order that we want them.
        orderby = ["-{}".format(sort_field),
                   "group_id"]  # ensure stable sort within the same score
        referrer = "search"

    snuba_results = snuba.dataset_query(
        dataset=Dataset.Events,
        start=start,
        end=end,
        selected_columns=selected_columns,
        groupby=["group_id"],
        conditions=conditions,
        having=having,
        filter_keys=filters,
        aggregations=aggregations,
        orderby=orderby,
        referrer=referrer,
        limit=limit,
        offset=offset,
        # Needs to have totals_mode=after_having_exclusive so we get groups matching HAVING only
        totals=True,
        turbo=get_sample,  # Turn off FINAL when in sampling mode
        sample=1,  # Don't use clickhouse sampling, even when in turbo mode.
    )
    rows = snuba_results["data"]
    total = snuba_results["totals"]["total"]

    if not get_sample:
        metrics.timing("snuba.search.num_result_groups", len(rows))

    return [(row["group_id"], row[sort_field]) for row in rows], total
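A recurring detail above is that filters on aggregate aliases (for example times_seen) must become HAVING clauses, while everything else stays an ordinary condition. A toy version of that split; the alias set and filter tuples here are illustrative:

AGGREGATE_ALIASES = {"times_seen", "first_seen", "last_seen", "total"}  # illustrative

def split_filters(filters):
    # filters: iterable of (name, operator, value) tuples
    conditions, having = [], []
    for name, op, value in filters:
        target = having if name in AGGREGATE_ALIASES else conditions
        target.append((name, op, value))
    return conditions, having

print(split_filters([("message", "LIKE", "%boom%"), ("times_seen", ">", 10)]))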
Example #44
File: tasks.py Project: shantanu73/sentry
def assemble_download(
    data_export_id,
    export_limit=EXPORTED_ROWS_LIMIT,
    batch_size=SNUBA_MAX_RESULTS,
    offset=0,
    bytes_written=0,
    environment_id=None,
    **kwargs
):
    with sentry_sdk.start_transaction(
        op="task.data_export.assemble", name="DataExportAssemble", sampled=True,
    ):
        first_page = offset == 0

        try:
            if first_page:
                logger.info("dataexport.start", extra={"data_export_id": data_export_id})
            data_export = ExportedData.objects.get(id=data_export_id)
            if first_page:
                metrics.incr("dataexport.start", tags={"success": True}, sample_rate=1.0)
            logger.info(
                "dataexport.run", extra={"data_export_id": data_export_id, "offset": offset}
            )
        except ExportedData.DoesNotExist as error:
            if first_page:
                metrics.incr("dataexport.start", tags={"success": False}, sample_rate=1.0)
            logger.exception(error)
            return

        with sentry_sdk.configure_scope() as scope:
            if data_export.user:
                user = {}
                if data_export.user.id:
                    user["id"] = data_export.user.id
                if data_export.user.username:
                    user["username"] = data_export.user.username
                if data_export.user.email:
                    user["email"] = data_export.user.email
                scope.user = user
            scope.set_tag("organization.slug", data_export.organization.slug)
            scope.set_tag("export.type", ExportQueryType.as_str(data_export.query_type))
            scope.set_extra("export.query", data_export.query_info)

        try:
            # ensure that the export limit is set and capped at EXPORTED_ROWS_LIMIT
            if export_limit is None:
                export_limit = EXPORTED_ROWS_LIMIT
            else:
                export_limit = min(export_limit, EXPORTED_ROWS_LIMIT)

            processor = get_processor(data_export, environment_id)

            with tempfile.TemporaryFile(mode="w+b") as tf:
                # XXX(python3):
                #
                # In python2 land we write utf-8 encoded strings as bytes via
                # the csv writer (see convert_to_utf8). The CSV writer will
                # ONLY write bytes, even if you give it unicode it will convert
                # it to bytes.
                #
                # In python3 we write unicode strings (which is all the csv
                # module is able to do, it will NOT write bytes like in py2).
                # Because of this we use the codec getwriter to transform our
                # file handle to a stream writer that will encode to utf8.
                if six.PY2:
                    tfw = tf
                else:
                    tfw = codecs.getwriter("utf-8")(tf)

                writer = csv.DictWriter(tfw, processor.header_fields, extrasaction="ignore")
                if first_page:
                    writer.writeheader()

                # the position in the file at the end of the headers
                starting_pos = tf.tell()

                # the row offset relative to the start of the current task
                # this offset tells you the number of rows written during this batch fragment
                fragment_offset = 0

                # the absolute row offset from the beginning of the export
                next_offset = offset + fragment_offset

                while True:
                    # the number of rows to export in the next batch fragment
                    fragment_row_count = min(batch_size, max(export_limit - next_offset, 1))

                    rows = process_rows(processor, data_export, fragment_row_count, next_offset)
                    writer.writerows(rows)

                    fragment_offset += len(rows)
                    next_offset = offset + fragment_offset

                    if (
                        not rows
                        or len(rows) < batch_size
                        # this batch may push the file past MAX_BATCH_SIZE, but we stop right after
                        or tf.tell() - starting_pos >= MAX_BATCH_SIZE
                    ):
                        break

                tf.seek(0)
                new_bytes_written = store_export_chunk_as_blob(data_export, bytes_written, tf)
                bytes_written += new_bytes_written
        except ExportError as error:
            return data_export.email_failure(message=six.text_type(error))
        except Exception as error:
            metrics.incr("dataexport.error", tags={"error": six.text_type(error)}, sample_rate=1.0)
            logger.error(
                "dataexport.error: %s",
                six.text_type(error),
                extra={"query": data_export.payload, "org": data_export.organization_id},
            )
            capture_exception(error)

            try:
                current.retry()
            except MaxRetriesExceededError:
                metrics.incr(
                    "dataexport.end",
                    tags={"success": False, "error": six.text_type(error)},
                    sample_rate=1.0,
                )
                return data_export.email_failure(message="Internal processing failure")
        else:
            if (
                rows
                and len(rows) >= batch_size
                and new_bytes_written
                and next_offset < export_limit
            ):
                assemble_download.delay(
                    data_export_id,
                    export_limit=export_limit,
                    batch_size=batch_size,
                    offset=next_offset,
                    bytes_written=bytes_written,
                    environment_id=environment_id,
                )
            else:
                metrics.timing("dataexport.row_count", next_offset, sample_rate=1.0)
                metrics.timing("dataexport.file_size", bytes_written, sample_rate=1.0)
                merge_export_blobs.delay(data_export_id)
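The batching arithmetic above (fragment_row_count, next_offset, and the stop conditions) can be exercised in isolation. A self-contained sketch, where fetch stands in for process_rows and the byte-size stop condition is omitted:

def export_rows(fetch, export_limit, batch_size, offset=0):
    written = 0
    while True:
        # never ask for more rows than the remaining export budget allows
        fragment_row_count = min(batch_size, max(export_limit - offset, 1))
        rows = fetch(offset, fragment_row_count)
        written += len(rows)
        offset += len(rows)
        # a short (or empty) page means the source or the limit is exhausted
        if not rows or len(rows) < batch_size:
            break
    return written

# usage against a fake 25-row data source with a 20-row export limit
data = list(range(25))
print(export_rows(lambda off, n: data[off:off + n], export_limit=20, batch_size=8))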
Example #45
File: tasks.py Project: shantanu73/sentry
def merge_export_blobs(data_export_id, **kwargs):
    with sentry_sdk.start_transaction(
        op="task.data_export.merge", name="DataExportMerge", sampled=True,
    ):
        try:
            data_export = ExportedData.objects.get(id=data_export_id)
        except ExportedData.DoesNotExist as error:
            logger.exception(error)
            return

        with sentry_sdk.configure_scope() as scope:
            if data_export.user:
                user = {}
                if data_export.user.id:
                    user["id"] = data_export.user.id
                if data_export.user.username:
                    user["username"] = data_export.user.username
                if data_export.user.email:
                    user["email"] = data_export.user.email
                scope.user = user
            scope.set_tag("organization.slug", data_export.organization.slug)
            scope.set_tag("export.type", ExportQueryType.as_str(data_export.query_type))
            scope.set_extra("export.query", data_export.query_info)

        # adapted from `putfile` in `src/sentry/models/file.py`
        try:
            with transaction.atomic():
                file = File.objects.create(
                    name=data_export.file_name,
                    type="export.csv",
                    headers={"Content-Type": "text/csv"},
                )
                size = 0
                file_checksum = sha1(b"")

                for export_blob in ExportedDataBlob.objects.filter(
                    data_export=data_export
                ).order_by("offset"):
                    blob = export_blob.blob
                    FileBlobIndex.objects.create(file=file, blob=blob, offset=size)
                    size += blob.size
                    blob_checksum = sha1(b"")

                    for chunk in blob.getfile().chunks():
                        blob_checksum.update(chunk)
                        file_checksum.update(chunk)

                    if blob.checksum != blob_checksum.hexdigest():
                        raise AssembleChecksumMismatch("Checksum mismatch")

                file.size = size
                file.checksum = file_checksum.hexdigest()
                file.save()
                data_export.finalize_upload(file=file)

                time_elapsed = (timezone.now() - data_export.date_added).total_seconds()
                metrics.timing("dataexport.duration", time_elapsed, sample_rate=1.0)
                logger.info("dataexport.end", extra={"data_export_id": data_export_id})
                metrics.incr("dataexport.end", tags={"success": True}, sample_rate=1.0)
        except Exception as error:
            metrics.incr("dataexport.error", tags={"error": six.text_type(error)}, sample_rate=1.0)
            metrics.incr(
                "dataexport.end",
                tags={"success": False, "error": six.text_type(error)},
                sample_rate=1.0,
            )
            logger.error(
                "dataexport.error: %s",
                six.text_type(error),
                extra={"query": data_export.payload, "org": data_export.organization_id},
            )
            capture_exception(error)
            if isinstance(error, IntegrityError):
                message = "Failed to save the assembled file."
            else:
                message = "Internal processing failure."
            return data_export.email_failure(message=message)
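The checksum handling above feeds every chunk into both a per-blob digest (checked against the stored checksum) and a running whole-file digest. A dependency-free sketch of the same bookkeeping, using plain byte strings instead of Sentry's blob models:

import hashlib

def merge_blobs(blobs):
    # blobs: iterable of (chunks, expected_sha1_hexdigest) pairs
    file_checksum = hashlib.sha1(b"")
    for chunks, expected in blobs:
        blob_checksum = hashlib.sha1(b"")
        for chunk in chunks:
            blob_checksum.update(chunk)
            file_checksum.update(chunk)
        if blob_checksum.hexdigest() != expected:
            raise ValueError("checksum mismatch")
    return file_checksum.hexdigest()

chunks = [b"hello ", b"world"]
print(merge_blobs([(chunks, hashlib.sha1(b"hello world").hexdigest())]))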
Example #46
File: base.py Project: 280185386/sentry
def track_memory_usage(metric, **kwargs):
    before = get_rss_usage()
    try:
        yield
    finally:
        metrics.timing(metric, get_rss_usage() - before, **kwargs)
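In the Sentry source this generator is wrapped with contextlib.contextmanager (the decorator is not shown in the excerpt above). A self-contained sketch of the same try/finally timing pattern, measuring wall-clock time instead of RSS so it runs anywhere; emit_timing stands in for metrics.timing:

import time
from contextlib import contextmanager

@contextmanager
def track_duration(metric, emit_timing=print):
    before = time.time()
    try:
        yield
    finally:
        # the metric is reported even if the wrapped block raises
        emit_timing(metric, time.time() - before)

with track_duration("demo.duration"):
    sum(range(100000))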
Example #47
def cleanup(days, project, concurrency, max_procs, silent, model, router,
            timed):
    """Delete a portion of trailing data based on creation date.

    All data that is older than `--days` will be deleted.  The default for
    this is 30 days.  In the default setting all projects will be truncated
    but if you have a specific project you want to limit this to this can be
    done with the `--project` flag which accepts a project ID or a string
    with the form `org/project` where both are slugs.
    """
    if concurrency < 1:
        click.echo('Error: Minimum concurrency is 1', err=True)
        raise click.Abort()

    import math
    import multiprocessing
    import pickle
    import subprocess
    import sys
    from django.db import router as db_router
    from sentry.app import nodestore
    from sentry.db.deletion import BulkDeleteQuery
    from sentry import models

    if timed:
        import time
        from sentry.utils import metrics
        start_time = time.time()

    # list of models which this query is restricted to
    model_list = {m.lower() for m in model}

    def is_filtered(model):
        if router is not None and db_router.db_for_write(model) != router:
            return True
        if not model_list:
            return False
        return model.__name__.lower() not in model_list

    # Deletions that use `BulkDeleteQuery` (and don't need to worry about child relations)
    # (model, datetime_field, order_by)
    BULK_QUERY_DELETES = [
        (models.EventMapping, 'date_added', '-date_added'),
        (models.GroupHashTombstone, 'deleted_at', None),
        (models.GroupEmailThread, 'date', None),
        (models.GroupRuleStatus, 'date_added', None),
    ] + EXTRA_BULK_QUERY_DELETES

    # Deletions that use the `deletions` code path (which handles their child relations)
    # (model, datetime_field, order_by)
    DELETES = (
        (models.Event, 'datetime', 'datetime'),
        (models.Group, 'last_seen', 'last_seen'),
    )

    if not silent:
        click.echo('Removing expired values for LostPasswordHash')

    if is_filtered(models.LostPasswordHash):
        if not silent:
            click.echo('>> Skipping LostPasswordHash')
    else:
        models.LostPasswordHash.objects.filter(date_added__lte=timezone.now() -
                                               timedelta(hours=48)).delete()

    for model in [models.ApiGrant, models.ApiToken]:
        if not silent:
            click.echo('Removing expired values for {}'.format(model.__name__))

        if is_filtered(model):
            if not silent:
                click.echo('>> Skipping {}'.format(model.__name__))
        else:
            model.objects.filter(expires_at__lt=timezone.now()).delete()

    project_id = None
    if project:
        click.echo(
            "Bulk NodeStore deletion not available for project selection",
            err=True)
        project_id = get_project(project)
        if project_id is None:
            click.echo('Error: Project not found', err=True)
            raise click.Abort()
    else:
        if not silent:
            click.echo("Removing old NodeStore values")

        cutoff = timezone.now() - timedelta(days=days)
        try:
            nodestore.cleanup(cutoff)
        except NotImplementedError:
            click.echo("NodeStore backend does not support cleanup operation",
                       err=True)

    for bqd in BULK_QUERY_DELETES:
        if len(bqd) == 4:
            model, dtfield, order_by, chunk_size = bqd
        else:
            chunk_size = 10000
            model, dtfield, order_by = bqd

        if not silent:
            click.echo(
                "Removing {model} for days={days} project={project}".format(
                    model=model.__name__,
                    days=days,
                    project=project or '*',
                ))
        if is_filtered(model):
            if not silent:
                click.echo('>> Skipping %s' % model.__name__)
        else:
            BulkDeleteQuery(
                model=model,
                dtfield=dtfield,
                days=days,
                project_id=project_id,
                order_by=order_by,
            ).execute(chunk_size=chunk_size)

    for model, dtfield, order_by in DELETES:
        if not silent:
            click.echo(
                "Removing {model} for days={days} project={project}".format(
                    model=model.__name__,
                    days=days,
                    project=project or '*',
                ))

        if is_filtered(model):
            if not silent:
                click.echo('>> Skipping %s' % model.__name__)
        else:
            if concurrency > 1:
                shard_ids = range(concurrency)
                num_procs = min(multiprocessing.cpu_count(), max_procs)
                threads_per_proc = int(
                    math.ceil(concurrency / float(num_procs)))

                pids = []
                for shard_id_chunk in chunker(shard_ids, threads_per_proc):
                    pid = subprocess.Popen([
                        sys.argv[0],
                        'cleanup_chunk',
                        '--days',
                        six.binary_type(days),
                    ] + (
                        ['--project_id',
                         six.binary_type(project_id)] if project_id else []
                    ) + [
                        '--model',
                        pickle.dumps(model),
                        '--dtfield',
                        dtfield,
                        '--order_by',
                        order_by,
                        '--num_shards',
                        six.binary_type(concurrency),
                        '--shard_ids',
                        ",".join([six.binary_type(s) for s in shard_id_chunk]),
                    ])
                    pids.append(pid)

                total_pid_count = len(pids)
                click.echo(
                    "%s concurrent processes forked, waiting on them to complete."
                    % total_pid_count)

                complete = 0
                for pid in pids:
                    pid.wait()
                    complete += 1
                    click.echo("%s/%s concurrent processes are finished." %
                               (complete, total_pid_count))

            else:
                task = create_deletion_task(days, project_id, model, dtfield,
                                            order_by)
                _chunk_until_complete(task)

    # Clean up FileBlob instances which are no longer used and aren't super
    # recent (as there could be a race between blob creation and reference)
    if not silent:
        click.echo("Cleaning up unused FileBlob references")
    if is_filtered(models.FileBlob):
        if not silent:
            click.echo('>> Skipping FileBlob')
    else:
        cleanup_unused_files(silent)

    if timed:
        duration = int(time.time() - start_time)
        metrics.timing('cleanup.duration', duration, instance=router)
        click.echo("Clean up took %s second(s)." % duration)
Example #48
File: backend.py Project: mnoble/sentry
def snuba_search(project_id, environment_id, tags, start, end,
                 sort, extra_aggregations, score_fn, candidate_hashes, **parameters):
    """
    This function doesn't strictly benefit from or require being pulled out of the main
    query method above, but the query method is already large and this function at least
    extracts most of the Snuba-specific logic.

    Returns an OrderedDict of {group_id: group_score, ...} sorted descending by score.
    """

    from sentry.search.base import ANY

    filters = {
        'project_id': [project_id],
    }

    if environment_id is not None:
        filters['environment'] = [environment_id]

    if candidate_hashes is not None:
        filters['primary_hash'] = candidate_hashes.keys()

    having = SnubaConditionBuilder({
        'age_from': ScalarCondition('first_seen', '>'),
        'age_to': ScalarCondition('first_seen', '<'),
        'last_seen_from': ScalarCondition('last_seen', '>'),
        'last_seen_to': ScalarCondition('last_seen', '<'),
        'times_seen': CallbackCondition(
            lambda times_seen: ('times_seen', '=', times_seen),
        ),
        'times_seen_lower': ScalarCondition('times_seen', '>'),
        'times_seen_upper': ScalarCondition('times_seen', '<'),
    }).build(parameters)

    conditions = []
    for tag, val in six.iteritems(tags):
        col = u'tags[{}]'.format(tag)
        if val == ANY:
            conditions.append((col, '!=', ''))
        else:
            conditions.append((col, '=', val))

    required_aggregations = set([sort] + extra_aggregations)
    for h in having:
        alias = h[0]
        required_aggregations.add(alias)

    aggregations = []
    for alias in required_aggregations:
        aggregations.append(aggregation_defs[alias] + [alias])

    # {hash -> {<agg_alias> -> <agg_value>,
    #           <agg_alias> -> <agg_value>,
    #           ...},
    #  ...}
    # _OR_ if there's only one <agg_alias> in use
    # {hash -> <agg_value>,
    #  ...}
    snuba_results = snuba.query(
        start=start,
        end=end,
        groupby=['primary_hash'],
        conditions=conditions,
        having=having,
        filter_keys=filters,
        aggregations=aggregations,
        orderby='-' + sort,
        referrer='search',
    )
    metrics.timing('snuba.search.num_result_hashes', len(snuba_results.keys()))

    # {hash -> group_id, ...}
    if candidate_hashes is not None:
        # any hash coming back had to come from our candidate set
        hash_to_group = candidate_hashes
    else:
        hash_to_group = dict(
            GroupHash.objects.filter(
                project_id=project_id,
                hash__in=snuba_results.keys()
            ).values_list(
                'hash', 'group_id'
            )
        )

    # {group_id -> {field1: [...all values from field1 for all hashes...],
    #               field2: [...all values from field2 for all hashes...]
    #               ...}
    #  ...}
    group_data = {}
    for hash, obj in snuba_results.items():
        if hash in hash_to_group:
            group_id = hash_to_group[hash]

            if group_id not in group_data:
                group_data[group_id] = defaultdict(list)

            dest = group_data[group_id]

            # NOTE: The Snuba utility code is trying to be helpful by collapsing
            # results with only one aggregate down to the single value. It's a
            # bit of a hack that we then immediately undo that work here, but
            # many other callers get value out of that functionality. If we see
            # this pattern again we should either add an option to opt-out of
            # the 'help' here or remove it from the Snuba code altogether.
            if len(required_aggregations) == 1:
                alias = list(required_aggregations)[0]
                dest[alias].append(obj)
            else:
                for k, v in obj.items():
                    dest[k].append(v)
        else:
            logger.warning(
                'search.hash_not_found',
                extra={
                    'project_id': project_id,
                    'hash': hash,
                },
            )

    return OrderedDict(
        sorted(((gid, score_fn(data))
                for gid, data in group_data.items()), key=lambda t: t[1], reverse=True)
    )
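The hash-to-group aggregation above reduces to a small pure function: collect each hash's aggregate values under its owning group, then rank groups with a scoring callable. A toy sketch with illustrative inputs:

from collections import OrderedDict, defaultdict

def rank_groups(snuba_results, hash_to_group, score_fn):
    group_data = {}
    for hash_, fields in snuba_results.items():
        group_id = hash_to_group.get(hash_)
        if group_id is None:
            continue  # hash no longer maps to a known group
        dest = group_data.setdefault(group_id, defaultdict(list))
        for key, value in fields.items():
            dest[key].append(value)
    return OrderedDict(
        sorted(((gid, score_fn(data)) for gid, data in group_data.items()),
               key=lambda t: t[1], reverse=True))

results = {"a" * 32: {"times_seen": 5}, "b" * 32: {"times_seen": 2}}
mapping = {"a" * 32: 1, "b" * 32: 2}
print(rank_groups(results, mapping, lambda data: sum(data["times_seen"])))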
Example #49
    def save(self, project, raw=False):
        from sentry.tasks.post_process import index_event_tags
        data = self.data

        project = Project.objects.get_from_cache(id=project)

        # Check to make sure we're not about to do a bunch of work that's
        # already been done if we've processed an event with this ID. (This
        # isn't a perfect solution -- this doesn't handle ``EventMapping`` and
        # there's a race condition between here and when the event is actually
        # saved, but it's an improvement. See GH-7677.)
        try:
            event = Event.objects.get(
                project_id=project.id,
                event_id=data['event_id'],
            )
        except Event.DoesNotExist:
            pass
        else:
            self.logger.info(
                'duplicate.found',
                exc_info=True,
                extra={
                    'event_uuid': data['event_id'],
                    'project_id': project.id,
                    'model': Event.__name__,
                }
            )
            return event

        # First we pull out our top-level (non-data attr) kwargs
        event_id = data.pop('event_id')
        level = data.pop('level')
        transaction_name = data.pop('transaction', None)
        culprit = data.pop('culprit', None)
        logger_name = data.pop('logger', None)
        server_name = data.pop('server_name', None)
        site = data.pop('site', None)
        checksum = data.pop('checksum', None)
        fingerprint = data.pop('fingerprint', None)
        platform = data.pop('platform', None)
        release = data.pop('release', None)
        dist = data.pop('dist', None)
        environment = data.pop('environment', None)

        # unused
        time_spent = data.pop('time_spent', None)
        message = data.pop('message', '')

        if not culprit:
            if transaction_name:
                culprit = transaction_name
            else:
                culprit = generate_culprit(data, platform=platform)

        culprit = force_text(culprit)
        if transaction_name:
            transaction_name = force_text(transaction_name)

        recorded_timestamp = data.pop('timestamp')
        date = datetime.fromtimestamp(recorded_timestamp)
        date = date.replace(tzinfo=timezone.utc)

        kwargs = {
            'platform': platform,
        }

        event = Event(
            project_id=project.id,
            event_id=event_id,
            data=data,
            time_spent=time_spent,
            datetime=date,
            **kwargs
        )
        event._project_cache = project
        data = event.data.data

        # convert this to a dict to ensure we're only storing one value per key
        # as most parts of Sentry don't currently play well with multiple values
        tags = dict(data.get('tags') or [])
        tags['level'] = LOG_LEVELS[level]
        if logger_name:
            tags['logger'] = logger_name
        if server_name:
            tags['server_name'] = server_name
        if site:
            tags['site'] = site
        if environment:
            tags['environment'] = environment
        if transaction_name:
            tags['transaction'] = transaction_name

        if release:
            # don't allow a conflicting 'release' tag
            if 'release' in tags:
                del tags['release']
            release = Release.get_or_create(
                project=project,
                version=release,
                date_added=date,
            )

            tags['sentry:release'] = release.version

        if dist and release:
            dist = release.add_dist(dist, date)
            tags['sentry:dist'] = dist.name
        else:
            dist = None

        event_user = self._get_event_user(project, data)
        if event_user:
            # don't allow a conflicting 'user' tag
            if 'user' in tags:
                del tags['user']
            tags['sentry:user'] = event_user.tag_value

        # At this point we want to normalize the in_app values in case the
        # clients did not set this appropriately so far.
        normalize_in_app(data)

        for plugin in plugins.for_project(project, version=None):
            added_tags = safe_execute(plugin.get_tags, event, _with_transaction=False)
            if added_tags:
                # plugins should not override user provided tags
                for key, value in added_tags:
                    tags.setdefault(key, value)

        for path, iface in six.iteritems(event.interfaces):
            for k, v in iface.iter_tags():
                tags[k] = v
            # Get rid of ephemeral interface data
            if iface.ephemeral:
                data.pop(iface.get_path(), None)

        # tags are stored as a tuple
        tags = tags.items()

        data['tags'] = tags
        data['fingerprint'] = fingerprint or ['{{ default }}']

        # prioritize fingerprint over checksum as it's likely the client defaulted
        # a checksum whereas the fingerprint was explicit
        if fingerprint:
            hashes = [md5_from_hash(h) for h in get_hashes_from_fingerprint(event, fingerprint)]
        elif checksum:
            if HASH_RE.match(checksum):
                hashes = [checksum]
            else:
                hashes = [md5_from_hash([checksum]), checksum]
            data['checksum'] = checksum
        else:
            hashes = [md5_from_hash(h) for h in get_hashes_for_event(event)]

        # TODO(dcramer): temp workaround for complexity
        data['message'] = message
        event_type = eventtypes.get(data.get('type', 'default'))(data)
        event_metadata = event_type.get_metadata()
        # TODO(dcramer): temp workaround for complexity
        del data['message']

        data['type'] = event_type.key
        data['metadata'] = event_metadata

        # index components into ``Event.message``
        # See GH-3248
        if event_type.key != 'default':
            if 'sentry.interfaces.Message' in data and \
                    data['sentry.interfaces.Message']['message'] != message:
                message = u'{} {}'.format(
                    message,
                    data['sentry.interfaces.Message']['message'],
                )

        if not message:
            message = ''
        elif not isinstance(message, six.string_types):
            message = force_text(message)

        for value in six.itervalues(event_metadata):
            value_u = force_text(value, errors='replace')
            if value_u not in message:
                message = u'{} {}'.format(message, value_u)

        if culprit and culprit not in message:
            culprit_u = force_text(culprit, errors='replace')
            message = u'{} {}'.format(message, culprit_u)

        message = trim(message.strip(), settings.SENTRY_MAX_MESSAGE_LENGTH)

        event.message = message
        kwargs['message'] = message

        received_timestamp = event.data.get('received') or float(event.datetime.strftime('%s'))
        group_kwargs = kwargs.copy()
        group_kwargs.update(
            {
                'culprit': culprit,
                'logger': logger_name,
                'level': level,
                'last_seen': date,
                'first_seen': date,
                'active_at': date,
                'data': {
                    'last_received': received_timestamp,
                    'type': event_type.key,
                    # we cache the event's metadata on the group to ensure it's
                    # accessible in the stream
                    'metadata': event_metadata,
                },
            }
        )

        if release:
            group_kwargs['first_release'] = release

        try:
            group, is_new, is_regression, is_sample = self._save_aggregate(
                event=event, hashes=hashes, release=release, **group_kwargs
            )
        except HashDiscarded:
            event_discarded.send_robust(
                project=project,
                sender=EventManager,
            )

            metrics.incr(
                'events.discarded',
                skip_internal=True,
                tags={
                    'organization_id': project.organization_id,
                    'platform': platform,
                },
            )
            raise
        else:
            event_saved.send_robust(
                project=project,
                event_size=event.size,
                sender=EventManager,
            )

        event.group = group
        # store a reference to the group id to guarantee validation of isolation
        event.data.bind_ref(event)

        # When an event was sampled, the canonical source of truth
        # is the EventMapping table since we aren't going to be writing out an actual
        # Event row. Otherwise, if the Event isn't being sampled, we can safely
        # rely on the Event table itself as the source of truth and ignore
        # EventMapping since it's redundant information.
        if is_sample:
            try:
                with transaction.atomic(using=router.db_for_write(EventMapping)):
                    EventMapping.objects.create(project=project, group=group, event_id=event_id)
            except IntegrityError:
                self.logger.info(
                    'duplicate.found',
                    exc_info=True,
                    extra={
                        'event_uuid': event_id,
                        'project_id': project.id,
                        'group_id': group.id,
                        'model': EventMapping.__name__,
                    }
                )
                return event

        environment = Environment.get_or_create(
            project=project,
            name=environment,
        )

        group_environment, is_new_group_environment = GroupEnvironment.get_or_create(
            group_id=group.id,
            environment_id=environment.id,
            defaults={
                'first_release_id': release.id if release else None,
            },
        )

        if release:
            ReleaseEnvironment.get_or_create(
                project=project,
                release=release,
                environment=environment,
                datetime=date,
            )

            ReleaseProjectEnvironment.get_or_create(
                project=project,
                release=release,
                environment=environment,
                datetime=date,
            )

            grouprelease = GroupRelease.get_or_create(
                group=group,
                release=release,
                environment=environment,
                datetime=date,
            )

        counters = [
            (tsdb.models.group, group.id),
            (tsdb.models.project, project.id),
        ]

        if release:
            counters.append((tsdb.models.release, release.id))

        tsdb.incr_multi(counters, timestamp=event.datetime, environment_id=environment.id)

        frequencies = [
            # (tsdb.models.frequent_projects_by_organization, {
            #     project.organization_id: {
            #         project.id: 1,
            #     },
            # }),
            # (tsdb.models.frequent_issues_by_project, {
            #     project.id: {
            #         group.id: 1,
            #     },
            # })
            (tsdb.models.frequent_environments_by_group, {
                group.id: {
                    environment.id: 1,
                },
            })
        ]

        if release:
            frequencies.append(
                (tsdb.models.frequent_releases_by_group, {
                    group.id: {
                        grouprelease.id: 1,
                    },
                })
            )

        tsdb.record_frequency_multi(frequencies, timestamp=event.datetime)

        UserReport.objects.filter(
            project=project,
            event_id=event_id,
        ).update(
            group=group,
            environment=environment,
        )

        # save the event unless its been sampled
        if not is_sample:
            try:
                with transaction.atomic(using=router.db_for_write(Event)):
                    event.save()
            except IntegrityError:
                self.logger.info(
                    'duplicate.found',
                    exc_info=True,
                    extra={
                        'event_uuid': event_id,
                        'project_id': project.id,
                        'group_id': group.id,
                        'model': Event.__name__,
                    }
                )
                return event

            index_event_tags.delay(
                organization_id=project.organization_id,
                project_id=project.id,
                group_id=group.id,
                environment_id=environment.id,
                event_id=event.id,
                tags=tags,
                date_added=event.datetime,
            )

        if event_user:
            tsdb.record_multi(
                (
                    (tsdb.models.users_affected_by_group, group.id, (event_user.tag_value, )),
                    (tsdb.models.users_affected_by_project, project.id, (event_user.tag_value, )),
                ),
                timestamp=event.datetime,
                environment_id=environment.id,
            )
        if release:
            if is_new:
                buffer.incr(
                    ReleaseProject, {'new_groups': 1}, {
                        'release_id': release.id,
                        'project_id': project.id,
                    }
                )
            if is_new_group_environment:
                buffer.incr(
                    ReleaseProjectEnvironment, {'new_issues_count': 1}, {
                        'project_id': project.id,
                        'release_id': release.id,
                        'environment_id': environment.id,
                    }
                )

        safe_execute(Group.objects.add_tags, group, environment, tags, _with_transaction=False)

        if not raw:
            if not project.first_event:
                project.update(first_event=date)
                first_event_received.send_robust(project=project, group=group, sender=Project)

        eventstream.insert(
            group=group,
            event=event,
            is_new=is_new,
            is_sample=is_sample,
            is_regression=is_regression,
            is_new_group_environment=is_new_group_environment,
            primary_hash=hashes[0],
            # We are choosing to skip consuming the event back
            # in the eventstream if it's flagged as raw.
            # This means that we want to publish the event
            # through the event stream, but we don't care
            # about post processing and handling the commit.
            skip_consume=raw,
        )

        metrics.timing(
            'events.latency',
            received_timestamp - recorded_timestamp,
            tags={
                'project_id': project.id,
            },
        )

        return event
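The search-message assembly in this example appends event metadata values and the culprit only when they are not already present, then trims the result. A standalone sketch of that logic; the str() call and the slice stand in for force_text(..., errors='replace') and sentry.utils.strings.trim, and the max length is illustrative:

def build_search_message(message, event_metadata, culprit, max_length=1024):
    for value in event_metadata.values():
        value = str(value)  # force_text(..., errors='replace') in the original
        if value and value not in message:
            message = "{} {}".format(message, value)
    if culprit and culprit not in message:
        message = "{} {}".format(message, culprit)
    return message.strip()[:max_length]

print(build_search_message("boom", {"type": "ValueError"}, "app.views.index"))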
Example #50
def cleanup(days, project, concurrency, max_procs, silent, model, router, timed):
    """Delete a portion of trailing data based on creation date.

    All data that is older than `--days` will be deleted.  The default for
    this is 30 days.  In the default setting all projects will be truncated
    but if you have a specific project you want to limit this to this can be
    done with the `--project` flag which accepts a project ID or a string
    with the form `org/project` where both are slugs.
    """
    if concurrency < 1:
        click.echo('Error: Minimum concurrency is 1', err=True)
        raise click.Abort()

    import math
    import multiprocessing
    import pickle
    import subprocess
    import sys
    from django.db import router as db_router
    from sentry.app import nodestore
    from sentry.db.deletion import BulkDeleteQuery
    from sentry import models

    if timed:
        import time
        from sentry.utils import metrics
        start_time = time.time()

    # list of models which this query is restricted to
    model_list = {m.lower() for m in model}

    def is_filtered(model):
        if router is not None and db_router.db_for_write(model) != router:
            return True
        if not model_list:
            return False
        return model.__name__.lower() not in model_list

    # Deletions that use `BulkDeleteQuery` (and don't need to worry about child relations)
    # (model, datetime_field, order_by)
    BULK_QUERY_DELETES = [
        (models.EventMapping, 'date_added', '-date_added'),
        (models.GroupHashTombstone, 'deleted_at', None),
        (models.GroupEmailThread, 'date', None),
        (models.GroupRuleStatus, 'date_added', None),
    ] + EXTRA_BULK_QUERY_DELETES

    # Deletions that use the `deletions` code path (which handles their child relations)
    # (model, datetime_field, order_by)
    DELETES = (
        (models.Event, 'datetime', 'datetime'),
        (models.Group, 'last_seen', 'last_seen'),
    )

    if not silent:
        click.echo('Removing expired values for LostPasswordHash')

    if is_filtered(models.LostPasswordHash):
        if not silent:
            click.echo('>> Skipping LostPasswordHash')
    else:
        models.LostPasswordHash.objects.filter(
            date_added__lte=timezone.now() - timedelta(hours=48)
        ).delete()

    for model in [models.ApiGrant, models.ApiToken]:
        if not silent:
            click.echo('Removing expired values for {}'.format(model.__name__))

        if is_filtered(model):
            if not silent:
                click.echo('>> Skipping {}'.format(model.__name__))
        else:
            model.objects.filter(expires_at__lt=timezone.now()).delete()

    project_id = None
    if project:
        click.echo(
            "Bulk NodeStore deletion not available for project selection", err=True)
        project_id = get_project(project)
        if project_id is None:
            click.echo('Error: Project not found', err=True)
            raise click.Abort()
    else:
        if not silent:
            click.echo("Removing old NodeStore values")

        cutoff = timezone.now() - timedelta(days=days)
        try:
            nodestore.cleanup(cutoff)
        except NotImplementedError:
            click.echo(
                "NodeStore backend does not support cleanup operation", err=True)

    for bqd in BULK_QUERY_DELETES:
        if len(bqd) == 4:
            model, dtfield, order_by, chunk_size = bqd
        else:
            chunk_size = 10000
            model, dtfield, order_by = bqd

        if not silent:
            click.echo(
                "Removing {model} for days={days} project={project}".format(
                    model=model.__name__,
                    days=days,
                    project=project or '*',
                )
            )
        if is_filtered(model):
            if not silent:
                click.echo('>> Skipping %s' % model.__name__)
        else:
            BulkDeleteQuery(
                model=model,
                dtfield=dtfield,
                days=days,
                project_id=project_id,
                order_by=order_by,
            ).execute(chunk_size=chunk_size)

    for model, dtfield, order_by in DELETES:
        if not silent:
            click.echo(
                "Removing {model} for days={days} project={project}".format(
                    model=model.__name__,
                    days=days,
                    project=project or '*',
                )
            )

        if is_filtered(model):
            if not silent:
                click.echo('>> Skipping %s' % model.__name__)
        else:
            if concurrency > 1:
                shard_ids = range(concurrency)
                num_procs = min(multiprocessing.cpu_count(), max_procs)
                threads_per_proc = int(math.ceil(
                    concurrency / float(num_procs)))

                pids = []
                for shard_id_chunk in chunker(shard_ids, threads_per_proc):
                    pid = subprocess.Popen([
                        sys.argv[0],
                        'cleanup_chunk',
                        '--days', six.binary_type(days),
                    ] + (['--project_id', six.binary_type(project_id)] if project_id else []) + [
                        '--model', pickle.dumps(model),
                        '--dtfield', dtfield,
                        '--order_by', order_by,
                        '--num_shards', six.binary_type(concurrency),
                        '--shard_ids', ",".join([six.binary_type(s)
                                                 for s in shard_id_chunk]),
                    ])
                    pids.append(pid)

                total_pid_count = len(pids)
                click.echo(
                    "%s concurrent processes forked, waiting on them to complete." % total_pid_count)

                complete = 0
                for pid in pids:
                    pid.wait()
                    complete += 1
                    click.echo(
                        "%s/%s concurrent processes are finished." % (complete, total_pid_count))

            else:
                task = create_deletion_task(
                    days, project_id, model, dtfield, order_by)
                _chunk_until_complete(task)

    # Clean up FileBlob instances which are no longer used and aren't super
    # recent (as there could be a race between blob creation and reference)
    if not silent:
        click.echo("Cleaning up unused FileBlob references")
    if is_filtered(models.FileBlob):
        if not silent:
            click.echo('>> Skipping FileBlob')
    else:
        cleanup_unused_files(silent)

    if timed:
        duration = int(time.time() - start_time)
        metrics.timing('cleanup.duration', duration, instance=router)
        click.echo("Clean up took %s second(s)." % duration)
Example #51
0
    def query(
        self,
        projects,
        retention_window_start,
        group_queryset,
        environments,
        sort_by,
        limit,
        cursor,
        count_hits,
        paginator_options,
        search_filters,
        date_from,
        date_to,
    ):

        now = timezone.now()
        end = None
        end_params = filter(
            None,
            [date_to, get_search_filter(search_filters, "date", "<")])
        if end_params:
            end = min(end_params)

        if not end:
            end = now + ALLOWED_FUTURE_DELTA

            # This search is for some time window that ends with "now",
            # so if the requested sort is `date` (`last_seen`) and there
            # are no other Snuba-based search predicates, we can simply
            # return the results from Postgres.
            if (cursor is None and sort_by == "date" and not environments and
                    # This handles tags and date parameters for search filters.
                    not [
                        sf for sf in search_filters
                        if sf.key.name not in issue_only_fields.union(["date"])
                    ]):
                group_queryset = group_queryset.order_by("-last_seen")
                paginator = DateTimePaginator(group_queryset, "-last_seen",
                                              **paginator_options)
                # When it's a simple django-only search, we count_hits like normal
                return paginator.get_result(limit,
                                            cursor,
                                            count_hits=count_hits)

        # TODO: Presumably we only want to search back to the project's max
        # retention date, which may be closer than 90 days in the past, but
        # apparently `retention_window_start` can be None(?), so we need a
        # fallback.
        retention_date = max(
            filter(None, [retention_window_start, now - timedelta(days=90)]))

        # TODO: We should try and consolidate all this logic together a little
        # better, maybe outside the backend. Should be easier once we're on
        # just the new search filters
        start_params = [
            date_from, retention_date,
            get_search_filter(search_filters, "date", ">")
        ]
        start = max(filter(None, start_params))

        end = max([retention_date, end])

        if start == retention_date and end == retention_date:
            # Both `start` and `end` must have been trimmed to `retention_date`,
            # so this entire search was against a time range that is outside of
            # retention. We'll return empty results to maintain backwards compatibility
            # with Django search (for now).
            return EMPTY_RESULT

        if start >= end:
            # TODO: This maintains backwards compatibility with Django search, but
            # in the future we should find a way to notify the user that their search
            # is invalid.
            return EMPTY_RESULT

        # Here we check if all the django filters reduce the set of groups down
        # to something that we can send down to Snuba in a `group_id IN (...)`
        # clause.
        max_candidates = options.get("snuba.search.max-pre-snuba-candidates")
        too_many_candidates = False
        candidate_ids = list(
            group_queryset.values_list("id", flat=True)[:max_candidates + 1])
        metrics.timing("snuba.search.num_candidates", len(candidate_ids))
        if not candidate_ids:
            # no matches could possibly be found from this point on
            metrics.incr("snuba.search.no_candidates", skip_internal=False)
            return EMPTY_RESULT
        elif len(candidate_ids) > max_candidates:
            # If the pre-filter query didn't include anything to significantly
            # filter down the number of results (from 'first_release', 'query',
            # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
            # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
            # might have surpassed the `max_candidates`. In this case,
            # we *don't* want to pass candidates down to Snuba, and instead we
            # want Snuba to do all the filtering/sorting it can and *then* apply
            # this queryset to the results from Snuba, which we call
            # post-filtering.
            metrics.incr("snuba.search.too_many_candidates",
                         skip_internal=False)
            too_many_candidates = True
            candidate_ids = []

        sort_field = sort_strategies[sort_by]
        chunk_growth = options.get("snuba.search.chunk-growth-rate")
        max_chunk_size = options.get("snuba.search.max-chunk-size")
        chunk_limit = limit
        offset = 0
        num_chunks = 0
        hits = None

        paginator_results = EMPTY_RESULT
        result_groups = []
        result_group_ids = set()

        max_time = options.get("snuba.search.max-total-chunk-time-seconds")
        time_start = time.time()

        if count_hits and (too_many_candidates or cursor is not None):
            # If we had too many candidates to reasonably pass down to snuba,
            # or if we have a cursor that bisects the overall result set (such
            # that our query only sees results on one side of the cursor) then
            # we need an alternative way to figure out the total hits that this
            # query has.

            # To do this, we get a sample of groups matching the snuba side of
            # the query, and see how many of those pass the post-filter in
            # postgres. This should give us an estimate of the total number of
            # snuba matches that will be overall matches, which we can use to
            # get an estimate for X-Hits.

            # The sampling is not simple random sampling. It will return *all*
            # matching groups if there are fewer than N groups matching the
            # query, or it will return a random, deterministic subset of N of
            # the groups if there are more than N overall matches. This means
            # that the "estimate" is actually an accurate result when there are
            # fewer than N matching groups.

            # The number of samples required to achieve a certain error bound
            # with a certain confidence interval can be calculated from a
            # rearrangement of the normal approximation (Wald) confidence
            # interval formula:
            #
            # https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval
            #
            # Effectively if we want the estimate to be within +/- 10% of the
            # real value with 95% confidence, we would need (1.96^2 * p*(1-p))
            # / 0.1^2 samples. With a starting assumption of p=0.5 (this
            # requires the most samples) we would need 96 samples to achieve
            # +/-10% @ 95% confidence.
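            #
            # e.g. with p=0.5: (1.96 ** 2 * 0.5 * (1 - 0.5)) / (0.1 ** 2)
            # evaluates to 96.04, which is where the 96-sample figure above
            # comes from.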

            sample_size = options.get("snuba.search.hits-sample-size")
            snuba_groups, snuba_total = snuba_search(
                start=start,
                end=end,
                project_ids=[p.id for p in projects],
                environment_ids=environments
                and [environment.id for environment in environments],
                sort_field=sort_field,
                limit=sample_size,
                offset=0,
                get_sample=True,
                search_filters=search_filters,
            )
            snuba_count = len(snuba_groups)
            if snuba_count == 0:
                return EMPTY_RESULT
            else:
                filtered_count = group_queryset.filter(
                    id__in=[gid for gid, _ in snuba_groups]).count()

                hit_ratio = filtered_count / float(snuba_count)
                hits = int(hit_ratio * snuba_total)
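                # e.g. if 60 of the 100 sampled groups survive the Postgres
                # filter and Snuba reports 5000 total matches, the estimate
                # is int(0.6 * 5000) = 3000 hits.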

        # Do smaller searches in chunks until we have enough results
        # to answer the query (or hit the end of possible results). We do
        # this because a common case for search is to return 100 groups
        # sorted by `last_seen`, and we want to avoid returning all of
        # a project's groups and then post-sorting them all in Postgres
        # when typically the first N results will do.
        while (time.time() - time_start) < max_time:
            num_chunks += 1

            # grow the chunk size on each iteration to account for huge projects
            # and weird queries, up to a max size
            chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
            # but if we have candidate_ids always query for at least that many items
            chunk_limit = max(chunk_limit, len(candidate_ids))
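            # e.g. with limit=100, chunk_growth=2 and max_chunk_size=10000 the
            # successive chunks request 200, 400, 800, ... rows (but never
            # fewer than len(candidate_ids)).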

            # {group_id: group_score, ...}
            snuba_groups, total = snuba_search(
                start=start,
                end=end,
                project_ids=[p.id for p in projects],
                environment_ids=environments
                and [environment.id for environment in environments],
                sort_field=sort_field,
                cursor=cursor,
                candidate_ids=candidate_ids,
                limit=chunk_limit,
                offset=offset,
                search_filters=search_filters,
            )
            metrics.timing("snuba.search.num_snuba_results", len(snuba_groups))
            count = len(snuba_groups)
            more_results = count >= limit and (offset + limit) < total
            offset += len(snuba_groups)

            if not snuba_groups:
                break

            if candidate_ids:
                # pre-filtered candidates were passed down to Snuba, so we're
                # finished with filtering and these are the only results. Note
                # that because we set the chunk size to at least the size of
                # the candidate_ids, we know we got all of them (ie there are
                # no more chunks after the first)
                result_groups = snuba_groups
                if count_hits and hits is None:
                    hits = len(snuba_groups)
            else:
                # pre-filtered candidates were *not* passed down to Snuba,
                # so we need to do post-filtering to verify Sentry DB predicates
                filtered_group_ids = group_queryset.filter(
                    id__in=[gid
                            for gid, _ in snuba_groups]).values_list("id",
                                                                     flat=True)

                group_to_score = dict(snuba_groups)
                for group_id in filtered_group_ids:
                    if group_id in result_group_ids:
                        # because we're doing multiple Snuba queries, which
                        # happen outside of a transaction, there is a small possibility
                        # of groups moving around in the sort scoring underneath us,
                        # so we at least want to protect against duplicates
                        continue

                    group_score = group_to_score[group_id]
                    result_group_ids.add(group_id)
                    result_groups.append((group_id, group_score))

            # TODO do we actually have to rebuild this SequencePaginator every time
            # or can we just make it after we've broken out of the loop?
            paginator_results = SequencePaginator(
                [(score, id) for (id, score) in result_groups],
                reverse=True,
                **paginator_options).get_result(limit, cursor, known_hits=hits)

            # break the query loop for one of three reasons:
            # * we started with Postgres candidates and so only do one Snuba query max
            # * the paginator is returning enough results to satisfy the query (>= the limit)
            # * there are no more groups in Snuba to post-filter
            if candidate_ids or len(
                    paginator_results.results) >= limit or not more_results:
                break

        # HACK: We're using the SequencePaginator to mask the complexities of going
        # back and forth between two databases. This causes a problem with pagination
        # because we're 'lying' to the SequencePaginator (it thinks it has the entire
        # result set in memory when it does not). For this reason we need to make some
        # best guesses as to whether the `prev` and `next` cursors have more results.

        if len(paginator_results.results) == limit and more_results:
            # Because we are going back and forth between DBs there is a small
            # chance that we will hand the SequencePaginator exactly `limit`
            # items. In this case the paginator will assume there are no more
            # results, so we need to override the `next` cursor's results.
            paginator_results.next.has_results = True

        if cursor is not None and (not cursor.is_prev
                                   or len(paginator_results.results) > 0):
            # If the user passed a cursor, and it isn't already a 0 result `is_prev`
            # cursor, then it's worth allowing them to go back a page to check for
            # more results.
            paginator_results.prev.has_results = True

        metrics.timing("snuba.search.num_chunks", num_chunks)

        groups = Group.objects.in_bulk(paginator_results.results)
        paginator_results.results = [
            groups[k] for k in paginator_results.results if k in groups
        ]

        return paginator_results
Example #52
0
    def save(self,
             project_id,
             raw=False,
             assume_normalized=False,
             cache_key=None):
        """
        We re-insert events with duplicate IDs into Snuba, which is responsible
        for deduplicating events. Since deduplication in Snuba is on the primary
        key (based on event ID, project ID and day), events with the same ID are only
        deduplicated if their timestamps fall on the same day. The latest event
        always wins and overwrites the value of events received earlier in that day.

        Since we increment counters and frequencies here before events get inserted
        to eventstream these numbers may be larger than the total number of
        events if we receive duplicate event IDs that fall on the same day
        (that do not hit cache first).
        """

        # Normalize if needed
        if not self._normalized:
            if not assume_normalized:
                self.normalize()
            self._normalized = True

        data = self._data

        project = Project.objects.get_from_cache(id=project_id)
        project._organization_cache = Organization.objects.get_from_cache(
            id=project.organization_id)

        # Pull out the culprit
        culprit = self.get_culprit()

        # Pull the toplevel data we're interested in
        level = data.get("level")

        # TODO(mitsuhiko): this code path should be gone by July 2018.
        # This is going to be fine because no code actually still depends
        # on integers here.  When we need an integer it will be converted
        # into one later.  Old workers used to send integers here.
        if level is not None and isinstance(level, six.integer_types):
            level = LOG_LEVELS[level]

        transaction_name = data.get("transaction")
        logger_name = data.get("logger")
        release = data.get("release")
        dist = data.get("dist")
        environment = data.get("environment")
        recorded_timestamp = data.get("timestamp")

        # We need to swap out the data with the one internal to the newly
        # created event object
        event = self._get_event_instance(project_id=project_id)
        self._data = data = event.data.data

        event._project_cache = project

        date = event.datetime
        platform = event.platform
        event_id = event.event_id

        if transaction_name:
            transaction_name = force_text(transaction_name)

        # Right now the event type is the signal to skip the group. This
        # is going to change a lot.
        issueless_event = event.get_event_type() == "transaction"

        # Some of the data that are toplevel attributes are duplicated
        # into tags (logger, level, environment, transaction).  These are
        # different from legacy attributes which are normalized into tags
        # ahead of time (site, server_name).
        setdefault_path(data, "tags", value=[])
        set_tag(data, "level", level)
        if logger_name:
            set_tag(data, "logger", logger_name)
        if environment:
            set_tag(data, "environment", environment)
        if transaction_name:
            set_tag(data, "transaction", transaction_name)

        if release:
            # don't allow a conflicting 'release' tag
            pop_tag(data, "release")
            release = Release.get_or_create(project=project,
                                            version=release,
                                            date_added=date)
            set_tag(data, "sentry:release", release.version)

        if dist and release:
            dist = release.add_dist(dist, date)
            # don't allow a conflicting 'dist' tag
            pop_tag(data, "dist")
            set_tag(data, "sentry:dist", dist.name)
        else:
            dist = None

        event_user = self._get_event_user(project, data)
        if event_user:
            # don't allow a conflicting 'user' tag
            pop_tag(data, "user")
            set_tag(data, "sentry:user", event_user.tag_value)

        # At this point we want to normalize the in_app values in case the
        # clients did not set this appropriately so far.
        grouping_config = load_grouping_config(
            get_grouping_config_dict_for_event_data(data, project))
        normalize_stacktraces_for_grouping(data, grouping_config)

        for plugin in plugins.for_project(project, version=None):
            added_tags = safe_execute(plugin.get_tags,
                                      event,
                                      _with_transaction=False)
            if added_tags:
                # plugins should not override user provided tags
                for key, value in added_tags:
                    if get_tag(data, key) is None:
                        set_tag(data, key, value)

        for path, iface in six.iteritems(event.interfaces):
            for k, v in iface.iter_tags():
                set_tag(data, k, v)
            # Get rid of ephemeral interface data
            if iface.ephemeral:
                data.pop(iface.path, None)

        # The active grouping config was put into the event in the
        # normalize step before.  We now also make sure that the
        # fingerprint was set to `'{{ default }}'` just in case someone
        # removed it from the payload.  The call to get_hashes will then
        # look at `grouping_config` to pick the right parameters.
        data["fingerprint"] = data.get("fingerprint") or ["{{ default }}"]
        apply_server_fingerprinting(
            data, get_fingerprinting_config_for_project(project))

        # Here we try to use the grouping config that was requested in the
        # event.  If that config has since been deleted (because it was an
        # experimental grouping config) we fall back to the default.
        try:
            hashes = event.get_hashes()
        except GroupingConfigNotFound:
            data["grouping_config"] = get_grouping_config_dict_for_project(
                project)
            hashes = event.get_hashes()

        data["hashes"] = hashes

        # we want to freeze not just the metadata and type but also the
        # derived attributes.  The reason for this is that we push this
        # data into kafka for snuba processing and our postprocessing
        # picks up the data right from the snuba topic.  For most usage
        # however the data is dynamically overridden by Event.title and
        # Event.location (See Event.as_dict)
        materialized_metadata = self.materialize_metadata()
        data.update(materialized_metadata)
        data["culprit"] = culprit

        received_timestamp = event.data.get("received") or float(
            event.datetime.strftime("%s"))
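        # Fall back to the event's own datetime (as a unix timestamp) when no
        # "received" time was recorded on the payload.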

        if not issueless_event:
            # The group gets the same metadata as the event when it's flushed but
            # additionally the `last_received` key is set.  This key is used by
            # _save_aggregate.
            group_metadata = dict(materialized_metadata)
            group_metadata["last_received"] = received_timestamp
            kwargs = {
                "platform": platform,
                "message": event.search_message,
                "culprit": culprit,
                "logger": logger_name,
                "level": LOG_LEVELS_MAP.get(level),
                "last_seen": date,
                "first_seen": date,
                "active_at": date,
                "data": group_metadata,
            }

            if release:
                kwargs["first_release"] = release

            try:
                group, is_new, is_regression = self._save_aggregate(
                    event=event, hashes=hashes, release=release, **kwargs)
            except HashDiscarded:
                event_discarded.send_robust(project=project,
                                            sender=EventManager)

                metrics.incr(
                    "events.discarded",
                    skip_internal=True,
                    tags={
                        "organization_id": project.organization_id,
                        "platform": platform
                    },
                )
                raise
            else:
                event_saved.send_robust(project=project,
                                        event_size=event.size,
                                        sender=EventManager)
            event.group = group
        else:
            group = None
            is_new = False
            is_regression = False
            event_saved.send_robust(project=project,
                                    event_size=event.size,
                                    sender=EventManager)

        # store a reference to the group id to guarantee validation of isolation
        event.data.bind_ref(event)

        environment = Environment.get_or_create(project=project,
                                                name=environment)

        if group:
            group_environment, is_new_group_environment = GroupEnvironment.get_or_create(
                group_id=group.id,
                environment_id=environment.id,
                defaults={"first_release": release if release else None},
            )
        else:
            is_new_group_environment = False

        if release:
            ReleaseEnvironment.get_or_create(project=project,
                                             release=release,
                                             environment=environment,
                                             datetime=date)

            ReleaseProjectEnvironment.get_or_create(project=project,
                                                    release=release,
                                                    environment=environment,
                                                    datetime=date)

            if group:
                grouprelease = GroupRelease.get_or_create(
                    group=group,
                    release=release,
                    environment=environment,
                    datetime=date)

        counters = [(tsdb.models.project, project.id)]

        if group:
            counters.append((tsdb.models.group, group.id))

        if release:
            counters.append((tsdb.models.release, release.id))

        tsdb.incr_multi(counters,
                        timestamp=event.datetime,
                        environment_id=environment.id)

        frequencies = []

        if group:
            frequencies.append((tsdb.models.frequent_environments_by_group, {
                group.id: {
                    environment.id: 1
                }
            }))

            if release:
                frequencies.append((tsdb.models.frequent_releases_by_group, {
                    group.id: {
                        grouprelease.id: 1
                    }
                }))
        if frequencies:
            tsdb.record_frequency_multi(frequencies, timestamp=event.datetime)

        if group:
            UserReport.objects.filter(project=project,
                                      event_id=event_id).update(
                                          group=group, environment=environment)

        # Ensure the _metrics key exists. This is usually created during
        # ingestion and prefilled with ingestion sizes.
        event_metrics = event.data.get("_metrics") or {}
        event.data["_metrics"] = event_metrics

        # Capture the actual size that goes into node store.
        event_metrics["bytes.stored.event"] = len(
            json.dumps(dict(event.data.items())))

        # Load attachments first, but persist them at the very end, after
        # posting to the eventstream, to make sure all counters and the
        # eventstream are always incremented.
        attachments = self.get_attachments(cache_key, event)
        for attachment in attachments:
            key = "bytes.stored.%s" % (attachment.type, )
            event_metrics[key] = (event_metrics.get(key) or 0) + len(
                attachment.data)

        # Write the event to Nodestore
        event.data.save()

        if event_user:
            counters = [(tsdb.models.users_affected_by_project, project.id,
                         (event_user.tag_value, ))]

            if group:
                counters.append((tsdb.models.users_affected_by_group, group.id,
                                 (event_user.tag_value, )))

            tsdb.record_multi(counters,
                              timestamp=event.datetime,
                              environment_id=environment.id)

        if release:
            if is_new:
                buffer.incr(
                    ReleaseProject,
                    {"new_groups": 1},
                    {
                        "release_id": release.id,
                        "project_id": project.id
                    },
                )
            if is_new_group_environment:
                buffer.incr(
                    ReleaseProjectEnvironment,
                    {"new_issues_count": 1},
                    {
                        "project_id": project.id,
                        "release_id": release.id,
                        "environment_id": environment.id,
                    },
                )

        if not raw:
            if not project.first_event:
                project.update(first_event=date)
                first_event_received.send_robust(project=project,
                                                 event=event,
                                                 sender=Project)

        eventstream.insert(
            group=group,
            event=event,
            is_new=is_new,
            is_regression=is_regression,
            is_new_group_environment=is_new_group_environment,
            primary_hash=hashes[0],
            # We are choosing to skip consuming the event back
            # in the eventstream if it's flagged as raw.
            # This means that we want to publish the event
            # through the event stream, but we don't care
            # about post processing and handling the commit.
            skip_consume=raw,
        )

        # Do this last to ensure signals get emitted even if connection to the
        # file store breaks temporarily.
        self.save_attachments(attachments, event)

        metric_tags = {"from_relay": "_relay_processed" in self._data}

        metrics.timing("events.latency",
                       received_timestamp - recorded_timestamp,
                       tags=metric_tags)
        metrics.timing("events.size.data.post_save",
                       event.size,
                       tags=metric_tags)
        metrics.incr(
            "events.post_save.normalize.errors",
            amount=len(self._data.get("errors") or ()),
            tags=metric_tags,
        )

        return event
Example #53
0
File: snuba.py Project: supershadok/sentry
def timer(name, prefix="snuba.client"):
    t = time.time()
    try:
        yield
    finally:
        metrics.timing(u"{}.{}".format(prefix, name), time.time() - t)
Example #54
0
def save_event(cache_key=None,
               data=None,
               start_time=None,
               event_id=None,
               project_id=None,
               **kwargs):
    """
    Saves an event to the database.
    """
    from sentry.event_manager import HashDiscarded, EventManager
    from sentry import quotas, tsdb
    from sentry.models import ProjectKey

    if cache_key:
        data = default_cache.get(cache_key)

    if data is not None:
        data = CanonicalKeyDict(data)

    if event_id is None and data is not None:
        event_id = data['event_id']

    # only when we come from reprocessing do we get a project_id sent into
    # the task.
    if project_id is None:
        project_id = data.pop('project')

    delete_raw_event(project_id, event_id, allow_hint_clear=True)

    # This covers two cases: where data is None because we did not manage
    # to fetch it from the default cache or the empty dictionary was
    # stored in the default cache.  The former happens if the event
    # expired while being on the queue, the second happens on reprocessing
    # if the raw event was deleted concurrently while we held on to
    # it.  This causes the node store to delete the data and we end up
    # fetching an empty dict.  We could in theory not invoke `save_event`
    # in those cases but it's important that we always clean up the
    # reprocessing reports correctly or they will screw up the UI.  So
    # to future proof this correctly we just handle this case here.
    if not data:
        metrics.incr('events.failed',
                     tags={
                         'reason': 'cache',
                         'stage': 'post'
                     })
        return

    Raven.tags_context({
        'project': project_id,
    })

    try:
        manager = EventManager(data)
        event = manager.save(project_id)

        # Always load attachments from the cache so we can later prune them.
        # Only save them if the event-attachments feature is active, though.
        if features.has('organizations:event-attachments',
                        event.project.organization,
                        actor=None):
            attachments = attachment_cache.get(cache_key) or []
            for attachment in attachments:
                save_attachment(event, attachment)

    except HashDiscarded:
        increment_list = [
            (tsdb.models.project_total_received_discarded, project_id),
        ]

        try:
            project = Project.objects.get_from_cache(id=project_id)
        except Project.DoesNotExist:
            pass
        else:
            increment_list.extend([
                (tsdb.models.project_total_blacklisted, project.id),
                (tsdb.models.organization_total_blacklisted,
                 project.organization_id),
            ])

            project_key = None
            if data.get('key_id') is not None:
                try:
                    project_key = ProjectKey.objects.get_from_cache(
                        id=data['key_id'])
                except ProjectKey.DoesNotExist:
                    pass
                else:
                    increment_list.append(
                        (tsdb.models.key_total_blacklisted, project_key.id))

            quotas.refund(
                project,
                key=project_key,
                timestamp=start_time,
            )

        tsdb.incr_multi(
            increment_list,
            timestamp=to_datetime(start_time)
            if start_time is not None else None,
        )

    finally:
        if cache_key:
            default_cache.delete(cache_key)
            attachment_cache.delete(cache_key)

        if start_time:
            metrics.timing('events.time-to-process',
                           time() - start_time,
                           instance=data['platform'])
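
In the wider codebase this function is registered as a task (the decorator is not shown in the snippet). Assuming it exposes the usual Celery-style `.delay()` API, a producer that has already written the event payload to `default_cache` would enqueue it roughly like this (the `cache_key` value is hypothetical):

from time import time

save_event.delay(cache_key='e:1234abcd:42', start_time=time())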
Example #55
0
def timer(name, prefix='snuba.client'):
    t = time.time()
    try:
        yield
    finally:
        metrics.timing(u'{}.{}'.format(prefix, name), time.time() - t)
Example #56
0
def cleanup(days, project, concurrency, silent, model, router, timed):
    """Delete a portion of trailing data based on creation date.

    All data older than `--days` will be deleted.  The default is 30 days.
    By default all projects are truncated, but the cleanup can be limited to
    a specific project with the `--project` flag, which accepts a project ID
    or a string of the form `org/project` where both are slugs.
    """
    if concurrency < 1:
        click.echo('Error: Minimum concurrency is 1', err=True)
        raise click.Abort()

    os.environ['_SENTRY_CLEANUP'] = '1'

    # Make sure we fork off multiprocessing pool
    # before we import or configure the app
    from multiprocessing import Process, JoinableQueue as Queue

    pool = []
    task_queue = Queue(1000)
    for _ in xrange(concurrency):
        p = Process(target=multiprocess_worker, args=(task_queue, ))
        p.daemon = True
        p.start()
        pool.append(p)

    from sentry.runner import configure
    configure()

    from django.db import router as db_router
    from sentry.app import nodestore
    from sentry.db.deletion import BulkDeleteQuery
    from sentry import models

    if timed:
        import time
        from sentry.utils import metrics
        start_time = time.time()

    # list of models which this query is restricted to
    model_list = {m.lower() for m in model}

    def is_filtered(model):
        if router is not None and db_router.db_for_write(model) != router:
            return True
        if not model_list:
            return False
        return model.__name__.lower() not in model_list

    # Deletions that use `BulkDeleteQuery` (and don't need to worry about child relations)
    # (model, datetime_field, order_by)
    BULK_QUERY_DELETES = [
        (models.EventMapping, 'date_added', '-date_added'),
        (models.EventAttachment, 'date_added', None),
        (models.UserReport, 'date_added', None),
        (models.GroupEmailThread, 'date', None),
        (models.GroupRuleStatus, 'date_added', None),
    ] + EXTRA_BULK_QUERY_DELETES

    # Deletions that use the `deletions` code path (which handles their child relations)
    # (model, datetime_field, order_by)
    DELETES = (
        (models.Event, 'datetime', 'datetime'),
        (models.Group, 'last_seen', 'last_seen'),
    )

    if not silent:
        click.echo('Removing expired values for LostPasswordHash')

    if is_filtered(models.LostPasswordHash):
        if not silent:
            click.echo('>> Skipping LostPasswordHash')
    else:
        models.LostPasswordHash.objects.filter(date_added__lte=timezone.now() -
                                               timedelta(hours=48)).delete()

    if is_filtered(models.OrganizationMember):
        if not silent:
            click.echo('>> Skipping OrganizationMember')
    else:
        if not silent:
            click.echo('Removing expired values for OrganizationMember')
        expired_threshold = timezone.now() - timedelta(days=days)
        models.OrganizationMember.delete_expired(expired_threshold)

    for model in [models.ApiGrant, models.ApiToken]:
        if not silent:
            click.echo(u'Removing expired values for {}'.format(
                model.__name__))

        if is_filtered(model):
            if not silent:
                click.echo(u'>> Skipping {}'.format(model.__name__))
        else:
            model.objects.filter(expires_at__lt=(
                timezone.now() -
                timedelta(days=API_TOKEN_TTL_IN_DAYS)), ).delete()

    project_id = None
    if project:
        click.echo(
            "Bulk NodeStore deletion not available for project selection",
            err=True)
        project_id = get_project(project)
        if project_id is None:
            click.echo('Error: Project not found', err=True)
            raise click.Abort()
    else:
        if not silent:
            click.echo("Removing old NodeStore values")

        cutoff = timezone.now() - timedelta(days=days)
        try:
            nodestore.cleanup(cutoff)
        except NotImplementedError:
            click.echo("NodeStore backend does not support cleanup operation",
                       err=True)

    for bqd in BULK_QUERY_DELETES:
        if len(bqd) == 4:
            model, dtfield, order_by, chunk_size = bqd
        else:
            chunk_size = 10000
            model, dtfield, order_by = bqd

        if not silent:
            click.echo(
                u"Removing {model} for days={days} project={project}".format(
                    model=model.__name__,
                    days=days,
                    project=project or '*',
                ))
        if is_filtered(model):
            if not silent:
                click.echo('>> Skipping %s' % model.__name__)
        else:
            BulkDeleteQuery(
                model=model,
                dtfield=dtfield,
                days=days,
                project_id=project_id,
                order_by=order_by,
            ).execute(chunk_size=chunk_size)

    for model, dtfield, order_by in DELETES:
        if not silent:
            click.echo(
                u"Removing {model} for days={days} project={project}".format(
                    model=model.__name__,
                    days=days,
                    project=project or '*',
                ))

        if is_filtered(model):
            if not silent:
                click.echo('>> Skipping %s' % model.__name__)
        else:
            imp = '.'.join((model.__module__, model.__name__))

            q = BulkDeleteQuery(
                model=model,
                dtfield=dtfield,
                days=days,
                project_id=project_id,
                order_by=order_by,
            )

            for chunk in q.iterator(chunk_size=100):
                task_queue.put((imp, chunk))

            task_queue.join()

    # Clean up FileBlob instances which are no longer used and aren't super
    # recent (as there could be a race between blob creation and reference)
    if not silent:
        click.echo("Cleaning up unused FileBlob references")
    if is_filtered(models.FileBlob):
        if not silent:
            click.echo('>> Skipping FileBlob')
    else:
        cleanup_unused_files(silent)

    # Shut down our pool
    for _ in pool:
        task_queue.put(_STOP_WORKER)

    # And wait for it to drain
    for p in pool:
        p.join()

    if timed:
        duration = int(time.time() - start_time)
        metrics.timing('cleanup.duration',
                       duration,
                       instance=router,
                       sample_rate=1.0)
        click.echo("Clean up took %s second(s)." % duration)
Example #57
0
    def set_commits(self, commit_list):
        """
        Bind a list of commits to this release.

        This will clear any existing commit log and replace it with the given
        commits.
        """

        # Sort commit list in reverse order
        commit_list.sort(key=lambda commit: commit.get('timestamp'),
                         reverse=True)

        # TODO(dcramer): this function could use some cleanup/refactoring as it's a bit unwieldy
        from sentry.models import (Commit, CommitAuthor, CommitFileChange,
                                   Group, GroupLink, GroupResolution,
                                   GroupStatus, ReleaseCommit, ReleaseHeadCommit,
                                   Repository, PullRequest)
        from sentry.plugins.providers.repository import RepositoryProvider
        from sentry.tasks.integrations import kick_off_status_syncs
        # todo(meredith): implement for IntegrationRepositoryProvider
        commit_list = [
            c for c in commit_list if
            not RepositoryProvider.should_ignore_commit(c.get('message', ''))
        ]
        lock_key = type(self).get_lock_key(self.organization_id, self.id)
        lock = locks.get(lock_key, duration=10)
        with TimedRetryPolicy(10)(lock.acquire):
            start = time()
            with transaction.atomic():
                # TODO(dcramer): would be good to optimize the logic to avoid these
                # deletes but not overly important
                ReleaseCommit.objects.filter(release=self, ).delete()

                authors = {}
                repos = {}
                commit_author_by_commit = {}
                head_commit_by_repo = {}
                latest_commit = None
                for idx, data in enumerate(commit_list):
                    repo_name = data.get(
                        'repository') or u'organization-{}'.format(
                            self.organization_id)
                    if repo_name not in repos:
                        repos[
                            repo_name] = repo = Repository.objects.get_or_create(
                                organization_id=self.organization_id,
                                name=repo_name,
                            )[0]
                    else:
                        repo = repos[repo_name]

                    author_email = data.get('author_email')
                    if author_email is None and data.get('author_name'):
                        author_email = (re.sub(r'[^a-zA-Z0-9\-_\.]*', '',
                                               data['author_name']).lower() +
                                        '@localhost')

                    if not author_email:
                        author = None
                    elif author_email not in authors:
                        author_data = {'name': data.get('author_name')}
                        author, created = CommitAuthor.objects.create_or_update(
                            organization_id=self.organization_id,
                            email=author_email,
                            values=author_data)
                        if not created:
                            author = CommitAuthor.objects.get(
                                organization_id=self.organization_id,
                                email=author_email)
                        authors[author_email] = author
                    else:
                        author = authors[author_email]

                    commit_data = {}
                    defaults = {}

                    # Update/set message and author if they are provided.
                    if author is not None:
                        commit_data['author'] = author
                    if 'message' in data:
                        commit_data['message'] = data['message']
                    if 'timestamp' in data:
                        commit_data['date_added'] = data['timestamp']
                    else:
                        defaults['date_added'] = timezone.now()

                    commit, created = Commit.objects.create_or_update(
                        organization_id=self.organization_id,
                        repository_id=repo.id,
                        key=data['id'],
                        defaults=defaults,
                        values=commit_data)
                    if not created:
                        commit = Commit.objects.get(
                            organization_id=self.organization_id,
                            repository_id=repo.id,
                            key=data['id'])

                    if author is None:
                        author = commit.author

                    commit_author_by_commit[commit.id] = author

                    patch_set = data.get('patch_set', [])
                    for patched_file in patch_set:
                        try:
                            with transaction.atomic():
                                CommitFileChange.objects.create(
                                    organization_id=self.organization.id,
                                    commit=commit,
                                    filename=patched_file['path'],
                                    type=patched_file['type'],
                                )
                        except IntegrityError:
                            pass

                    try:
                        with transaction.atomic():
                            ReleaseCommit.objects.create(
                                organization_id=self.organization_id,
                                release=self,
                                commit=commit,
                                order=idx,
                            )
                    except IntegrityError:
                        pass

                    if latest_commit is None:
                        latest_commit = commit

                    head_commit_by_repo.setdefault(repo.id, commit.id)

                self.update(
                    commit_count=len(commit_list),
                    authors=[
                        six.text_type(a_id)
                        for a_id in ReleaseCommit.objects.filter(
                            release=self,
                            commit__author_id__isnull=False,
                        ).values_list('commit__author_id',
                                      flat=True).distinct()
                    ],
                    last_commit_id=latest_commit.id if latest_commit else None,
                )
                metrics.timing('release.set_commits.duration', time() - start)

        # fill any missing ReleaseHeadCommit entries
        for repo_id, commit_id in six.iteritems(head_commit_by_repo):
            try:
                with transaction.atomic():
                    ReleaseHeadCommit.objects.create(
                        organization_id=self.organization_id,
                        release_id=self.id,
                        repository_id=repo_id,
                        commit_id=commit_id,
                    )
            except IntegrityError:
                pass

        release_commits = list(
            ReleaseCommit.objects.filter(
                release=self).select_related('commit').values(
                    'commit_id', 'commit__key'))

        commit_resolutions = list(
            GroupLink.objects.filter(
                linked_type=GroupLink.LinkedType.commit,
                linked_id__in=[rc['commit_id'] for rc in release_commits],
            ).values_list('group_id', 'linked_id'))

        commit_group_authors = [
            (
                cr[0],  # group_id
                commit_author_by_commit.get(cr[1]))
            for cr in commit_resolutions
        ]

        pr_ids_by_merge_commit = list(
            PullRequest.objects.filter(
                merge_commit_sha__in=[
                    rc['commit__key'] for rc in release_commits
                ],
                organization_id=self.organization_id,
            ).values_list('id', flat=True))

        pull_request_resolutions = list(
            GroupLink.objects.filter(
                relationship=GroupLink.Relationship.resolves,
                linked_type=GroupLink.LinkedType.pull_request,
                linked_id__in=pr_ids_by_merge_commit,
            ).values_list('group_id', 'linked_id'))

        pr_authors = list(
            PullRequest.objects.filter(id__in=[
                prr[1] for prr in pull_request_resolutions
            ], ).select_related('author'))

        pr_authors_dict = {pra.id: pra.author for pra in pr_authors}

        pull_request_group_authors = [(prr[0], pr_authors_dict.get(prr[1]))
                                      for prr in pull_request_resolutions]

        user_by_author = {None: None}

        commits_and_prs = list(
            itertools.chain(commit_group_authors,
                            pull_request_group_authors), )

        group_project_lookup = dict(
            Group.objects.filter(id__in=[
                group_id for group_id, _ in commits_and_prs
            ], ).values_list('id', 'project_id'))

        for group_id, author in commits_and_prs:
            if author not in user_by_author:
                try:
                    user_by_author[author] = author.find_users()[0]
                except IndexError:
                    user_by_author[author] = None
            actor = user_by_author[author]

            with transaction.atomic():
                GroupResolution.objects.create_or_update(
                    group_id=group_id,
                    values={
                        'release': self,
                        'type': GroupResolution.Type.in_release,
                        'status': GroupResolution.Status.resolved,
                        'actor_id': actor.id if actor else None,
                    },
                )
                group = Group.objects.get(id=group_id, )
                group.update(status=GroupStatus.RESOLVED)
                metrics.incr('group.resolved',
                             instance='in_commit',
                             skip_internal=True)

            issue_resolved.send_robust(
                organization_id=self.organization_id,
                user=actor,
                group=group,
                project=group.project,
                resolution_type='with_commit',
                sender=type(self),
            )

            kick_off_status_syncs.apply_async(
                kwargs={
                    'project_id': group_project_lookup[group_id],
                    'group_id': group_id,
                })
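
As a usage sketch, the expected shape of `commit_list` follows from the keys read above (`id`, `repository`, `author_email`, `author_name`, `message`, `timestamp`, `patch_set`); assuming `release` is a Release instance, with hypothetical values:

release.set_commits([
    {
        'id': '1e032d1d0d2a8f8db9f2c04f8a1f3a2c9f3b5a7e',  # commit SHA
        'repository': 'getsentry/sentry',
        'author_email': 'dev@example.com',
        'author_name': 'Dev Example',
        'message': 'fix: handle empty payloads',
        'timestamp': '2019-01-01T00:00:00Z',
        'patch_set': [{'path': 'src/sentry/app.py', 'type': 'M'}],
    },
])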
Example #58
0
    def save(self, project, raw=False):
        from sentry.tasks.post_process import index_event_tags
        data = self.data

        project = Project.objects.get_from_cache(id=project)

        # Check to make sure we're not about to do a bunch of work that's
        # already been done if we've processed an event with this ID. (This
        # isn't a perfect solution -- this doesn't handle ``EventMapping`` and
        # there's a race condition between here and when the event is actually
        # saved, but it's an improvement. See GH-7677.)
        try:
            event = Event.objects.get(
                project_id=project.id,
                event_id=data['event_id'],
            )
        except Event.DoesNotExist:
            pass
        else:
            self.logger.info('duplicate.found',
                             exc_info=True,
                             extra={
                                 'event_uuid': data['event_id'],
                                 'project_id': project.id,
                                 'model': Event.__name__,
                             })
            return event

        # First we pull out our top-level (non-data attr) kwargs
        event_id = data.pop('event_id')
        level = data.pop('level')
        culprit = data.pop('transaction', None)
        if not culprit:
            culprit = data.pop('culprit', None)
        logger_name = data.pop('logger', None)
        server_name = data.pop('server_name', None)
        site = data.pop('site', None)
        checksum = data.pop('checksum', None)
        fingerprint = data.pop('fingerprint', None)
        platform = data.pop('platform', None)
        release = data.pop('release', None)
        dist = data.pop('dist', None)
        environment = data.pop('environment', None)

        # unused
        time_spent = data.pop('time_spent', None)
        message = data.pop('message', '')

        if not culprit:
            # if we generate an implicit culprit, let's not call it a
            # transaction
            transaction_name = None
            culprit = generate_culprit(data, platform=platform)
        else:
            transaction_name = culprit

        culprit = force_text(culprit)

        recorded_timestamp = data.pop('timestamp')
        date = datetime.fromtimestamp(recorded_timestamp)
        date = date.replace(tzinfo=timezone.utc)

        kwargs = {
            'platform': platform,
        }

        event = Event(project_id=project.id,
                      event_id=event_id,
                      data=data,
                      time_spent=time_spent,
                      datetime=date,
                      **kwargs)
        event._project_cache = project

        # convert this to a dict to ensure we're only storing one value per key
        # as most parts of Sentry don't currently play well with multiple values
        tags = dict(data.get('tags') or [])
        tags['level'] = LOG_LEVELS[level]
        if logger_name:
            tags['logger'] = logger_name
        if server_name:
            tags['server_name'] = server_name
        if site:
            tags['site'] = site
        if environment:
            tags['environment'] = environment
        if transaction_name:
            tags['transaction'] = transaction_name

        if release:
            # don't allow a conflicting 'release' tag
            if 'release' in tags:
                del tags['release']
            release = Release.get_or_create(
                project=project,
                version=release,
                date_added=date,
            )

            tags['sentry:release'] = release.version

        if dist and release:
            dist = release.add_dist(dist, date)
            tags['sentry:dist'] = dist.name
        else:
            dist = None

        event_user = self._get_event_user(project, data)
        if event_user:
            # don't allow a conflicting 'user' tag
            if 'user' in tags:
                del tags['user']
            tags['sentry:user'] = event_user.tag_value

        # At this point we want to normalize the in_app values in case the
        # clients did not set this appropriately so far.
        normalize_in_app(data)

        for plugin in plugins.for_project(project, version=None):
            added_tags = safe_execute(plugin.get_tags,
                                      event,
                                      _with_transaction=False)
            if added_tags:
                # plugins should not override user-provided tags
                for key, value in added_tags:
                    tags.setdefault(key, value)

        for path, iface in six.iteritems(event.interfaces):
            for k, v in iface.iter_tags():
                tags[k] = v
            # Get rid of ephemeral interface data
            if iface.ephemeral:
                data.pop(iface.get_path(), None)

        # tags are stored as a list of (key, value) pairs
        tags = tags.items()

        data['tags'] = tags
        data['fingerprint'] = fingerprint or ['{{ default }}']

        # prioritize fingerprint over checksum as it's likely the client defaulted
        # a checksum whereas the fingerprint was explicit
        if fingerprint:
            hashes = [
                md5_from_hash(h)
                for h in get_hashes_from_fingerprint(event, fingerprint)
            ]
        elif checksum:
            if HASH_RE.match(checksum):
                hashes = [checksum]
            else:
                hashes = [md5_from_hash([checksum]), checksum]
            data['checksum'] = checksum
        else:
            hashes = [md5_from_hash(h) for h in get_hashes_for_event(event)]

        # TODO(dcramer): temp workaround for complexity
        data['message'] = message
        event_type = eventtypes.get(data.get('type', 'default'))(data)
        event_metadata = event_type.get_metadata()
        # TODO(dcramer): temp workaround for complexity
        del data['message']

        data['type'] = event_type.key
        data['metadata'] = event_metadata

        # index components into ``Event.message``
        # See GH-3248
        if event_type.key != 'default':
            if 'sentry.interfaces.Message' in data and \
                    data['sentry.interfaces.Message']['message'] != message:
                message = u'{} {}'.format(
                    message,
                    data['sentry.interfaces.Message']['message'],
                )

        if not message:
            message = ''
        elif not isinstance(message, six.string_types):
            message = force_text(message)

        for value in six.itervalues(event_metadata):
            value_u = force_text(value, errors='replace')
            if value_u not in message:
                message = u'{} {}'.format(message, value_u)

        if culprit and culprit not in message:
            culprit_u = force_text(culprit, errors='replace')
            message = u'{} {}'.format(message, culprit_u)

        message = trim(message.strip(), settings.SENTRY_MAX_MESSAGE_LENGTH)

        event.message = message
        kwargs['message'] = message

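        # prefer the 'received' time captured at ingest, falling back to the
        # event's own datetime expressed as epoch seconds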
        received_timestamp = event.data.get('received') or float(
            event.datetime.strftime('%s'))
        group_kwargs = kwargs.copy()
        group_kwargs.update({
            'culprit': culprit,
            'logger': logger_name,
            'level': level,
            'last_seen': date,
            'first_seen': date,
            'active_at': date,
            'data': {
                'last_received': received_timestamp,
                'type': event_type.key,
                # we cache the event's metadata on the group to ensure it's
                # accessible in the stream
                'metadata': event_metadata,
            },
        })

        if release:
            group_kwargs['first_release'] = release

        try:
            group, is_new, is_regression, is_sample = self._save_aggregate(
                event=event, hashes=hashes, release=release, **group_kwargs)
        except HashDiscarded:
            event_discarded.send_robust(
                project=project,
                sender=EventManager,
            )

            metrics.incr(
                'events.discarded',
                skip_internal=True,
                tags={
                    'organization_id': project.organization_id,
                    'platform': platform,
                },
            )
            raise
        else:
            event_saved.send_robust(
                project=project,
                sender=EventManager,
            )

        event.group = group
        # store a reference to the group id to guarantee validation of isolation
        event.data.bind_ref(event)

        # When an event was sampled, the canonical source of truth
        # is the EventMapping table since we aren't going to be writing out an actual
        # Event row. Otherwise, if the Event isn't being sampled, we can safely
        # rely on the Event table itself as the source of truth and ignore
        # EventMapping since it's redundant information.
        if is_sample:
            try:
                with transaction.atomic(
                        using=router.db_for_write(EventMapping)):
                    EventMapping.objects.create(project=project,
                                                group=group,
                                                event_id=event_id)
            except IntegrityError:
                self.logger.info('duplicate.found',
                                 exc_info=True,
                                 extra={
                                     'event_uuid': event_id,
                                     'project_id': project.id,
                                     'group_id': group.id,
                                     'model': EventMapping.__name__,
                                 })
                return event

        environment = Environment.get_or_create(
            project=project,
            name=environment,
        )

        group_environment, is_new_group_environment = GroupEnvironment.get_or_create(
            group_id=group.id,
            environment_id=environment.id,
            defaults={
                'first_release_id': release.id if release else None,
            },
        )

        if release:
            ReleaseEnvironment.get_or_create(
                project=project,
                release=release,
                environment=environment,
                datetime=date,
            )

            ReleaseProjectEnvironment.get_or_create(
                project=project,
                release=release,
                environment=environment,
                datetime=date,
            )

            grouprelease = GroupRelease.get_or_create(
                group=group,
                release=release,
                environment=environment,
                datetime=date,
            )

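        # bump time-series counters for the group, the project and (when
        # present) the release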
        counters = [
            (tsdb.models.group, group.id),
            (tsdb.models.project, project.id),
        ]

        if release:
            counters.append((tsdb.models.release, release.id))

        tsdb.incr_multi(counters,
                        timestamp=event.datetime,
                        environment_id=environment.id)

        frequencies = [
            # (tsdb.models.frequent_projects_by_organization, {
            #     project.organization_id: {
            #         project.id: 1,
            #     },
            # }),
            # (tsdb.models.frequent_issues_by_project, {
            #     project.id: {
            #         group.id: 1,
            #     },
            # })
            (tsdb.models.frequent_environments_by_group, {
                group.id: {
                    environment.id: 1,
                },
            })
        ]

        if release:
            frequencies.append((tsdb.models.frequent_releases_by_group, {
                group.id: {
                    grouprelease.id: 1,
                },
            }))

        tsdb.record_frequency_multi(frequencies, timestamp=event.datetime)

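        # attach any user feedback already submitted for this event id to the
        # group/environment we just resolved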
        UserReport.objects.filter(
            project=project,
            event_id=event_id,
        ).update(
            group=group,
            environment=environment,
        )

        # save the event unless it's been sampled
        if not is_sample:
            try:
                with transaction.atomic(using=router.db_for_write(Event)):
                    event.save()
            except IntegrityError:
                self.logger.info('duplicate.found',
                                 exc_info=True,
                                 extra={
                                     'event_uuid': event_id,
                                     'project_id': project.id,
                                     'group_id': group.id,
                                     'model': Event.__name__,
                                 })
                return event

            index_event_tags.delay(
                organization_id=project.organization_id,
                project_id=project.id,
                group_id=group.id,
                environment_id=environment.id,
                event_id=event.id,
                tags=tags,
                date_added=event.datetime,
            )

        if event_user:
            tsdb.record_multi(
                (
                    (tsdb.models.users_affected_by_group, group.id,
                     (event_user.tag_value, )),
                    (tsdb.models.users_affected_by_project, project.id,
                     (event_user.tag_value, )),
                ),
                timestamp=event.datetime,
                environment_id=environment.id,
            )
        if release:
            if is_new:
                buffer.incr(ReleaseProject, {'new_groups': 1}, {
                    'release_id': release.id,
                    'project_id': project.id,
                })
            if is_new_group_environment:
                buffer.incr(ReleaseProjectEnvironment, {'new_issues_count': 1},
                            {
                                'project_id': project.id,
                                'release_id': release.id,
                                'environment_id': environment.id,
                            })

        safe_execute(Group.objects.add_tags,
                     group,
                     environment,
                     tags,
                     _with_transaction=False)

        if not raw:
            if not project.first_event:
                project.update(first_event=date)
                first_event_received.send(project=project,
                                          group=group,
                                          sender=Project)

            post_process_group.delay(
                group=group,
                event=event,
                is_new=is_new,
                is_sample=is_sample,
                is_regression=is_regression,
                is_new_group_environment=is_new_group_environment,
                primary_hash=hashes[0],
            )
        else:
            self.logger.info('post_process.skip.raw_event',
                             extra={'event_id': event.id})

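        # end-to-end latency: time between the client-recorded timestamp and
        # when the event was received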
        metrics.timing(
            'events.latency',
            received_timestamp - recorded_timestamp,
            tags={
                'project_id': project.id,
            },
        )

        return event
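The tag assembly above is easy to misread: user-supplied tags are merged into the dict first, plugin-provided tags are added with setdefault() so they never clobber user values, and interface-derived tags are written last and may override both. A minimal standalone sketch of that precedence (the merge_tags helper is hypothetical, not part of Sentry):

def merge_tags(user_tags, plugin_tags, interface_tags):
    # user tags first, plugin tags only fill gaps, interface tags always win
    tags = dict(user_tags or [])
    for key, value in plugin_tags:
        tags.setdefault(key, value)
    for key, value in interface_tags:
        tags[key] = value
    return list(tags.items())

print(merge_tags(
    [('level', 'error'), ('logger', 'app')],
    [('logger', 'plugin-logger'), ('browser', 'Firefox')],
    [('url', 'https://example.com')],
))
# e.g. [('level', 'error'), ('logger', 'app'), ('browser', 'Firefox'),
#       ('url', 'https://example.com')] -- order may vary on older Pythons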
Example #59
File: snuba.py Project: binlee1990/sentry
@contextmanager
def timer(name, prefix='snuba.client'):
    # times the wrapped block and reports the duration as a metric
    t = time.time()
    try:
        yield
    finally:
        metrics.timing('{}.{}'.format(prefix, name), time.time() - t)
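With timer wrapped in contextlib.contextmanager as shown above, it is used as a with-block around each Snuba call. A self-contained usage sketch, where the _Metrics stub stands in for sentry.utils.metrics so the snippet runs on its own:

import time
from contextlib import contextmanager

class _Metrics(object):
    # stand-in for sentry.utils.metrics: just print the recorded timing
    def timing(self, key, value):
        print('%s: %.6fs' % (key, value))

metrics = _Metrics()

@contextmanager
def timer(name, prefix='snuba.client'):
    t = time.time()
    try:
        yield
    finally:
        metrics.timing('{}.{}'.format(prefix, name), time.time() - t)

with timer('query'):
    time.sleep(0.05)  # stands in for an actual Snuba request
# prints something like: snuba.client.query: 0.050123s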
Example #60
    def query(
        self,
        projects,
        retention_window_start,
        group_queryset,
        environments,
        sort_by,
        limit,
        cursor,
        count_hits,
        paginator_options,
        search_filters,
        date_from,
        date_to,
        max_hits=None,
    ):

        now = timezone.now()
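        # the window end is the earliest of an explicit date_to and any
        # `date < ...` search filter; with no bound we allow a little slack
        # into the future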
        end = None
        end_params = [
            _f for _f in
            [date_to, get_search_filter(search_filters, "date", "<")] if _f
        ]
        if end_params:
            end = min(end_params)

        if not end:
            end = now + ALLOWED_FUTURE_DELTA

            metrics.incr("snuba.search.postgres_only")

            # This search is for some time window that ends with "now",
            # so if the requested sort is `date` (`last_seen`) and there
            # are no other Snuba-based search predicates, we can simply
            # return the results from Postgres.
            if (cursor is None and sort_by == "date" and
                    # This handles tags and date parameters for search filters.
                    not [
                        sf for sf in search_filters if sf.key.name not in
                        self.postgres_only_fields.union(["date"])
                    ]):
                group_queryset = group_queryset.order_by("-last_seen")
                paginator = DateTimePaginator(group_queryset, "-last_seen",
                                              **paginator_options)
                # When it's a simple Django-only search, we count hits as normal
                return paginator.get_result(limit,
                                            cursor,
                                            count_hits=count_hits,
                                            max_hits=max_hits)

        # TODO: Presumably we only want to search back to the project's max
        # retention date, which may be closer than 90 days in the past, but
        # apparently `retention_window_start` can be None(?), so we need a
        # fallback.
        retention_date = max(
            _f for _f in [retention_window_start, now - timedelta(days=90)]
            if _f)
        start_params = [
            date_from, retention_date,
            get_search_filter(search_filters, "date", ">")
        ]
        start = max(_f for _f in start_params if _f)
        end = max([retention_date, end])

        if start == retention_date and end == retention_date:
            # Both `start` and `end` must have been trimmed to `retention_date`,
            # so this entire search was against a time range that is outside of
            # retention. We'll return empty results to maintain backwards compatibility
            # with Django search (for now).
            return self.empty_result

        if start >= end:
            # TODO: This maintains backwards compatibility with Django search, but
            # in the future we should find a way to notify the user that their search
            # is invalid.
            return self.empty_result

        # Here we check if all the django filters reduce the set of groups down
        # to something that we can send down to Snuba in a `group_id IN (...)`
        # clause.
        max_candidates = options.get("snuba.search.max-pre-snuba-candidates")

        with sentry_sdk.start_span(op="snuba_group_query") as span:
            group_ids = list(
                group_queryset.values_list("id",
                                           flat=True)[:max_candidates + 1])
            span.set_data("Max Candidates", max_candidates)
            span.set_data("Result Size", len(group_ids))
        metrics.timing("snuba.search.num_candidates", len(group_ids))

        too_many_candidates = False
        if not group_ids:
            # no matches could possibly be found from this point on
            metrics.incr("snuba.search.no_candidates", skip_internal=False)
            return self.empty_result
        elif len(group_ids) > max_candidates:
            # If the pre-filter query didn't include anything to significantly
            # filter down the number of results (from 'first_release', 'status',
            # 'bookmarked_by', 'assigned_to', 'unassigned', or 'subscribed_by')
            # then it might have surpassed the `max_candidates`. In this case,
            # we *don't* want to pass candidates down to Snuba, and instead we
            # want Snuba to do all the filtering/sorting it can and *then* apply
            # this queryset to the results from Snuba, which we call
            # post-filtering.
            metrics.incr("snuba.search.too_many_candidates",
                         skip_internal=False)
            too_many_candidates = True
            group_ids = []

        sort_field = self.sort_strategies[sort_by]
        chunk_growth = options.get("snuba.search.chunk-growth-rate")
        max_chunk_size = options.get("snuba.search.max-chunk-size")
        chunk_limit = limit
        offset = 0
        num_chunks = 0
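        # estimate the total hit count up front; if the search cannot match
        # anything we can bail out early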
        hits = self.calculate_hits(
            group_ids,
            too_many_candidates,
            sort_field,
            projects,
            retention_window_start,
            group_queryset,
            environments,
            sort_by,
            limit,
            cursor,
            count_hits,
            paginator_options,
            search_filters,
            start,
            end,
        )
        if count_hits and hits == 0:
            return self.empty_result

        paginator_results = self.empty_result
        result_groups = []
        result_group_ids = set()

        max_time = options.get("snuba.search.max-total-chunk-time-seconds")
        time_start = time.time()

        # Do smaller searches in chunks until we have enough results
        # to answer the query (or hit the end of possible results). We do
        # this because a common case for search is to return 100 groups
        # sorted by `last_seen`, and we want to avoid returning all of
        # a project's groups and then post-sorting them all in Postgres
        # when typically the first N results will do.
        while (time.time() - time_start) < max_time:
            num_chunks += 1

            # grow the chunk size on each iteration to account for huge projects
            # and weird queries, up to a max size
            chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
            # but if we have group_ids always query for at least that many items
            chunk_limit = max(chunk_limit, len(group_ids))

            # {group_id: group_score, ...}
            snuba_groups, total = self.snuba_search(
                start=start,
                end=end,
                project_ids=[p.id for p in projects],
                environment_ids=environments
                and [environment.id for environment in environments],
                organization_id=projects[0].organization_id,
                sort_field=sort_field,
                cursor=cursor,
                group_ids=group_ids,
                limit=chunk_limit,
                offset=offset,
                search_filters=search_filters,
            )
            metrics.timing("snuba.search.num_snuba_results", len(snuba_groups))
            count = len(snuba_groups)
            more_results = count >= limit and (offset + limit) < total
            offset += len(snuba_groups)

            if not snuba_groups:
                break

            if group_ids:
                # pre-filtered candidates were passed down to Snuba, so we're
                # finished with filtering and these are the only results. Note
                # that because we set the chunk size to at least the size of
                # the group_ids, we know we got all of them (i.e. there are
                # no more chunks after the first)
                result_groups = snuba_groups
                if count_hits and hits is None:
                    hits = len(snuba_groups)
            else:
                # pre-filtered candidates were *not* passed down to Snuba,
                # so we need to do post-filtering to verify Sentry DB predicates
                filtered_group_ids = group_queryset.filter(
                    id__in=[gid
                            for gid, _ in snuba_groups]).values_list("id",
                                                                     flat=True)

                group_to_score = dict(snuba_groups)
                for group_id in filtered_group_ids:
                    if group_id in result_group_ids:
                        # because we're doing multiple Snuba queries, which
                        # happen outside of a transaction, there is a small possibility
                        # of groups moving around in the sort scoring underneath us,
                        # so we at least want to protect against duplicates
                        continue

                    group_score = group_to_score[group_id]
                    result_group_ids.add(group_id)
                    result_groups.append((group_id, group_score))

            # break the query loop for one of three reasons:
            # * we started with Postgres candidates and so only do one Snuba query max
            # * the paginator is returning enough results to satisfy the query (>= the limit)
            # * there are no more groups in Snuba to post-filter
            # TODO do we actually have to rebuild this SequencePaginator every time
            # or can we just make it after we've broken out of the loop?
            paginator_results = SequencePaginator(
                [(score, id) for (id, score) in result_groups],
                reverse=True,
                **paginator_options).get_result(limit,
                                                cursor,
                                                known_hits=hits,
                                                max_hits=max_hits)

            if group_ids or len(
                    paginator_results.results) >= limit or not more_results:
                break

        # HACK: We're using the SequencePaginator to mask the complexities of going
        # back and forth between two databases. This causes a problem with pagination
        # because we're 'lying' to the SequencePaginator (it thinks it has the entire
        # result set in memory when it does not). For this reason we need to make some
        # best guesses as to whether the `prev` and `next` cursors have more results.

        if len(paginator_results.results) == limit and more_results:
            # Because we are going back and forth between DBs there is a small
            # chance that we will hand the SequencePaginator exactly `limit`
            # items. In this case the paginator will assume there are no more
            # results, so we need to override the `next` cursor's results.
            paginator_results.next.has_results = True

        if cursor is not None and (not cursor.is_prev
                                   or len(paginator_results.results) > 0):
            # If the user passed a cursor, and it isn't already a 0 result `is_prev`
            # cursor, then it's worth allowing them to go back a page to check for
            # more results.
            paginator_results.prev.has_results = True

        metrics.timing("snuba.search.num_chunks", num_chunks)

        groups = Group.objects.in_bulk(paginator_results.results)
        paginator_results.results = [
            groups[k] for k in paginator_results.results if k in groups
        ]

        return paginator_results
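The core of this backend is the chunked search loop: each pass grows the chunk size geometrically (capped at a maximum) and stops once enough results have been collected, the source is exhausted, or the time budget is spent. A self-contained sketch of just that control flow, with a toy fetch_chunk callable standing in for snuba_search and the duplicate-filtering/paginator bookkeeping omitted:

import time

def chunked_search(fetch_chunk, limit, chunk_growth=1.5, max_chunk_size=1000,
                   max_time=2.0):
    # grow the chunk size each iteration until `limit` results are collected,
    # the source runs dry, or the time budget is exceeded
    results = []
    chunk_limit = limit
    offset = 0
    started = time.time()
    while (time.time() - started) < max_time:
        chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
        chunk = fetch_chunk(offset=offset, limit=chunk_limit)
        offset += len(chunk)
        results.extend(chunk)
        if len(results) >= limit or len(chunk) < chunk_limit:
            break
    return results[:limit]

# toy data source: 10,000 integers served in slices
data = list(range(10000))
fetch = lambda offset, limit: data[offset:offset + limit]
print(len(chunked_search(fetch, limit=100)))  # 100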