def save_event(cache_key=None, data=None, start_time=None, event_id=None, **kwargs):
    """
    Saves an event to the database.
    """
    from sentry.event_manager import EventManager

    if cache_key:
        data = default_cache.get(cache_key)

    if event_id is None and data is not None:
        event_id = data['event_id']

    if data is None:
        metrics.incr('events.failed', tags={'reason': 'cache', 'stage': 'post'})
        return

    project = data.pop('project')

    delete_raw_event(project, event_id)

    Raven.tags_context({
        'project': project,
    })

    try:
        manager = EventManager(data)
        manager.save(project)
    finally:
        if cache_key:
            default_cache.delete(cache_key)
        if start_time:
            metrics.timing('events.time-to-process', time() - start_time,
                           instance=data['platform'])
def process_pending(self):
    client = self.cluster.get_routing_client()
    lock_key = self._make_lock_key(self.pending_key)
    # prevent a stampede due to celerybeat + periodic task
    if not client.set(lock_key, '1', nx=True, ex=60):
        return

    try:
        keycount = 0
        with self.cluster.all() as conn:
            results = conn.zrange(self.pending_key, 0, -1)

        with self.cluster.all() as conn:
            for host_id, keys in six.iteritems(results.value):
                if not keys:
                    continue
                keycount += len(keys)
                for key in keys:
                    process_incr.apply_async(kwargs={
                        'key': key,
                    })
                conn.target([host_id]).zrem(self.pending_key, *keys)
        metrics.timing('buffer.pending-size', keycount)
    finally:
        client.delete(lock_key)
def __call__(self, function):
    start = self.clock.time()
    try:
        for i in itertools.count(1):
            try:
                return function()
            except self.exceptions as error:
                delay = self.delay(i)
                now = self.clock.time()
                if (now + delay) > (start + self.timeout):
                    raise RetryException(
                        'Could not successfully execute %r within %.3f seconds (%s attempts.)' %
                        (function, now - start, i),
                        error,
                    )
                else:
                    logger.debug(
                        'Failed to execute %r due to %r on attempt #%s, retrying in %s seconds...',
                        function,
                        error,
                        i,
                        delay,
                    )
                    self.clock.sleep(delay)
    finally:
        if self.metric_instance:
            metrics.timing(
                'timedretrypolicy.duration',
                self.clock.time() - start,
                instance=self.metric_instance,
                tags=self.metric_tags,
            )
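# Illustration (not part of the code above): a self-contained sketch of the
# same timed-retry pattern, using time.monotonic() in place of the policy's
# injected clock and a fixed delay. All names here are hypothetical; the real
# policy also reports its duration via metrics.timing as shown above.
import itertools
import time


def retry_with_timeout(function, timeout=5.0, delay=0.1, exceptions=(Exception,)):
    """Retry `function` until it succeeds or the time budget is exhausted."""
    start = time.monotonic()
    for attempt in itertools.count(1):
        try:
            return function()
        except exceptions:
            now = time.monotonic()
            if (now + delay) > (start + timeout):
                # mirrors the RetryException raised above once the budget is spent
                raise
            time.sleep(delay)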
def putfile(self, fileobj, blob_size=DEFAULT_BLOB_SIZE, commit=True):
    """
    Save a fileobj into a number of chunks.

    Returns a list of `FileBlobIndex` items.

    >>> indexes = file.putfile(fileobj)
    """
    results = []
    offset = 0
    checksum = sha1(b'')

    while True:
        contents = fileobj.read(blob_size)
        if not contents:
            break
        checksum.update(contents)

        blob_fileobj = ContentFile(contents)
        blob = FileBlob.from_file(blob_fileobj)
        results.append(FileBlobIndex.objects.create(
            file=self,
            blob=blob,
            offset=offset,
        ))
        offset += blob.size
    self.size = offset
    self.checksum = checksum.hexdigest()
    metrics.timing('filestore.file-size', offset)
    if commit:
        self.save()
    return results
def try_repeated(func):
    """
    Runs a function a few times ignoring errors we see from GCS
    due to what appears to be network issues.  This is a temporary workaround
    until we can find the root cause.
    """
    if hasattr(func, '__name__'):
        func_name = func.__name__
    elif hasattr(func, 'func'):
        # Partials
        func_name = getattr(func.func, '__name__', '__unknown__')
    else:
        func_name = '__unknown__'

    metrics_key = 'filestore.gcs.retry'
    metrics_tags = {'function': func_name}
    idx = 0
    while True:
        try:
            result = func()
            metrics_tags.update({'success': '1'})
            metrics.timing(metrics_key, idx, tags=metrics_tags)
            return result
        except (DataCorruption, TransportError, RefreshError, RequestException, OpenSSLError) as e:
            if idx >= GCS_RETRIES:
                metrics_tags.update({'success': '0', 'exception_class': e.__class__.__name__})
                metrics.timing(metrics_key, idx, tags=metrics_tags)
                raise
            idx += 1
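# Illustration (assumed call site, not from the original module): arguments are
# typically bound with functools.partial before the zero-argument callable is
# handed to `try_repeated`, which is why the '# Partials' branch above looks at
# `func.func` for a readable name. `upload_chunk` and its arguments are
# hypothetical stand-ins.
from functools import partial


def upload_chunk(path, data):
    return len(data)


bound = partial(upload_chunk, '/tmp/example', b'payload')
# bound.func.__name__ == 'upload_chunk', the name try_repeated would record;
# try_repeated(bound) would then retry on the GCS transport errors listed above.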
def putfile(self, fileobj, blob_size=DEFAULT_BLOB_SIZE):
    """
    Save a fileobj into a number of chunks.

    Returns a list of `FileBlobIndex` items.

    >>> indexes = file.putfile(fileobj)
    """
    results = []
    offset = 0

    while True:
        contents = fileobj.read(blob_size)
        if not contents:
            break

        blob_fileobj = ContentFile(contents)
        blob = FileBlob.from_file(blob_fileobj)
        results.append(
            FileBlobIndex.objects.create(
                file=self,
                blob=blob,
                offset=offset,
            )
        )
        offset += blob.size
    metrics.timing('filestore.file-size', offset)
    return results
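# Standalone illustration (not Sentry's API): the same fixed-size chunking loop
# as `putfile`, run against an in-memory stream so the offset bookkeeping is
# easy to follow. CHUNK_SIZE is an arbitrary stand-in for DEFAULT_BLOB_SIZE.
import io

CHUNK_SIZE = 4

stream = io.BytesIO(b'0123456789')
offset = 0
while True:
    chunk = stream.read(CHUNK_SIZE)
    if not chunk:
        break
    # each chunk would become one FileBlob; the index records its offset
    print(offset, chunk)
    offset += len(chunk)
# prints 0 b'0123', 4 b'4567', 8 b'89'; the final offset equals the total size,
# which is what metrics.timing('filestore.file-size', offset) reports above.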
def index_event_tags(organization_id, project_id, event_id, tags,
                     group_id, environment_id, date_added=None, **kwargs):
    from sentry import tagstore

    with configure_scope() as scope:
        scope.set_tag("project", project_id)

    create_event_tags_kwargs = {}
    if date_added is not None:
        create_event_tags_kwargs['date_added'] = date_added

    metrics.timing(
        'tagstore.tags_per_event',
        len(tags),
        tags={
            'organization_id': organization_id,
        }
    )

    tagstore.create_event_tags(
        project_id=project_id,
        group_id=group_id,
        environment_id=environment_id,
        event_id=event_id,
        tags=tags,
        **create_event_tags_kwargs
    )
def process_pending(self):
    client = self.cluster.get_routing_client()
    lock_key = self._make_lock_key(self.pending_key)
    # prevent a stampede due to celerybeat + periodic task
    if not client.set(lock_key, '1', nx=True, ex=60):
        return

    try:
        for host_id in self.cluster.hosts.iterkeys():
            conn = self.cluster.get_local_client(host_id)
            keys = conn.zrange(self.pending_key, 0, -1)
            if not keys:
                continue

            keycount = 0
            for key in keys:
                keycount += 1
                process_incr.apply_async(kwargs={
                    'key': key,
                })
            pipe = conn.pipeline()
            pipe.zrem(self.pending_key, *keys)
            pipe.execute()
            metrics.timing('buffer.pending-size', keycount)
    finally:
        client.delete(lock_key)
def index_event_tags(organization_id, project_id, event_id, tags,
                     group_id, environment_id, date_added=None, **kwargs):
    from sentry import tagstore

    Raven.tags_context({
        'project': project_id,
    })

    create_event_tags_kwargs = {}
    if date_added is not None:
        create_event_tags_kwargs['date_added'] = date_added

    metrics.timing(
        'tagstore.tags_per_event',
        len(tags),
        tags={
            'organization_id': organization_id,
        }
    )

    tagstore.create_event_tags(
        project_id=project_id,
        group_id=group_id,
        environment_id=environment_id,
        event_id=event_id,
        tags=tags,
        **create_event_tags_kwargs
    )
def normalize(self):
    with metrics.timer('events.store.normalize.duration'):
        self._normalize_impl()

    metrics.timing(
        'events.store.normalize.errors',
        len(self._data.get("errors") or ()),
    )
def set(self, key, attachments, timeout=None):
    key = self.make_key(key)
    for index, attachment in enumerate(attachments):
        compressed = zlib.compress(attachment.data)
        self.inner.set(u'{}:{}'.format(key, index), compressed, timeout, raw=True)

        metrics_tags = {'type': attachment.type}
        metrics.incr('attachments.received', tags=metrics_tags, skip_internal=False)
        metrics.timing('attachments.blob-size.raw', len(attachment.data), tags=metrics_tags)
        metrics.timing('attachments.blob-size.compressed', len(compressed), tags=metrics_tags)

    meta = [attachment.meta() for attachment in attachments]
    self.inner.set(key, meta, timeout, raw=False)
def _record_time(self, request, status_code):
    if not hasattr(request, "_view_path"):
        return

    metrics.incr(
        "view.response",
        instance=request._view_path,
        tags={"method": request.method, "status_code": status_code},
    )

    if not hasattr(request, "_start_time"):
        return

    ms = int((time.time() - request._start_time) * 1000)
    metrics.timing("view.duration", ms, instance=request._view_path, tags={"method": request.method})
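# A hedged sketch (not the actual Sentry middleware) of the counterpart hooks
# that would populate the attributes `_record_time` reads: `_start_time` stamped
# when the request arrives and `_view_path` derived from the resolved view.
# Only the two attribute names come from the code above; the class name, the
# view-name derivation, and the assumption that `_record_time` lives on the
# same class are illustrative.
import time


class ResponseMetricsMiddlewareSketch(object):
    def process_request(self, request):
        request._start_time = time.time()

    def process_view(self, request, view_func, view_args, view_kwargs):
        request._view_path = getattr(view_func, '__name__', 'unknown')

    def process_response(self, request, response):
        # assumes _record_time (defined above) is a method on this class
        self._record_time(request, response.status_code)
        return response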
def _capture_stats(event, is_new):
    # TODO(dcramer): limit platforms to... something?
    group = event.group
    platform = group.platform
    if not platform:
        return

    platform = platform.split('-', 1)[0].split('_', 1)[0]

    if is_new:
        metrics.incr('events.unique')

    metrics.incr('events.processed')
    metrics.incr('events.processed.{platform}'.format(platform=platform))
    metrics.timing('events.size.data', event.size)
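# Quick illustration of the platform normalization used above: the raw platform
# string is truncated at the first '-' or '_', so variants roll up to a root
# platform before being interpolated into the metric key. The sample values are
# made up for demonstration.
for raw in ('cocoa-objc', 'node_express', 'python'):
    root = raw.split('-', 1)[0].split('_', 1)[0]
    print(raw, '->', root)  # cocoa-objc -> cocoa, node_express -> node, python -> python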
def _capture_stats(event, is_new):
    group = event.group
    platform = group.platform or group.project.platform
    if not platform:
        return

    platform = PLATFORM_ROOTS.get(platform, platform)
    if platform not in PLATFORM_LIST:
        return

    if is_new:
        metrics.incr("events.unique", 1)

    metrics.incr("events.processed", 1)
    metrics.incr("events.processed.{platform}".format(platform=platform), 1)
    metrics.timing("events.size.data", len(unicode(event.data)))
def process(self, request, project, key, auth, helper, data, attachments=None, **kwargs):
    metrics.incr('events.total', skip_internal=False)

    if not data:
        track_outcome(project.organization_id, project.id, key.id, Outcome.INVALID, "no_data")
        raise APIError('No JSON data was found')

    remote_addr = request.META['REMOTE_ADDR']

    event_manager = EventManager(
        data,
        project=project,
        key=key,
        auth=auth,
        client_ip=remote_addr,
        user_agent=helper.context.agent,
        version=auth.version,
        content_encoding=request.META.get('HTTP_CONTENT_ENCODING', ''),
    )
    del data

    self.pre_normalize(event_manager, helper)
    event_manager.normalize()

    data = event_manager.get_data()
    dict_data = dict(data)
    data_size = len(json.dumps(dict_data))

    if data_size > 10000000:
        metrics.timing('events.size.rejected', data_size)
        track_outcome(
            project.organization_id,
            project.id,
            key.id,
            Outcome.INVALID,
            'too_large',
            event_id=dict_data.get('event_id')
        )
        raise APIForbidden("Event size exceeded 10MB after normalization.")

    metrics.timing(
        'events.size.data.post_storeendpoint',
        data_size,
        tags={'project_id': project.id}
    )

    return process_event(event_manager, project, key, remote_addr, helper, attachments)
def process_pending(self, partition=None): if partition is None and self.pending_partitions > 1: # If we're using partitions, this one task fans out into # N subtasks instead. for i in range(self.pending_partitions): process_pending.apply_async(kwargs={'partition': i}) # Explicitly also run over the unpartitioned buffer as well # to ease in transition. In practice, this should just be # super fast and is fine to do redundantly. pending_key = self._make_pending_key(partition) client = self.cluster.get_routing_client() lock_key = self._make_lock_key(pending_key) # prevent a stampede due to celerybeat + periodic task if not client.set(lock_key, '1', nx=True, ex=60): return pending_buffer = PendingBuffer(self.incr_batch_size) try: keycount = 0 with self.cluster.all() as conn: results = conn.zrange(pending_key, 0, -1) with self.cluster.all() as conn: for host_id, keys in six.iteritems(results.value): if not keys: continue keycount += len(keys) for key in keys: pending_buffer.append(key) if pending_buffer.full(): process_incr.apply_async( kwargs={ 'batch_keys': pending_buffer.flush(), } ) conn.target([host_id]).zrem(pending_key, *keys) # queue up remainder of pending keys if not pending_buffer.empty(): process_incr.apply_async(kwargs={ 'batch_keys': pending_buffer.flush(), }) metrics.timing('buffer.pending-size', keycount) finally: client.delete(lock_key)
def _capture_stats(event, is_new):
    # TODO(dcramer): limit platforms to... something?
    group = event.group
    platform = group.platform
    if not platform:
        return

    platform = platform.split('-', 1)[0].split('_', 1)[0]
    tags = {
        'platform': platform,
    }

    if is_new:
        metrics.incr('events.unique', tags=tags, skip_internal=False)

    metrics.incr('events.processed', tags=tags, skip_internal=False)
    metrics.incr(u'events.processed.{platform}'.format(platform=platform), skip_internal=False)
    metrics.timing('events.size.data', event.size, tags=tags)
def save_event(cache_key=None, data=None, start_time=None, event_id=None, **kwargs): """ Saves an event to the database. """ from sentry.event_manager import HashDiscarded, EventManager from sentry import tsdb if cache_key: data = default_cache.get(cache_key) if event_id is None and data is not None: event_id = data['event_id'] if data is None: metrics.incr('events.failed', tags={'reason': 'cache', 'stage': 'post'}) return project = data.pop('project') delete_raw_event(project, event_id, allow_hint_clear=True) Raven.tags_context({ 'project': project, }) try: manager = EventManager(data) manager.save(project) except HashDiscarded as exc: # TODO(jess): remove this before it goes out to a wider audience info_logger.info( 'discarded.hash', extra={ 'project_id': project, 'description': exc.message, } ) tsdb.incr(tsdb.models.project_total_received_discarded, project, timestamp=start_time) finally: if cache_key: default_cache.delete(cache_key) if start_time: metrics.timing( 'events.time-to-process', time() - start_time, instance=data['platform'])
def from_file(cls, fileobj): """ Retrieve a FileBlob instance for the given file. If not already present, this will cause it to be stored. >>> blob = FileBlob.from_file(fileobj) """ size = 0 checksum = sha1('') for chunk in fileobj: size += len(chunk) checksum.update(chunk) checksum = checksum.hexdigest() lock_key = 'fileblob:upload:{}'.format(checksum) # TODO(dcramer): the database here is safe, but if this lock expires # and duplicate files are uploaded then we need to prune one with Lock(lock_key, timeout=600): # test for presence try: existing = FileBlob.objects.get(checksum=checksum) except FileBlob.DoesNotExist: pass else: return existing blob = cls( size=size, checksum=checksum, storage=settings.SENTRY_FILESTORE, storage_options=settings.SENTRY_FILESTORE_OPTIONS, ) blob.path = cls.generate_unique_path(blob.timestamp) storage = blob.get_storage() storage.save(blob.path, fileobj) blob.save() metrics.timing('filestore.blob-size', blob.size) return blob
def from_file(cls, fileobj): """ Retrieve a list of FileBlobIndex instances for the given file. If not already present, this will cause it to be stored. >>> blobs = FileBlob.from_file(fileobj) """ size = 0 checksum = sha1(b'') for chunk in fileobj: size += len(chunk) checksum.update(chunk) checksum = checksum.hexdigest() # TODO(dcramer): the database here is safe, but if this lock expires # and duplicate files are uploaded then we need to prune one lock = locks.get(u'fileblob:upload:{}'.format(checksum), duration=60 * 10) with TimedRetryPolicy(60)(lock.acquire): # test for presence try: existing = FileBlob.objects.get(checksum=checksum) except FileBlob.DoesNotExist: pass else: return existing blob = cls( size=size, checksum=checksum, ) blob.path = cls.generate_unique_path(blob.timestamp) storage = get_storage() storage.save(blob.path, fileobj) blob.save() metrics.timing('filestore.blob-size', size) return blob
def _record_time(self, request, status_code):
    if not hasattr(request, '_view_path'):
        return

    metrics.incr(
        'view.response',
        instance=request._view_path,
        tags={
            'method': request.method,
            'status_code': status_code,
        }
    )

    if not hasattr(request, '_start_time'):
        return

    ms = int((time.time() - request._start_time) * 1000)
    metrics.timing(
        'view.duration',
        ms,
        instance=request._view_path,
        tags={
            'method': request.method,
        }
    )
def save_event(cache_key=None, data=None, start_time=None, **kwargs):
    """
    Saves an event to the database.
    """
    from sentry.event_manager import EventManager

    if cache_key:
        data = default_cache.get(cache_key)

    if data is None:
        return

    project = data.pop('project')

    try:
        manager = EventManager(data)
        manager.save(project)
    finally:
        if cache_key:
            default_cache.delete(cache_key)
        if start_time:
            metrics.timing('events.time-to-process', time() - start_time)
def get_task_kwargs_for_message(value):
    """
    Decodes a message body, returning a dictionary of keyword arguments that
    can be applied to a post-processing task, or ``None`` if no task should
    be dispatched.
    """
    metrics.timing('eventstream.events.size.data', len(value))
    payload = json.loads(value)

    try:
        version = payload[0]
    except Exception:
        raise InvalidPayload('Received event payload with unexpected structure')

    try:
        handler = version_handlers[int(version)]
    except (ValueError, KeyError):
        raise InvalidVersion(
            'Received event payload with unexpected version identifier: {}'.format(version))

    return handler(*payload[1:])
def get(self, request, public_key, minified): """Returns a js file that can be integrated into a website""" start_time = time.time() key = None try: key = ProjectKey.objects.get_from_cache( public_key=public_key ) except ProjectKey.DoesNotExist: pass context, sdk_version, sdk_url = self._get_context(key) instance = "default" if not sdk_url: instance = "noop" tmpl = 'sentry/js-sdk-loader-noop.js.tmpl' elif minified is not None: instance = "minified" tmpl = 'sentry/js-sdk-loader.min.js.tmpl' else: tmpl = 'sentry/js-sdk-loader.js.tmpl' metrics.incr('js-sdk-loader.rendered', instance=instance, skip_internal=False) response = render_to_response(tmpl, context, content_type="text/javascript") response['Access-Control-Allow-Origin'] = '*' response['Cache-Control'] = CACHE_CONTROL if sdk_version and key: response['Surrogate-Key'] = 'project/%s sdk/%s sdk-loader' % ( key.project_id, sdk_version) ms = int((time.time() - start_time) * 1000) metrics.timing('js-sdk-loader.duration', ms, instance=instance) return response
def scrub_data(project, event):
    for config in get_all_pii_configs(project):
        metrics.timing(
            "datascrubbing.config.num_applications", len(config.get("applications") or ())
        )

        total_rules = 0
        for selector, rules in (config.get("applications") or {}).items():
            metrics.timing("datascrubbing.config.selectors.size", len(selector))
            metrics.timing("datascrubbing.config.rules_per_selector.size", len(rules))
            total_rules += len(rules)

        metrics.timing("datascrubbing.config.rules.size", total_rules)

        event = sentry_relay.pii_strip_event(config, event)

    return event
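# A hedged sketch of the config shape the loop above implies: "applications"
# maps a selector string to a list of rules, so the metrics record the selector
# length and the number of rules per selector. The selector and rule names
# below are illustrative, not taken from a real project configuration.
example_config = {
    "applications": {
        "$string": ["@creditcard:mask", "@password:remove"],
    },
}

total_rules = sum(len(rules) for rules in example_config["applications"].values())
# metrics.timing("datascrubbing.config.rules.size", total_rules) would report 2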
def from_file(cls, fileobj, logger=nooplogger):
    """
    Retrieve a single FileBlob instance for the given file.
    """
    logger.debug("FileBlob.from_file.start")

    size, checksum = _get_size_and_checksum(fileobj)

    # TODO(dcramer): the database here is safe, but if this lock expires
    # and duplicate files are uploaded then we need to prune one
    with _locked_blob(checksum, logger=logger) as existing:
        if existing is not None:
            return existing

        blob = cls(size=size, checksum=checksum)
        blob.path = cls.generate_unique_path()
        storage = get_storage()
        storage.save(blob.path, fileobj)
        blob.save()

    metrics.timing("filestore.blob-size", size)
    logger.debug("FileBlob.from_file.end")
    return blob
def _capture_stats(event, is_new):
    # TODO(dcramer): limit platforms to... something?
    platform = event.group.platform if event.group else event.platform
    if not platform:
        return

    platform = platform.split("-", 1)[0].split("_", 1)[0]
    tags = {"platform": platform}

    if is_new:
        metrics.incr("events.unique", tags=tags, skip_internal=False)

    metrics.incr("events.processed", tags=tags, skip_internal=False)
    metrics.incr(u"events.processed.{platform}".format(platform=platform), skip_internal=False)
    metrics.timing("events.size.data", event.size, tags=tags)

    # This is an experiment to understand whether we have, in production,
    # mismatches between event and group before we permanently rely on events
    # for the platform. Before adding more verbose logging for this case, a
    # stat will give us a sense of the magnitude of the problem.
    if event.group:
        if event.group.platform != event.platform:
            metrics.incr("events.platform_mismatch", tags=tags)
def _upload_and_pend_chunk(fileobj, size, checksum, lock):
    logger.debug(
        "FileBlob.from_files._upload_and_pend_chunk.start",
        extra={"checksum": checksum, "size": size},
    )
    blob = cls(size=size, checksum=checksum)
    blob.path = cls.generate_unique_path()
    storage = get_storage()
    storage.save(blob.path, fileobj)
    blobs_to_save.append((blob, lock))
    metrics.timing("filestore.blob-size", size, tags={"function": "from_files"})
    logger.debug(
        "FileBlob.from_files._upload_and_pend_chunk.end",
        extra={"checksum": checksum, "path": blob.path},
    )
def process_pending(self): client = self.cluster.get_routing_client() lock_key = self._make_lock_key(self.pending_key) # prevent a stampede due to celerybeat + periodic task if not client.set(lock_key, '1', nx=True, ex=60): return pending_buffer = PendingBuffer(self.incr_batch_size) try: keycount = 0 with self.cluster.all() as conn: results = conn.zrange(self.pending_key, 0, -1) with self.cluster.all() as conn: for host_id, keys in six.iteritems(results.value): if not keys: continue keycount += len(keys) for key in keys: pending_buffer.append(key) if pending_buffer.full(): process_incr.apply_async( kwargs={ 'batch_keys': pending_buffer.flush(), }) conn.target([host_id]).zrem(self.pending_key, *keys) # queue up remainder of pending keys if not pending_buffer.empty(): process_incr.apply_async(kwargs={ 'batch_keys': pending_buffer.flush(), }) metrics.timing('buffer.pending-size', keycount) finally: client.delete(lock_key)
def get(self, request, public_key, minified): """Returns a js file that can be integrated into a website""" start_time = time.time() key = None try: key = ProjectKey.objects.get_from_cache(public_key=public_key) except ProjectKey.DoesNotExist: pass else: key.project = Project.objects.get_from_cache(id=key.project_id) context, sdk_version, sdk_url = self._get_context(key) instance = "default" if not sdk_url: instance = "noop" tmpl = "sentry/js-sdk-loader-noop.js.tmpl" elif minified is not None: instance = "minified" tmpl = "sentry/js-sdk-loader.min.js.tmpl" else: tmpl = "sentry/js-sdk-loader.js.tmpl" metrics.incr("js-sdk-loader.rendered", instance=instance, skip_internal=False) response = render_to_response(tmpl, context, content_type="text/javascript") response["Access-Control-Allow-Origin"] = "*" response["Cache-Control"] = CACHE_CONTROL if sdk_version and key: response["Surrogate-Key"] = f"project/{key.project_id} sdk/{sdk_version} sdk-loader" ms = int((time.time() - start_time) * 1000) metrics.timing("js-sdk-loader.duration", ms, instance=instance) return response
def _record_time(self, request, status_code):
    if not hasattr(request, '_view_path'):
        return

    metrics.incr(
        'view.response',
        instance=request._view_path,
        tags={
            'method': request.method,
            'status_code': status_code,
        },
        skip_internal=False,
    )

    if not hasattr(request, '_start_time'):
        return

    ms = int((time.time() - request._start_time) * 1000)
    metrics.timing('view.duration', ms, instance=request._view_path, tags={
        'method': request.method,
    })
def assemble_from_file_blob_ids(self, file_blob_ids, checksum, commit=True): """ This creates a file, from file blobs and returns a temp file with the contents. """ tf = tempfile.NamedTemporaryFile() with transaction.atomic(): file_blobs = FileBlob.objects.filter(id__in=file_blob_ids).all() # Make sure the blobs are sorted with the order provided file_blobs = sorted(file_blobs, key=lambda blob: file_blob_ids.index(blob.id)) new_checksum = sha1(b'') offset = 0 for blob in file_blobs: FileBlobIndex.objects.create( file=self, blob=blob, offset=offset, ) for chunk in blob.getfile().chunks(): new_checksum.update(chunk) tf.write(chunk) offset += blob.size self.size = offset self.checksum = new_checksum.hexdigest() if checksum != self.checksum: raise AssembleChecksumMismatch('Checksum mismatch') metrics.timing('filestore.file-size', offset) if commit: self.save() tf.flush() tf.seek(0) return tf
def __call__(self, function): start = self.clock.time() try: for i in itertools.count(1): try: return function() except self.exceptions as error: if self.log_original_error: logger.info(error) delay = self.delay(i) now = self.clock.time() if (now + delay) > (start + self.timeout): raise RetryException( "Could not successfully execute %r within %.3f seconds (%s attempts.)" % (function, now - start, i), error, ) else: logger.debug( "Failed to execute %r due to %r on attempt #%s, retrying in %s seconds...", function, error, i, delay, ) self.clock.sleep(delay) finally: if self.metric_instance: from sentry.utils import metrics metrics.timing( "timedretrypolicy.duration", self.clock.time() - start, instance=self.metric_instance, tags=self.metric_tags, )
def get_task_kwargs_for_message(value):
    """
    Decodes a message body, returning a dictionary of keyword arguments that
    can be applied to a post-processing task, or ``None`` if no task should
    be dispatched.
    """
    metrics.timing("eventstream.events.size.data", len(value))
    payload = json.loads(value)

    try:
        version = payload[0]
    except Exception:
        raise InvalidPayload("Received event payload with unexpected structure")

    try:
        handler = version_handlers[int(version)]
    except (ValueError, KeyError):
        raise InvalidVersion(
            "Received event payload with unexpected version identifier: {}".format(version))

    return handler(*payload[1:])
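# Illustration of the payload convention the function above decodes: a JSON
# array whose first element is the protocol version and whose remaining
# elements are passed positionally to the matching handler. The version number
# and handler arguments below are made up.
import json

value = json.dumps([2, "insert", {"event_id": "abc123"}])
payload = json.loads(value)
version, args = payload[0], payload[1:]  # 2, ["insert", {...}]
# handler = version_handlers[int(version)]; handler(*args)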
def putfile(self, fileobj, blob_size=DEFAULT_BLOB_SIZE, commit=True, logger=nooplogger): """ Save a fileobj into a number of chunks. Returns a list of `FileBlobIndex` items. >>> indexes = file.putfile(fileobj) """ results = [] offset = 0 checksum = sha1(b"") while True: contents = fileobj.read(blob_size) if not contents: break checksum.update(contents) blob_fileobj = ContentFile(contents) blob = FileBlob.from_file(blob_fileobj, logger=logger) results.append( FileBlobIndex.objects.create(file=self, blob=blob, offset=offset)) offset += blob.size self.size = offset self.checksum = checksum.hexdigest() metrics.timing("filestore.file-size", offset) if commit: self.save() return results
def scrub_data(project_config, event, in_processing=False, old_event=None):
    for config in get_all_pii_configs(project_config):
        metrics.timing(
            "datascrubbing.config.num_applications", len(config.get("applications") or ())
        )

        total_rules = 0
        for selector, rules in six.iteritems(config.get("applications") or {}):
            metrics.timing("datascrubbing.config.selectors.size", len(selector))
            metrics.timing("datascrubbing.config.rules_per_selector.size", len(rules))
            total_rules += len(rules)

        metrics.timing("datascrubbing.config.rules.size", total_rules)

        if in_processing:
            assert old_event is not None
            config = _narrow_pii_config_for_processing(config, old_event, event)

        event = sentry_relay.pii_strip_event(config, event)

    return event
def cleanup(days, project, concurrency, silent, model, router, timed): """Delete a portion of trailing data based on creation date. All data that is older than `--days` will be deleted. The default for this is 30 days. In the default setting all projects will be truncated but if you have a specific project you want to limit this to this can be done with the `--project` flag which accepts a project ID or a string with the form `org/project` where both are slugs. """ if concurrency < 1: click.echo('Error: Minimum concurrency is 1', err=True) raise click.Abort() os.environ['_SENTRY_CLEANUP'] = '1' # Make sure we fork off multiprocessing pool # before we import or configure the app from multiprocessing import Process, JoinableQueue as Queue pool = [] task_queue = Queue(1000) for _ in xrange(concurrency): p = Process(target=multiprocess_worker, args=(task_queue,)) p.daemon = True p.start() pool.append(p) from sentry.runner import configure configure() from django.db import router as db_router from sentry.app import nodestore from sentry.db.deletion import BulkDeleteQuery from sentry import models if timed: import time from sentry.utils import metrics start_time = time.time() # list of models which this query is restricted to model_list = {m.lower() for m in model} def is_filtered(model): if router is not None and db_router.db_for_write(model) != router: return True if not model_list: return False return model.__name__.lower() not in model_list # Deletions that use `BulkDeleteQuery` (and don't need to worry about child relations) # (model, datetime_field, order_by) BULK_QUERY_DELETES = [ (models.EventMapping, 'date_added', '-date_added'), (models.EventAttachment, 'date_added', None), (models.UserReport, 'date_added', None), (models.GroupEmailThread, 'date', None), (models.GroupRuleStatus, 'date_added', None), ] + EXTRA_BULK_QUERY_DELETES # Deletions that use the `deletions` code path (which handles their child relations) # (model, datetime_field, order_by) DELETES = ( (models.Event, 'datetime', 'datetime'), (models.Group, 'last_seen', 'last_seen'), ) if not silent: click.echo('Removing expired values for LostPasswordHash') if is_filtered(models.LostPasswordHash): if not silent: click.echo('>> Skipping LostPasswordHash') else: models.LostPasswordHash.objects.filter( date_added__lte=timezone.now() - timedelta(hours=48) ).delete() if is_filtered(models.OrganizationMember) and not silent: click.echo('>> Skipping OrganizationMember') else: click.echo('Removing expired values for OrganizationMember') expired_threshold = timezone.now() - timedelta(days=days) models.OrganizationMember.delete_expired(expired_threshold) for model in [models.ApiGrant, models.ApiToken]: if not silent: click.echo(u'Removing expired values for {}'.format(model.__name__)) if is_filtered(model): if not silent: click.echo(u'>> Skipping {}'.format(model.__name__)) else: model.objects.filter( expires_at__lt=(timezone.now() - timedelta(days=days)), ).delete() project_id = None if project: click.echo( "Bulk NodeStore deletion not available for project selection", err=True) project_id = get_project(project) if project_id is None: click.echo('Error: Project not found', err=True) raise click.Abort() else: if not silent: click.echo("Removing old NodeStore values") cutoff = timezone.now() - timedelta(days=days) try: nodestore.cleanup(cutoff) except NotImplementedError: click.echo( "NodeStore backend does not support cleanup operation", err=True) for bqd in BULK_QUERY_DELETES: if len(bqd) == 4: model, dtfield, order_by, chunk_size = bqd 
else: chunk_size = 10000 model, dtfield, order_by = bqd if not silent: click.echo( u"Removing {model} for days={days} project={project}".format( model=model.__name__, days=days, project=project or '*', ) ) if is_filtered(model): if not silent: click.echo('>> Skipping %s' % model.__name__) else: BulkDeleteQuery( model=model, dtfield=dtfield, days=days, project_id=project_id, order_by=order_by, ).execute(chunk_size=chunk_size) for model, dtfield, order_by in DELETES: if not silent: click.echo( u"Removing {model} for days={days} project={project}".format( model=model.__name__, days=days, project=project or '*', ) ) if is_filtered(model): if not silent: click.echo('>> Skipping %s' % model.__name__) else: imp = '.'.join((model.__module__, model.__name__)) q = BulkDeleteQuery( model=model, dtfield=dtfield, days=days, project_id=project_id, order_by=order_by, ) for chunk in q.iterator(chunk_size=100): task_queue.put((imp, chunk)) task_queue.join() # Clean up FileBlob instances which are no longer used and aren't super # recent (as there could be a race between blob creation and reference) if not silent: click.echo("Cleaning up unused FileBlob references") if is_filtered(models.FileBlob): if not silent: click.echo('>> Skipping FileBlob') else: cleanup_unused_files(silent) # Shut down our pool for _ in pool: task_queue.put(_STOP_WORKER) # And wait for it to drain for p in pool: p.join() if timed: duration = int(time.time() - start_time) metrics.timing('cleanup.duration', duration, instance=router) click.echo("Clean up took %s second(s)." % duration)
def callback_timing( context: Context, method_name: str, callargs: Mapping[str, Any], backend_names: Sequence[str], results: Sequence[TimedFuture], metric_name: str, result_comparator: Optional[Callable[[str, str, str, Any, Any], Mapping[str, str]]] = None, sample_rate: Optional[float] = None, ) -> None: """ Collects timing stats on results returned to the callback method of a `ServiceDelegator`. Either partial this and pass it directly as the `callback_func` or :param metric_name: Prefix to use when writing these timing metrics to Datadog :param method_name: method_name passed to callback :param backend_names: backend_names passed to callback :param results: results passed to callback :param result_comparator: An optional comparator to compare the primary result to each secondary result. Should return a dict represents the result of the comparison. This will be merged into tags to be stored in the metrics backend. :return: """ if not len(backend_names) > 1: return primary_backend_name = backend_names[0] primary_future = results[0] primary_status = get_future_status(primary_future) primary_timing = primary_future.get_timing() # If either endpoint of the timing data is not set, just ignore this call. # This really shouldn't happen on the primary backend, but playing it safe # here out of an abundance of caution. if not all(primary_timing): logger.warning( "Received timing with unexpected endpoint: %r, primary_backend_name: %r, future_status: %r", primary_timing, primary_backend_name, primary_status, ) return primary_duration_ms = (primary_timing[1] - primary_timing[0]) * 1000 metric_kwargs = {} if sample_rate is not None: metric_kwargs["sample_rate"] = sample_rate metrics.timing( f"{metric_name}.timing_ms", primary_duration_ms, tags={ "method": method_name, "backend": primary_backend_name, "status": primary_status, "primary": "true", }, **metric_kwargs, ) for i, secondary_backend_name in enumerate(backend_names[1:], 1): secondary_future = results[i] secondary_timing = secondary_future.get_timing() secondary_status = get_future_status(secondary_future) tags = { "method": method_name, "primary_backend": primary_backend_name, "primary_status": primary_status, "secondary_backend": secondary_backend_name, "secondary_status": secondary_status, } if result_comparator: comparator_result = result_comparator( method_name, primary_status, secondary_status, primary_future.result(), secondary_future.result(), ) tags.update(comparator_result) # If either endpoint of the timing data is not set, this means # something weird happened (more than likely a cancellation.) if not all(secondary_timing): metrics.incr( f"{metric_name}.timing_invalid", tags={**tags, "reason": get_invalid_timing_reason(secondary_timing)}, ) else: secondary_duration_ms = (secondary_timing[1] - secondary_timing[0]) * 1000 metrics.timing( f"{metric_name}.timing_ms", secondary_duration_ms, tags={ "method": method_name, "backend": secondary_backend_name, "status": secondary_status, "primary": "false", }, **metric_kwargs, ) metrics.timing( f"{metric_name}.timing_delta_ms", secondary_duration_ms - primary_duration_ms, tags=tags, **metric_kwargs, ) metrics.timing( f"{metric_name}.timing_relative_delta", secondary_duration_ms / primary_duration_ms, tags=tags, **metric_kwargs, )
def _do_save_event(cache_key=None, data=None, start_time=None, event_id=None, project_id=None, **kwargs): """ Saves an event to the database. """ from sentry.event_manager import HashDiscarded, EventManager, track_outcome from sentry import quotas from sentry.models import ProjectKey if cache_key and data is None: data = default_cache.get(cache_key) if data is not None: data = CanonicalKeyDict(data) if event_id is None and data is not None: event_id = data['event_id'] # only when we come from reprocessing we get a project_id sent into # the task. if project_id is None: project_id = data.pop('project') key_id = None if data is None else data.get('key_id') timestamp = to_datetime(start_time) if start_time is not None else None delete_raw_event(project_id, event_id, allow_hint_clear=True) # This covers two cases: where data is None because we did not manage # to fetch it from the default cache or the empty dictionary was # stored in the default cache. The former happens if the event # expired while being on the queue, the second happens on reprocessing # if the raw event was deleted concurrently while we held on to # it. This causes the node store to delete the data and we end up # fetching an empty dict. We could in theory not invoke `save_event` # in those cases but it's important that we always clean up the # reprocessing reports correctly or they will screw up the UI. So # to future proof this correctly we just handle this case here. if not data: metrics.incr('events.failed', tags={ 'reason': 'cache', 'stage': 'post' }, skip_internal=False) return with configure_scope() as scope: scope.set_tag("project", project_id) event = None try: manager = EventManager(data) event = manager.save(project_id, assume_normalized=True) # Always load attachments from the cache so we can later prune them. # Only save them if the event-attachments feature is active, though. if features.has('organizations:event-attachments', event.project.organization, actor=None): attachments = attachment_cache.get(cache_key) or [] for attachment in attachments: save_attachment(event, attachment) # This is where we can finally say that we have accepted the event. track_outcome(event.project.organization_id, event.project.id, key_id, 'accepted', None, timestamp) except HashDiscarded: project = Project.objects.get_from_cache(id=project_id) reason = FilterStatKeys.DISCARDED_HASH project_key = None try: if key_id is not None: project_key = ProjectKey.objects.get_from_cache(id=key_id) except ProjectKey.DoesNotExist: pass quotas.refund(project, key=project_key, timestamp=start_time) track_outcome(project.organization_id, project_id, key_id, 'filtered', reason, timestamp) finally: if cache_key: default_cache.delete(cache_key) # For the unlikely case that we did not manage to persist the # event we also delete the key always. if event is None or \ features.has('organizations:event-attachments', event.project.organization, actor=None): attachment_cache.delete(cache_key) if start_time: metrics.timing('events.time-to-process', time() - start_time, instance=data['platform'])
def save(self, project_id, raw=False, assume_normalized=False, cache_key=None): """ After normalizing and processing an event, save adjacent models such as releases and environments to postgres and write the event into eventstream. From there it will be picked up by Snuba and post-processing. We re-insert events with duplicate IDs into Snuba, which is responsible for deduplicating events. Since deduplication in Snuba is on the primary key (based on event ID, project ID and day), events with same IDs are only deduplicated if their timestamps fall on the same day. The latest event always wins and overwrites the value of events received earlier in that day. Since we increment counters and frequencies here before events get inserted to eventstream these numbers may be larger than the total number of events if we receive duplicate event IDs that fall on the same day (that do not hit cache first). """ # Normalize if needed if not self._normalized: if not assume_normalized: self.normalize() self._normalized = True with metrics.timer("event_manager.save.project.get_from_cache"): project = Project.objects.get_from_cache(id=project_id) with metrics.timer("event_manager.save.organization.get_from_cache"): project._organization_cache = Organization.objects.get_from_cache( id=project.organization_id) job = {"data": self._data, "project_id": project_id, "raw": raw} jobs = [job] projects = {project.id: project} _pull_out_data(jobs, projects) # Right now the event type is the signal to skip the group. This # is going to change a lot. if job["event"].get_event_type() == "transaction": issueless_event = True else: issueless_event = False _get_or_create_release_many(jobs, projects) # XXX: remove if job["dist"] and job["release"]: job["dist"] = job["release"].add_dist(job["dist"], job["event"].datetime) # dont allow a conflicting 'dist' tag pop_tag(job["data"], "dist") set_tag(job["data"], "sentry:dist", job["dist"].name) else: job["dist"] = None _get_event_user_many(jobs, projects) with metrics.timer("event_manager.load_grouping_config"): # At this point we want to normalize the in_app values in case the # clients did not set this appropriately so far. grouping_config = load_grouping_config( get_grouping_config_dict_for_event_data(job["data"], project)) with metrics.timer("event_manager.normalize_stacktraces_for_grouping"): normalize_stacktraces_for_grouping(job["data"], grouping_config) _derive_plugin_tags_many(jobs, projects) _derive_interface_tags_many(jobs) with metrics.timer("event_manager.apply_server_fingerprinting"): # The active grouping config was put into the event in the # normalize step before. We now also make sure that the # fingerprint was set to `'{{ default }}' just in case someone # removed it from the payload. The call to get_hashes will then # look at `grouping_config` to pick the right parameters. job["data"]["fingerprint"] = job["data"].get("fingerprint") or [ "{{ default }}" ] apply_server_fingerprinting( job["data"], get_fingerprinting_config_for_project(project)) with metrics.timer("event_manager.event.get_hashes"): # Here we try to use the grouping config that was requested in the # event. If that config has since been deleted (because it was an # experimental grouping config) we fall back to the default. 
try: hashes = job["event"].get_hashes() except GroupingConfigNotFound: job["data"][ "grouping_config"] = get_grouping_config_dict_for_project( project) hashes = job["event"].get_hashes() job["data"]["hashes"] = hashes _materialize_metadata_many(jobs) job["received_timestamp"] = received_timestamp = job["event"].data.get( "received") or float(job["event"].datetime.strftime("%s")) if not issueless_event: # The group gets the same metadata as the event when it's flushed but # additionally the `last_received` key is set. This key is used by # _save_aggregate. group_metadata = dict(job["materialized_metadata"]) group_metadata["last_received"] = received_timestamp kwargs = { "platform": job["platform"], "message": job["event"].search_message, "culprit": job["culprit"], "logger": job["logger_name"], "level": LOG_LEVELS_MAP.get(job["level"]), "last_seen": job["event"].datetime, "first_seen": job["event"].datetime, "active_at": job["event"].datetime, "data": group_metadata, } if job["release"]: kwargs["first_release"] = job["release"] try: job["group"], job["is_new"], job[ "is_regression"] = _save_aggregate(event=job["event"], hashes=hashes, release=job["release"], **kwargs) except HashDiscarded: event_discarded.send_robust(project=project, sender=EventManager) metrics.incr( "events.discarded", skip_internal=True, tags={ "organization_id": project.organization_id, "platform": job["platform"] }, ) raise job["event"].group = job["group"] else: job["group"] = None job["is_new"] = False job["is_regression"] = False _send_event_saved_signal_many(jobs, projects) # store a reference to the group id to guarantee validation of isolation # XXX(markus): No clue what this does job["event"].data.bind_ref(job["event"]) _get_or_create_environment_many(jobs, projects) if job["group"]: group_environment, job[ "is_new_group_environment"] = GroupEnvironment.get_or_create( group_id=job["group"].id, environment_id=job["environment"].id, defaults={"first_release": job["release"] or None}, ) else: job["is_new_group_environment"] = False _get_or_create_release_associated_models(jobs, projects) if job["release"] and job["group"]: job["grouprelease"] = GroupRelease.get_or_create( group=job["group"], release=job["release"], environment=job["environment"], datetime=job["event"].datetime, ) _tsdb_record_all_metrics(jobs) if job["group"]: UserReport.objects.filter(project=project, event_id=job["event"].event_id).update( group=job["group"], environment=job["environment"]) # Enusre the _metrics key exists. This is usually created during # and prefilled with ingestion sizes. event_metrics = job["event"].data.get("_metrics") or {} job["event"].data["_metrics"] = event_metrics # Capture the actual size that goes into node store. event_metrics["bytes.stored.event"] = len( json.dumps(dict(job["event"].data.items()))) if not issueless_event: # Load attachments first, but persist them at the very last after # posting to eventstream to make sure all counters and eventstream are # incremented for sure. 
attachments = get_attachments(cache_key, job["event"]) for attachment in attachments: key = "bytes.stored.%s" % (attachment.type, ) event_metrics[key] = (event_metrics.get(key) or 0) + len( attachment.data) _nodestore_save_many(jobs) if job["release"] and not issueless_event: if job["is_new"]: buffer.incr( ReleaseProject, {"new_groups": 1}, { "release_id": job["release"].id, "project_id": project.id }, ) if job["is_new_group_environment"]: buffer.incr( ReleaseProjectEnvironment, {"new_issues_count": 1}, { "project_id": project.id, "release_id": job["release"].id, "environment_id": job["environment"].id, }, ) if not raw: if not project.first_event: project.update(first_event=job["event"].datetime) first_event_received.send_robust(project=project, event=job["event"], sender=Project) _eventstream_insert_many(jobs) if not issueless_event: # Do this last to ensure signals get emitted even if connection to the # file store breaks temporarily. save_attachments(attachments, job["event"]) metric_tags = {"from_relay": "_relay_processed" in job["data"]} metrics.timing("events.latency", received_timestamp - job["recorded_timestamp"], tags=metric_tags) metrics.timing("events.size.data.post_save", job["event"].size, tags=metric_tags) metrics.incr( "events.post_save.normalize.errors", amount=len(job["data"].get("errors") or ()), tags=metric_tags, ) self._data = job["event"].data.data return job["event"]
def _query(self, project, retention_window_start, group_queryset, tags, environment, sort_by, limit, cursor, count_hits, paginator_options, **parameters): # TODO: Product decision: we currently search Group.message to handle # the `query` parameter, because that's what we've always done. We could # do that search against every event in Snuba instead, but results may # differ. now = timezone.now() end = parameters.get('date_to') or (now + ALLOWED_FUTURE_DELTA) # TODO: Presumably we want to search back to the project's full retention, # which may be higher than 90 days in the future, but apparently # `retention_window_start` can be None? start = max( filter(None, [ retention_window_start, parameters.get('date_from'), now - timedelta(days=90) ]) ) assert start < end # TODO: It's possible `first_release` could be handled by Snuba. if environment is not None: group_queryset = ds.QuerySetBuilder({ 'first_release': ds.CallbackCondition( lambda queryset, version: queryset.extra( where=[ '{} = {}'.format( ds.get_sql_column(GroupEnvironment, 'first_release_id'), ds.get_sql_column(Release, 'id'), ), '{} = %s'.format( ds.get_sql_column(Release, 'organization'), ), '{} = %s'.format( ds.get_sql_column(Release, 'version'), ), ], params=[project.organization_id, version], tables=[Release._meta.db_table], ), ), }).build( group_queryset.extra( where=[ u'{} = {}'.format( ds.get_sql_column(Group, 'id'), ds.get_sql_column(GroupEnvironment, 'group_id'), ), u'{} = %s'.format( ds.get_sql_column(GroupEnvironment, 'environment_id'), ), ], params=[environment.id], tables=[GroupEnvironment._meta.db_table], ), parameters, ) else: group_queryset = ds.QuerySetBuilder({ 'first_release': ds.CallbackCondition( lambda queryset, version: queryset.filter( first_release__organization_id=project.organization_id, first_release__version=version, ), ), }).build( group_queryset, parameters, ) # pre-filter query candidate_hashes = dict( GroupHash.objects.filter( group__in=group_queryset ).values_list( 'hash', 'group_id' )[:MAX_PRE_SNUBA_CANDIDATES + 1] ) metrics.timing('snuba.search.num_candidates', len(candidate_hashes)) if not candidate_hashes: # no matches could possibly be found from this point on metrics.incr('snuba.search.no_candidates') return Paginator(Group.objects.none()).get_result() elif len(candidate_hashes) > MAX_PRE_SNUBA_CANDIDATES: # If the pre-filter query didn't include anything to significantly # filter down the number of results (from 'first_release', 'query', # 'status', 'bookmarked_by', 'assigned_to', 'unassigned', # 'subscribed_by', 'active_at_from', or 'active_at_to') then it # might have surpassed the MAX_PRE_SNUBA_CANDIDATES. In this case, # we *don't* want to pass candidates down to Snuba, and instead we # want Snuba to do all the filtering/sorting it can and *then* apply # this queryset to the results from Snuba, which we call # post-filtering. 
metrics.incr('snuba.search.too_many_candidates') candidate_hashes = None sort, extra_aggregations, score_fn = sort_strategies[sort_by] # {group_id: group_score, ...} snuba_groups = snuba_search( project_id=project.id, environment_id=environment and environment.id, tags=tags, start=start, end=end, sort=sort, extra_aggregations=extra_aggregations, score_fn=score_fn, candidate_hashes=candidate_hashes, **parameters ) metrics.timing('snuba.search.num_snuba_results', len(snuba_groups)) if candidate_hashes: # pre-filtered candidates were passed down to Snuba, # so we're finished with filtering result_groups = snuba_groups.items() else: # pre-filtered candidates were *not* passed down to Snuba, # so we need to do post-filtering to verify Sentry DB predicates result_groups = [] i = 0 for i, chunk in enumerate(chunked(snuba_groups.items(), MAX_POST_SNUBA_CHUNK), 1): filtered_group_ids = group_queryset.filter( id__in=[gid for gid, _ in chunk] ).values_list('id', flat=True) result_groups.extend( (group_id, snuba_groups[group_id]) for group_id in filtered_group_ids ) metrics.timing('snuba.search.num_post_filters', i) paginator_results = SequencePaginator( [(score, id) for (id, score) in result_groups], reverse=True, **paginator_options ).get_result(limit, cursor, count_hits=count_hits) groups = Group.objects.in_bulk(paginator_results.results) paginator_results.results = [groups[k] for k in paginator_results.results if k in groups] return paginator_results
def snuba_search( start, end, project_ids, environment_ids, sort_field, cursor=None, candidate_ids=None, limit=None, offset=0, get_sample=False, search_filters=None, ): """ This function doesn't strictly benefit from or require being pulled out of the main query method above, but the query method is already large and this function at least extracts most of the Snuba-specific logic. Returns a tuple of: * a sorted list of (group_id, group_score) tuples sorted descending by score, * the count of total results (rows) available for this query. """ filters = {"project_id": project_ids} if environment_ids is not None: filters["environment"] = environment_ids if candidate_ids: filters["group_id"] = sorted(candidate_ids) conditions = [] having = [] for search_filter in search_filters: if ( # Don't filter on issue fields here, they're not available search_filter.key.name in issue_only_fields or # We special case date search_filter.key.name == "date"): continue converted_filter = convert_search_filter_to_snuba_query(search_filter) # Ensure that no user-generated tags that clashes with aggregation_defs is added to having if search_filter.key.name in aggregation_defs and not search_filter.key.is_tag: having.append(converted_filter) else: conditions.append(converted_filter) extra_aggregations = dependency_aggregations.get(sort_field, []) required_aggregations = set([sort_field, "total"] + extra_aggregations) for h in having: alias = h[0] required_aggregations.add(alias) aggregations = [] for alias in required_aggregations: aggregations.append(aggregation_defs[alias] + [alias]) if cursor is not None: having.append( (sort_field, ">=" if cursor.is_prev else "<=", cursor.value)) selected_columns = [] if get_sample: query_hash = md5(repr(conditions)).hexdigest()[:8] selected_columns.append( ("cityHash64", ("'{}'".format(query_hash), "group_id"), "sample")) sort_field = "sample" orderby = [sort_field] referrer = "search_sample" else: # Get the top matching groups by score, i.e. the actual search results # in the order that we want them. orderby = ["-{}".format(sort_field), "group_id"] # ensure stable sort within the same score referrer = "search" snuba_results = snuba.dataset_query( dataset=Dataset.Events, start=start, end=end, selected_columns=selected_columns, groupby=["group_id"], conditions=conditions, having=having, filter_keys=filters, aggregations=aggregations, orderby=orderby, referrer=referrer, limit=limit, offset=offset, totals= True, # Needs to have totals_mode=after_having_exclusive so we get groups matching HAVING only turbo=get_sample, # Turn off FINAL when in sampling mode sample=1, # Don't use clickhouse sampling, even when in turbo mode. ) rows = snuba_results["data"] total = snuba_results["totals"]["total"] if not get_sample: metrics.timing("snuba.search.num_result_groups", len(rows)) return [(row["group_id"], row[sort_field]) for row in rows], total
def assemble_download( data_export_id, export_limit=EXPORTED_ROWS_LIMIT, batch_size=SNUBA_MAX_RESULTS, offset=0, bytes_written=0, environment_id=None, **kwargs ): with sentry_sdk.start_transaction( op="task.data_export.assemble", name="DataExportAssemble", sampled=True, ): first_page = offset == 0 try: if first_page: logger.info("dataexport.start", extra={"data_export_id": data_export_id}) data_export = ExportedData.objects.get(id=data_export_id) if first_page: metrics.incr("dataexport.start", tags={"success": True}, sample_rate=1.0) logger.info( "dataexport.run", extra={"data_export_id": data_export_id, "offset": offset} ) except ExportedData.DoesNotExist as error: if first_page: metrics.incr("dataexport.start", tags={"success": False}, sample_rate=1.0) logger.exception(error) return with sentry_sdk.configure_scope() as scope: if data_export.user: user = {} if data_export.user.id: user["id"] = data_export.user.id if data_export.user.username: user["username"] = data_export.user.username if data_export.user.email: user["email"] = data_export.user.email scope.user = user scope.set_tag("organization.slug", data_export.organization.slug) scope.set_tag("export.type", ExportQueryType.as_str(data_export.query_type)) scope.set_extra("export.query", data_export.query_info) try: # ensure that the export limit is set and capped at EXPORTED_ROWS_LIMIT if export_limit is None: export_limit = EXPORTED_ROWS_LIMIT else: export_limit = min(export_limit, EXPORTED_ROWS_LIMIT) processor = get_processor(data_export, environment_id) with tempfile.TemporaryFile(mode="w+b") as tf: # XXX(python3): # # In python2 land we write utf-8 encoded strings as bytes via # the csv writer (see convert_to_utf8). The CSV writer will # ONLY write bytes, even if you give it unicode it will convert # it to bytes. # # In python3 we write unicode strings (which is all the csv # module is able to do, it will NOT write bytes like in py2). # Because of this we use the codec getwriter to transform our # file handle to a stream writer that will encode to utf8. 
if six.PY2: tfw = tf else: tfw = codecs.getwriter("utf-8")(tf) writer = csv.DictWriter(tfw, processor.header_fields, extrasaction="ignore") if first_page: writer.writeheader() # the position in the file at the end of the headers starting_pos = tf.tell() # the row offset relative to the start of the current task # this offset tells you the number of rows written during this batch fragment fragment_offset = 0 # the absolute row offset from the beginning of the export next_offset = offset + fragment_offset while True: # the number of rows to export in the next batch fragment fragment_row_count = min(batch_size, max(export_limit - next_offset, 1)) rows = process_rows(processor, data_export, fragment_row_count, next_offset) writer.writerows(rows) fragment_offset += len(rows) next_offset = offset + fragment_offset if ( not rows or len(rows) < batch_size # the batch may exceed MAX_BATCH_SIZE but immediately stops or tf.tell() - starting_pos >= MAX_BATCH_SIZE ): break tf.seek(0) new_bytes_written = store_export_chunk_as_blob(data_export, bytes_written, tf) bytes_written += new_bytes_written except ExportError as error: return data_export.email_failure(message=six.text_type(error)) except Exception as error: metrics.incr("dataexport.error", tags={"error": six.text_type(error)}, sample_rate=1.0) logger.error( "dataexport.error: %s", six.text_type(error), extra={"query": data_export.payload, "org": data_export.organization_id}, ) capture_exception(error) try: current.retry() except MaxRetriesExceededError: metrics.incr( "dataexport.end", tags={"success": False, "error": six.text_type(error)}, sample_rate=1.0, ) return data_export.email_failure(message="Internal processing failure") else: if ( rows and len(rows) >= batch_size and new_bytes_written and next_offset < export_limit ): assemble_download.delay( data_export_id, export_limit=export_limit, batch_size=batch_size, offset=next_offset, bytes_written=bytes_written, environment_id=environment_id, ) else: metrics.timing("dataexport.row_count", next_offset, sample_rate=1.0) metrics.timing("dataexport.file_size", bytes_written, sample_rate=1.0) merge_export_blobs.delay(data_export_id)
def merge_export_blobs(data_export_id, **kwargs): with sentry_sdk.start_transaction( op="task.data_export.merge", name="DataExportMerge", sampled=True, ): try: data_export = ExportedData.objects.get(id=data_export_id) except ExportedData.DoesNotExist as error: logger.exception(error) return with sentry_sdk.configure_scope() as scope: if data_export.user: user = {} if data_export.user.id: user["id"] = data_export.user.id if data_export.user.username: user["username"] = data_export.user.username if data_export.user.email: user["email"] = data_export.user.email scope.user = user scope.user = user scope.set_tag("organization.slug", data_export.organization.slug) scope.set_tag("export.type", ExportQueryType.as_str(data_export.query_type)) scope.set_extra("export.query", data_export.query_info) # adapted from `putfile` in `src/sentry/models/file.py` try: with transaction.atomic(): file = File.objects.create( name=data_export.file_name, type="export.csv", headers={"Content-Type": "text/csv"}, ) size = 0 file_checksum = sha1(b"") for export_blob in ExportedDataBlob.objects.filter( data_export=data_export ).order_by("offset"): blob = export_blob.blob FileBlobIndex.objects.create(file=file, blob=blob, offset=size) size += blob.size blob_checksum = sha1(b"") for chunk in blob.getfile().chunks(): blob_checksum.update(chunk) file_checksum.update(chunk) if blob.checksum != blob_checksum.hexdigest(): raise AssembleChecksumMismatch("Checksum mismatch") file.size = size file.checksum = file_checksum.hexdigest() file.save() data_export.finalize_upload(file=file) time_elapsed = (timezone.now() - data_export.date_added).total_seconds() metrics.timing("dataexport.duration", time_elapsed, sample_rate=1.0) logger.info("dataexport.end", extra={"data_export_id": data_export_id}) metrics.incr("dataexport.end", tags={"success": True}, sample_rate=1.0) except Exception as error: metrics.incr("dataexport.error", tags={"error": six.text_type(error)}, sample_rate=1.0) metrics.incr( "dataexport.end", tags={"success": False, "error": six.text_type(error)}, sample_rate=1.0, ) logger.error( "dataexport.error: %s", six.text_type(error), extra={"query": data_export.payload, "org": data_export.organization_id}, ) capture_exception(error) if isinstance(error, IntegrityError): message = "Failed to save the assembled file." else: message = "Internal processing failure." return data_export.email_failure(message=message)
def track_memory_usage(metric, **kwargs):
    before = get_rss_usage()
    try:
        yield
    finally:
        metrics.timing(metric, get_rss_usage() - before, **kwargs)
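# Illustrative sketch (not from the original source): `track_memory_usage` above is a
# generator intended to be used as a context manager (assumption: it is wrapped with
# `contextlib.contextmanager` where it is defined, which this excerpt does not show).
# A self-contained equivalent using the standard library for the RSS reading:
import resource
from contextlib import contextmanager


def get_peak_rss():
    # ru_maxrss is the *peak* RSS (kilobytes on Linux, bytes on macOS), so the delta
    # below only captures growth in the peak, not transient usage
    return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss


@contextmanager
def track_rss_delta(report):
    before = get_peak_rss()
    try:
        yield
    finally:
        report(get_peak_rss() - before)


# e.g. with track_rss_delta(lambda delta: print("rss delta:", delta)): do_work()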
def cleanup(days, project, concurrency, max_procs, silent, model, router, timed): """Delete a portion of trailing data based on creation date. All data that is older than `--days` will be deleted. The default for this is 30 days. In the default setting all projects will be truncated but if you have a specific project you want to limit this to this can be done with the `--project` flag which accepts a project ID or a string with the form `org/project` where both are slugs. """ if concurrency < 1: click.echo('Error: Minimum concurrency is 1', err=True) raise click.Abort() import math import multiprocessing import pickle import subprocess import sys from django.db import router as db_router from sentry.app import nodestore from sentry.db.deletion import BulkDeleteQuery from sentry import models if timed: import time from sentry.utils import metrics start_time = time.time() # list of models which this query is restricted to model_list = {m.lower() for m in model} def is_filtered(model): if router is not None and db_router.db_for_write(model) != router: return True if not model_list: return False return model.__name__.lower() not in model_list # Deletions that use `BulkDeleteQuery` (and don't need to worry about child relations) # (model, datetime_field, order_by) BULK_QUERY_DELETES = [ (models.EventMapping, 'date_added', '-date_added'), (models.GroupHashTombstone, 'deleted_at', None), (models.GroupEmailThread, 'date', None), (models.GroupRuleStatus, 'date_added', None), ] + EXTRA_BULK_QUERY_DELETES # Deletions that use the `deletions` code path (which handles their child relations) # (model, datetime_field, order_by) DELETES = ( (models.Event, 'datetime', 'datetime'), (models.Group, 'last_seen', 'last_seen'), ) if not silent: click.echo('Removing expired values for LostPasswordHash') if is_filtered(models.LostPasswordHash): if not silent: click.echo('>> Skipping LostPasswordHash') else: models.LostPasswordHash.objects.filter(date_added__lte=timezone.now() - timedelta(hours=48)).delete() for model in [models.ApiGrant, models.ApiToken]: if not silent: click.echo('Removing expired values for {}'.format(model.__name__)) if is_filtered(model): if not silent: click.echo('>> Skipping {}'.format(model.__name__)) else: model.objects.filter(expires_at__lt=timezone.now()).delete() project_id = None if project: click.echo( "Bulk NodeStore deletion not available for project selection", err=True) project_id = get_project(project) if project_id is None: click.echo('Error: Project not found', err=True) raise click.Abort() else: if not silent: click.echo("Removing old NodeStore values") cutoff = timezone.now() - timedelta(days=days) try: nodestore.cleanup(cutoff) except NotImplementedError: click.echo("NodeStore backend does not support cleanup operation", err=True) for bqd in BULK_QUERY_DELETES: if len(bqd) == 4: model, dtfield, order_by, chunk_size = bqd else: chunk_size = 10000 model, dtfield, order_by = bqd if not silent: click.echo( "Removing {model} for days={days} project={project}".format( model=model.__name__, days=days, project=project or '*', )) if is_filtered(model): if not silent: click.echo('>> Skipping %s' % model.__name__) else: BulkDeleteQuery( model=model, dtfield=dtfield, days=days, project_id=project_id, order_by=order_by, ).execute(chunk_size=chunk_size) for model, dtfield, order_by in DELETES: if not silent: click.echo( "Removing {model} for days={days} project={project}".format( model=model.__name__, days=days, project=project or '*', )) if is_filtered(model): if not silent: 
click.echo('>> Skipping %s' % model.__name__) else: if concurrency > 1: shard_ids = range(concurrency) num_procs = min(multiprocessing.cpu_count(), max_procs) threads_per_proc = int( math.ceil(concurrency / float(num_procs))) pids = [] for shard_id_chunk in chunker(shard_ids, threads_per_proc): pid = subprocess.Popen([ sys.argv[0], 'cleanup_chunk', '--days', six.binary_type(days), ] + ( ['--project_id', six.binary_type(project_id)] if project_id else [] ) + [ '--model', pickle.dumps(model), '--dtfield', dtfield, '--order_by', order_by, '--num_shards', six.binary_type(concurrency), '--shard_ids', ",".join([six.binary_type(s) for s in shard_id_chunk]), ]) pids.append(pid) total_pid_count = len(pids) click.echo( "%s concurrent processes forked, waiting on them to complete." % total_pid_count) complete = 0 for pid in pids: pid.wait() complete += 1 click.echo("%s/%s concurrent processes are finished." % (complete, total_pid_count)) else: task = create_deletion_task(days, project_id, model, dtfield, order_by) _chunk_until_complete(task) # Clean up FileBlob instances which are no longer used and aren't super # recent (as there could be a race between blob creation and reference) if not silent: click.echo("Cleaning up unused FileBlob references") if is_filtered(models.FileBlob): if not silent: click.echo('>> Skipping FileBlob') else: cleanup_unused_files(silent) if timed: duration = int(time.time() - start_time) metrics.timing('cleanup.duration', duration, instance=router) click.echo("Clean up took %s second(s)." % duration)
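# Illustrative sketch (not from the original source): the concurrency branch above
# splits shard ids into per-process groups with a `chunker` helper that is not shown
# in this excerpt. A minimal equivalent that yields successive fixed-size slices:
def chunker(sequence, size):
    sequence = list(sequence)
    for start in range(0, len(sequence), size):
        yield sequence[start:start + size]


# e.g. list(chunker(range(10), 4)) -> [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]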
def snuba_search(project_id, environment_id, tags, start, end, sort, extra_aggregations, score_fn, candidate_hashes, **parameters): """ This function doesn't strictly benefit from or require being pulled out of the main query method above, but the query method is already large and this function at least extracts most of the Snuba-specific logic. Returns an OrderedDict of {group_id: group_score, ...} sorted descending by score. """ from sentry.search.base import ANY filters = { 'project_id': [project_id], } if environment_id is not None: filters['environment'] = [environment_id] if candidate_hashes is not None: filters['primary_hash'] = candidate_hashes.keys() having = SnubaConditionBuilder({ 'age_from': ScalarCondition('first_seen', '>'), 'age_to': ScalarCondition('first_seen', '<'), 'last_seen_from': ScalarCondition('last_seen', '>'), 'last_seen_to': ScalarCondition('last_seen', '<'), 'times_seen': CallbackCondition( lambda times_seen: ('times_seen', '=', times_seen), ), 'times_seen_lower': ScalarCondition('times_seen', '>'), 'times_seen_upper': ScalarCondition('times_seen', '<'), }).build(parameters) conditions = [] for tag, val in six.iteritems(tags): col = u'tags[{}]'.format(tag) if val == ANY: conditions.append((col, '!=', '')) else: conditions.append((col, '=', val)) required_aggregations = set([sort] + extra_aggregations) for h in having: alias = h[0] required_aggregations.add(alias) aggregations = [] for alias in required_aggregations: aggregations.append(aggregation_defs[alias] + [alias]) # {hash -> {<agg_alias> -> <agg_value>, # <agg_alias> -> <agg_value>, # ...}, # ...} # _OR_ if there's only one <agg_alias> in use # {hash -> <agg_value>, # ...} snuba_results = snuba.query( start=start, end=end, groupby=['primary_hash'], conditions=conditions, having=having, filter_keys=filters, aggregations=aggregations, orderby='-' + sort, referrer='search', ) metrics.timing('snuba.search.num_result_hashes', len(snuba_results.keys())) # {hash -> group_id, ...} if candidate_hashes is not None: # any hash coming back had to come from our candidate set hash_to_group = candidate_hashes else: hash_to_group = dict( GroupHash.objects.filter( project_id=project_id, hash__in=snuba_results.keys() ).values_list( 'hash', 'group_id' ) ) # {group_id -> {field1: [...all values from field1 for all hashes...], # field2: [...all values from field2 for all hashes...] # ...} # ...} group_data = {} for hash, obj in snuba_results.items(): if hash in hash_to_group: group_id = hash_to_group[hash] if group_id not in group_data: group_data[group_id] = defaultdict(list) dest = group_data[group_id] # NOTE: The Snuba utility code is trying to be helpful by collapsing # results with only one aggregate down to the single value. It's a # bit of a hack that we then immediately undo that work here, but # many other callers get value out of that functionality. If we see # this pattern again we should either add an option to opt-out of # the 'help' here or remove it from the Snuba code altogether. if len(required_aggregations) == 1: alias = list(required_aggregations)[0] dest[alias].append(obj) else: for k, v in obj.items(): dest[k].append(v) else: logger.warning( 'search.hash_not_found', extra={ 'project_id': project_id, 'hash': hash, }, ) return OrderedDict( sorted(((gid, score_fn(data)) for gid, data in group_data.items()), key=lambda t: t[1], reverse=True) )
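# Illustrative sketch (not from the original source): the return statement above builds
# an OrderedDict of {group_id: score} sorted by score, highest first. The same idea on
# plain data, with a trivial stand-in score function:
from collections import OrderedDict


def rank_groups(group_data, score_fn):
    return OrderedDict(
        sorted(
            ((group_id, score_fn(data)) for group_id, data in group_data.items()),
            key=lambda item: item[1],
            reverse=True,
        )
    )


# e.g. rank_groups({1: [3, 4], 2: [10]}, score_fn=max) -> OrderedDict([(2, 10), (1, 4)])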
def save(self, project, raw=False): from sentry.tasks.post_process import index_event_tags data = self.data project = Project.objects.get_from_cache(id=project) # Check to make sure we're not about to do a bunch of work that's # already been done if we've processed an event with this ID. (This # isn't a perfect solution -- this doesn't handle ``EventMapping`` and # there's a race condition between here and when the event is actually # saved, but it's an improvement. See GH-7677.) try: event = Event.objects.get( project_id=project.id, event_id=data['event_id'], ) except Event.DoesNotExist: pass else: self.logger.info( 'duplicate.found', exc_info=True, extra={ 'event_uuid': data['event_id'], 'project_id': project.id, 'model': Event.__name__, } ) return event # First we pull out our top-level (non-data attr) kwargs event_id = data.pop('event_id') level = data.pop('level') transaction_name = data.pop('transaction', None) culprit = data.pop('culprit', None) logger_name = data.pop('logger', None) server_name = data.pop('server_name', None) site = data.pop('site', None) checksum = data.pop('checksum', None) fingerprint = data.pop('fingerprint', None) platform = data.pop('platform', None) release = data.pop('release', None) dist = data.pop('dist', None) environment = data.pop('environment', None) # unused time_spent = data.pop('time_spent', None) message = data.pop('message', '') if not culprit: if transaction_name: culprit = transaction_name else: culprit = generate_culprit(data, platform=platform) culprit = force_text(culprit) if transaction_name: transaction_name = force_text(transaction_name) recorded_timestamp = data.pop('timestamp') date = datetime.fromtimestamp(recorded_timestamp) date = date.replace(tzinfo=timezone.utc) kwargs = { 'platform': platform, } event = Event( project_id=project.id, event_id=event_id, data=data, time_spent=time_spent, datetime=date, **kwargs ) event._project_cache = project data = event.data.data # convert this to a dict to ensure we're only storing one value per key # as most parts of Sentry dont currently play well with multiple values tags = dict(data.get('tags') or []) tags['level'] = LOG_LEVELS[level] if logger_name: tags['logger'] = logger_name if server_name: tags['server_name'] = server_name if site: tags['site'] = site if environment: tags['environment'] = environment if transaction_name: tags['transaction'] = transaction_name if release: # dont allow a conflicting 'release' tag if 'release' in tags: del tags['release'] release = Release.get_or_create( project=project, version=release, date_added=date, ) tags['sentry:release'] = release.version if dist and release: dist = release.add_dist(dist, date) tags['sentry:dist'] = dist.name else: dist = None event_user = self._get_event_user(project, data) if event_user: # dont allow a conflicting 'user' tag if 'user' in tags: del tags['user'] tags['sentry:user'] = event_user.tag_value # At this point we want to normalize the in_app values in case the # clients did not set this appropriately so far. 
normalize_in_app(data) for plugin in plugins.for_project(project, version=None): added_tags = safe_execute(plugin.get_tags, event, _with_transaction=False) if added_tags: # plugins should not override user provided tags for key, value in added_tags: tags.setdefault(key, value) for path, iface in six.iteritems(event.interfaces): for k, v in iface.iter_tags(): tags[k] = v # Get rid of ephemeral interface data if iface.ephemeral: data.pop(iface.get_path(), None) # tags are stored as a tuple tags = tags.items() data['tags'] = tags data['fingerprint'] = fingerprint or ['{{ default }}'] # prioritize fingerprint over checksum as its likely the client defaulted # a checksum whereas the fingerprint was explicit if fingerprint: hashes = [md5_from_hash(h) for h in get_hashes_from_fingerprint(event, fingerprint)] elif checksum: if HASH_RE.match(checksum): hashes = [checksum] else: hashes = [md5_from_hash([checksum]), checksum] data['checksum'] = checksum else: hashes = [md5_from_hash(h) for h in get_hashes_for_event(event)] # TODO(dcramer): temp workaround for complexity data['message'] = message event_type = eventtypes.get(data.get('type', 'default'))(data) event_metadata = event_type.get_metadata() # TODO(dcramer): temp workaround for complexity del data['message'] data['type'] = event_type.key data['metadata'] = event_metadata # index components into ``Event.message`` # See GH-3248 if event_type.key != 'default': if 'sentry.interfaces.Message' in data and \ data['sentry.interfaces.Message']['message'] != message: message = u'{} {}'.format( message, data['sentry.interfaces.Message']['message'], ) if not message: message = '' elif not isinstance(message, six.string_types): message = force_text(message) for value in six.itervalues(event_metadata): value_u = force_text(value, errors='replace') if value_u not in message: message = u'{} {}'.format(message, value_u) if culprit and culprit not in message: culprit_u = force_text(culprit, errors='replace') message = u'{} {}'.format(message, culprit_u) message = trim(message.strip(), settings.SENTRY_MAX_MESSAGE_LENGTH) event.message = message kwargs['message'] = message received_timestamp = event.data.get('received') or float(event.datetime.strftime('%s')) group_kwargs = kwargs.copy() group_kwargs.update( { 'culprit': culprit, 'logger': logger_name, 'level': level, 'last_seen': date, 'first_seen': date, 'active_at': date, 'data': { 'last_received': received_timestamp, 'type': event_type.key, # we cache the events metadata on the group to ensure its # accessible in the stream 'metadata': event_metadata, }, } ) if release: group_kwargs['first_release'] = release try: group, is_new, is_regression, is_sample = self._save_aggregate( event=event, hashes=hashes, release=release, **group_kwargs ) except HashDiscarded: event_discarded.send_robust( project=project, sender=EventManager, ) metrics.incr( 'events.discarded', skip_internal=True, tags={ 'organization_id': project.organization_id, 'platform': platform, }, ) raise else: event_saved.send_robust( project=project, event_size=event.size, sender=EventManager, ) event.group = group # store a reference to the group id to guarantee validation of isolation event.data.bind_ref(event) # When an event was sampled, the canonical source of truth # is the EventMapping table since we aren't going to be writing out an actual # Event row. Otherwise, if the Event isn't being sampled, we can safely # rely on the Event table itself as the source of truth and ignore # EventMapping since it's redundant information. 
if is_sample: try: with transaction.atomic(using=router.db_for_write(EventMapping)): EventMapping.objects.create(project=project, group=group, event_id=event_id) except IntegrityError: self.logger.info( 'duplicate.found', exc_info=True, extra={ 'event_uuid': event_id, 'project_id': project.id, 'group_id': group.id, 'model': EventMapping.__name__, } ) return event environment = Environment.get_or_create( project=project, name=environment, ) group_environment, is_new_group_environment = GroupEnvironment.get_or_create( group_id=group.id, environment_id=environment.id, defaults={ 'first_release_id': release.id if release else None, }, ) if release: ReleaseEnvironment.get_or_create( project=project, release=release, environment=environment, datetime=date, ) ReleaseProjectEnvironment.get_or_create( project=project, release=release, environment=environment, datetime=date, ) grouprelease = GroupRelease.get_or_create( group=group, release=release, environment=environment, datetime=date, ) counters = [ (tsdb.models.group, group.id), (tsdb.models.project, project.id), ] if release: counters.append((tsdb.models.release, release.id)) tsdb.incr_multi(counters, timestamp=event.datetime, environment_id=environment.id) frequencies = [ # (tsdb.models.frequent_projects_by_organization, { # project.organization_id: { # project.id: 1, # }, # }), # (tsdb.models.frequent_issues_by_project, { # project.id: { # group.id: 1, # }, # }) (tsdb.models.frequent_environments_by_group, { group.id: { environment.id: 1, }, }) ] if release: frequencies.append( (tsdb.models.frequent_releases_by_group, { group.id: { grouprelease.id: 1, }, }) ) tsdb.record_frequency_multi(frequencies, timestamp=event.datetime) UserReport.objects.filter( project=project, event_id=event_id, ).update( group=group, environment=environment, ) # save the event unless its been sampled if not is_sample: try: with transaction.atomic(using=router.db_for_write(Event)): event.save() except IntegrityError: self.logger.info( 'duplicate.found', exc_info=True, extra={ 'event_uuid': event_id, 'project_id': project.id, 'group_id': group.id, 'model': Event.__name__, } ) return event index_event_tags.delay( organization_id=project.organization_id, project_id=project.id, group_id=group.id, environment_id=environment.id, event_id=event.id, tags=tags, date_added=event.datetime, ) if event_user: tsdb.record_multi( ( (tsdb.models.users_affected_by_group, group.id, (event_user.tag_value, )), (tsdb.models.users_affected_by_project, project.id, (event_user.tag_value, )), ), timestamp=event.datetime, environment_id=environment.id, ) if release: if is_new: buffer.incr( ReleaseProject, {'new_groups': 1}, { 'release_id': release.id, 'project_id': project.id, } ) if is_new_group_environment: buffer.incr( ReleaseProjectEnvironment, {'new_issues_count': 1}, { 'project_id': project.id, 'release_id': release.id, 'environment_id': environment.id, } ) safe_execute(Group.objects.add_tags, group, environment, tags, _with_transaction=False) if not raw: if not project.first_event: project.update(first_event=date) first_event_received.send_robust(project=project, group=group, sender=Project) eventstream.insert( group=group, event=event, is_new=is_new, is_sample=is_sample, is_regression=is_regression, is_new_group_environment=is_new_group_environment, primary_hash=hashes[0], # We are choosing to skip consuming the event back # in the eventstream if it's flagged as raw. 
# This means that we want to publish the event # through the event stream, but we don't care # about post processing and handling the commit. skip_consume=raw, ) metrics.timing( 'events.latency', received_timestamp - recorded_timestamp, tags={ 'project_id': project.id, }, ) return event
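# Illustrative sketch (not from the original source): tag precedence in the legacy
# `save` above -- client-supplied tags are collapsed into a dict (one value per key),
# plugin tags only fill gaps (`setdefault`), and interface-derived tags override.
# `client_tags`, `plugin_tags`, and `interface_tags` are hypothetical (key, value)
# iterables standing in for the event data, plugin results, and interface iterators.
def merge_tags(client_tags, plugin_tags, interface_tags):
    tags = dict(client_tags or [])      # last client value per key wins
    for key, value in plugin_tags:
        tags.setdefault(key, value)     # plugins must not override client tags
    for key, value in interface_tags:
        tags[key] = value               # interface tags (e.g. user, http) do override
    return list(tags.items())           # stored as a sequence of pairs, as above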
def query( self, projects, retention_window_start, group_queryset, environments, sort_by, limit, cursor, count_hits, paginator_options, search_filters, date_from, date_to, ): now = timezone.now() end = None end_params = filter( None, [date_to, get_search_filter(search_filters, "date", "<")]) if end_params: end = min(end_params) if not end: end = now + ALLOWED_FUTURE_DELTA # This search is for some time window that ends with "now", # so if the requested sort is `date` (`last_seen`) and there # are no other Snuba-based search predicates, we can simply # return the results from Postgres. if (cursor is None and sort_by == "date" and not environments and # This handles tags and date parameters for search filters. not [ sf for sf in search_filters if sf.key.name not in issue_only_fields.union(["date"]) ]): group_queryset = group_queryset.order_by("-last_seen") paginator = DateTimePaginator(group_queryset, "-last_seen", **paginator_options) # When its a simple django-only search, we count_hits like normal return paginator.get_result(limit, cursor, count_hits=count_hits) # TODO: Presumably we only want to search back to the project's max # retention date, which may be closer than 90 days in the past, but # apparently `retention_window_start` can be None(?), so we need a # fallback. retention_date = max( filter(None, [retention_window_start, now - timedelta(days=90)])) # TODO: We should try and consolidate all this logic together a little # better, maybe outside the backend. Should be easier once we're on # just the new search filters start_params = [ date_from, retention_date, get_search_filter(search_filters, "date", ">") ] start = max(filter(None, start_params)) end = max([retention_date, end]) if start == retention_date and end == retention_date: # Both `start` and `end` must have been trimmed to `retention_date`, # so this entire search was against a time range that is outside of # retention. We'll return empty results to maintain backwards compatibility # with Django search (for now). return EMPTY_RESULT if start >= end: # TODO: This maintains backwards compatibility with Django search, but # in the future we should find a way to notify the user that their search # is invalid. return EMPTY_RESULT # Here we check if all the django filters reduce the set of groups down # to something that we can send down to Snuba in a `group_id IN (...)` # clause. max_candidates = options.get("snuba.search.max-pre-snuba-candidates") too_many_candidates = False candidate_ids = list( group_queryset.values_list("id", flat=True)[:max_candidates + 1]) metrics.timing("snuba.search.num_candidates", len(candidate_ids)) if not candidate_ids: # no matches could possibly be found from this point on metrics.incr("snuba.search.no_candidates", skip_internal=False) return EMPTY_RESULT elif len(candidate_ids) > max_candidates: # If the pre-filter query didn't include anything to significantly # filter down the number of results (from 'first_release', 'query', # 'status', 'bookmarked_by', 'assigned_to', 'unassigned', # 'subscribed_by', 'active_at_from', or 'active_at_to') then it # might have surpassed the `max_candidates`. In this case, # we *don't* want to pass candidates down to Snuba, and instead we # want Snuba to do all the filtering/sorting it can and *then* apply # this queryset to the results from Snuba, which we call # post-filtering. 
metrics.incr("snuba.search.too_many_candidates", skip_internal=False) too_many_candidates = True candidate_ids = [] sort_field = sort_strategies[sort_by] chunk_growth = options.get("snuba.search.chunk-growth-rate") max_chunk_size = options.get("snuba.search.max-chunk-size") chunk_limit = limit offset = 0 num_chunks = 0 hits = None paginator_results = EMPTY_RESULT result_groups = [] result_group_ids = set() max_time = options.get("snuba.search.max-total-chunk-time-seconds") time_start = time.time() if count_hits and (too_many_candidates or cursor is not None): # If we had too many candidates to reasonably pass down to snuba, # or if we have a cursor that bisects the overall result set (such # that our query only sees results on one side of the cursor) then # we need an alternative way to figure out the total hits that this # query has. # To do this, we get a sample of groups matching the snuba side of # the query, and see how many of those pass the post-filter in # postgres. This should give us an estimate of the total number of # snuba matches that will be overall matches, which we can use to # get an estimate for X-Hits. # The sampling is not simple random sampling. It will return *all* # matching groups if there are less than N groups matching the # query, or it will return a random, deterministic subset of N of # the groups if there are more than N overall matches. This means # that the "estimate" is actually an accurate result when there are # less than N matching groups. # The number of samples required to achieve a certain error bound # with a certain confidence interval can be calculated from a # rearrangement of the normal approximation (Wald) confidence # interval formula: # # https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval # # Effectively if we want the estimate to be within +/- 10% of the # real value with 95% confidence, we would need (1.96^2 * p*(1-p)) # / 0.1^2 samples. With a starting assumption of p=0.5 (this # requires the most samples) we would need 96 samples to achieve # +/-10% @ 95% confidence. sample_size = options.get("snuba.search.hits-sample-size") snuba_groups, snuba_total = snuba_search( start=start, end=end, project_ids=[p.id for p in projects], environment_ids=environments and [environment.id for environment in environments], sort_field=sort_field, limit=sample_size, offset=0, get_sample=True, search_filters=search_filters, ) snuba_count = len(snuba_groups) if snuba_count == 0: return EMPTY_RESULT else: filtered_count = group_queryset.filter( id__in=[gid for gid, _ in snuba_groups]).count() hit_ratio = filtered_count / float(snuba_count) hits = int(hit_ratio * snuba_total) # Do smaller searches in chunks until we have enough results # to answer the query (or hit the end of possible results). We do # this because a common case for search is to return 100 groups # sorted by `last_seen`, and we want to avoid returning all of # a project's groups and then post-sorting them all in Postgres # when typically the first N results will do. 
while (time.time() - time_start) < max_time: num_chunks += 1 # grow the chunk size on each iteration to account for huge projects # and weird queries, up to a max size chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size) # but if we have candidate_ids always query for at least that many items chunk_limit = max(chunk_limit, len(candidate_ids)) # {group_id: group_score, ...} snuba_groups, total = snuba_search( start=start, end=end, project_ids=[p.id for p in projects], environment_ids=environments and [environment.id for environment in environments], sort_field=sort_field, cursor=cursor, candidate_ids=candidate_ids, limit=chunk_limit, offset=offset, search_filters=search_filters, ) metrics.timing("snuba.search.num_snuba_results", len(snuba_groups)) count = len(snuba_groups) more_results = count >= limit and (offset + limit) < total offset += len(snuba_groups) if not snuba_groups: break if candidate_ids: # pre-filtered candidates were passed down to Snuba, so we're # finished with filtering and these are the only results. Note # that because we set the chunk size to at least the size of # the candidate_ids, we know we got all of them (ie there are # no more chunks after the first) result_groups = snuba_groups if count_hits and hits is None: hits = len(snuba_groups) else: # pre-filtered candidates were *not* passed down to Snuba, # so we need to do post-filtering to verify Sentry DB predicates filtered_group_ids = group_queryset.filter( id__in=[gid for gid, _ in snuba_groups]).values_list("id", flat=True) group_to_score = dict(snuba_groups) for group_id in filtered_group_ids: if group_id in result_group_ids: # because we're doing multiple Snuba queries, which # happen outside of a transaction, there is a small possibility # of groups moving around in the sort scoring underneath us, # so we at least want to protect against duplicates continue group_score = group_to_score[group_id] result_group_ids.add(group_id) result_groups.append((group_id, group_score)) # TODO do we actually have to rebuild this SequencePaginator every time # or can we just make it after we've broken out of the loop? paginator_results = SequencePaginator( [(score, id) for (id, score) in result_groups], reverse=True, **paginator_options).get_result(limit, cursor, known_hits=hits) # break the query loop for one of three reasons: # * we started with Postgres candidates and so only do one Snuba query max # * the paginator is returning enough results to satisfy the query (>= the limit) # * there are no more groups in Snuba to post-filter if candidate_ids or len( paginator_results.results) >= limit or not more_results: break # HACK: We're using the SequencePaginator to mask the complexities of going # back and forth between two databases. This causes a problem with pagination # because we're 'lying' to the SequencePaginator (it thinks it has the entire # result set in memory when it does not). For this reason we need to make some # best guesses as to whether the `prev` and `next` cursors have more results. if len(paginator_results.results) == limit and more_results: # Because we are going back and forth between DBs there is a small # chance that we will hand the SequencePaginator exactly `limit` # items. In this case the paginator will assume there are no more # results, so we need to override the `next` cursor's results. 
paginator_results.next.has_results = True if cursor is not None and (not cursor.is_prev or len(paginator_results.results) > 0): # If the user passed a cursor, and it isn't already a 0 result `is_prev` # cursor, then it's worth allowing them to go back a page to check for # more results. paginator_results.prev.has_results = True metrics.timing("snuba.search.num_chunks", num_chunks) groups = Group.objects.in_bulk(paginator_results.results) paginator_results.results = [ groups[k] for k in paginator_results.results if k in groups ] return paginator_results
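# Illustrative sketch (not from the original source): the X-Hits estimate above takes a
# sample of Snuba matches, measures what fraction survive the Postgres post-filter, and
# scales the Snuba total by that ratio. The sample size comes from the rearranged Wald
# confidence-interval formula quoted in the comments: n = z^2 * p * (1 - p) / e^2.
def required_sample_size(z=1.96, p=0.5, error=0.1):
    return int(round(z ** 2 * p * (1 - p) / error ** 2))


def estimate_hits(sampled_snuba_ids, post_filter_ids, snuba_total):
    if not sampled_snuba_ids:
        return 0
    matched = len(set(sampled_snuba_ids) & set(post_filter_ids))
    hit_ratio = matched / float(len(sampled_snuba_ids))
    return int(hit_ratio * snuba_total)


# required_sample_size() -> 96, matching the "+/-10% @ 95% confidence" figure above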
def save(self, project_id, raw=False, assume_normalized=False, cache_key=None): """ We re-insert events with duplicate IDs into Snuba, which is responsible for deduplicating events. Since deduplication in Snuba is on the primary key (based on event ID, project ID and day), events with same IDs are only deduplicated if their timestamps fall on the same day. The latest event always wins and overwrites the value of events received earlier in that day. Since we increment counters and frequencies here before events get inserted to eventstream these numbers may be larger than the total number of events if we receive duplicate event IDs that fall on the same day (that do not hit cache first). """ # Normalize if needed if not self._normalized: if not assume_normalized: self.normalize() self._normalized = True data = self._data project = Project.objects.get_from_cache(id=project_id) project._organization_cache = Organization.objects.get_from_cache( id=project.organization_id) # Pull out the culprit culprit = self.get_culprit() # Pull the toplevel data we're interested in level = data.get("level") # TODO(mitsuhiko): this code path should be gone by July 2018. # This is going to be fine because no code actually still depends # on integers here. When we need an integer it will be converted # into one later. Old workers used to send integers here. if level is not None and isinstance(level, six.integer_types): level = LOG_LEVELS[level] transaction_name = data.get("transaction") logger_name = data.get("logger") release = data.get("release") dist = data.get("dist") environment = data.get("environment") recorded_timestamp = data.get("timestamp") # We need to swap out the data with the one internal to the newly # created event object event = self._get_event_instance(project_id=project_id) self._data = data = event.data.data event._project_cache = project date = event.datetime platform = event.platform event_id = event.event_id if transaction_name: transaction_name = force_text(transaction_name) # Right now the event type is the signal to skip the group. This # is going to change a lot. if event.get_event_type() == "transaction": issueless_event = True else: issueless_event = False # Some of the data that are toplevel attributes are duplicated # into tags (logger, level, environment, transaction). These are # different from legacy attributes which are normalized into tags # ahead of time (site, server_name). setdefault_path(data, "tags", value=[]) set_tag(data, "level", level) if logger_name: set_tag(data, "logger", logger_name) if environment: set_tag(data, "environment", environment) if transaction_name: set_tag(data, "transaction", transaction_name) if release: # dont allow a conflicting 'release' tag pop_tag(data, "release") release = Release.get_or_create(project=project, version=release, date_added=date) set_tag(data, "sentry:release", release.version) if dist and release: dist = release.add_dist(dist, date) # dont allow a conflicting 'dist' tag pop_tag(data, "dist") set_tag(data, "sentry:dist", dist.name) else: dist = None event_user = self._get_event_user(project, data) if event_user: # dont allow a conflicting 'user' tag pop_tag(data, "user") set_tag(data, "sentry:user", event_user.tag_value) # At this point we want to normalize the in_app values in case the # clients did not set this appropriately so far. 
grouping_config = load_grouping_config( get_grouping_config_dict_for_event_data(data, project)) normalize_stacktraces_for_grouping(data, grouping_config) for plugin in plugins.for_project(project, version=None): added_tags = safe_execute(plugin.get_tags, event, _with_transaction=False) if added_tags: # plugins should not override user provided tags for key, value in added_tags: if get_tag(data, key) is None: set_tag(data, key, value) for path, iface in six.iteritems(event.interfaces): for k, v in iface.iter_tags(): set_tag(data, k, v) # Get rid of ephemeral interface data if iface.ephemeral: data.pop(iface.path, None) # The active grouping config was put into the event in the # normalize step before. We now also make sure that the # fingerprint was set to `'{{ default }}' just in case someone # removed it from the payload. The call to get_hashes will then # look at `grouping_config` to pick the right parameters. data["fingerprint"] = data.get("fingerprint") or ["{{ default }}"] apply_server_fingerprinting( data, get_fingerprinting_config_for_project(project)) # Here we try to use the grouping config that was requested in the # event. If that config has since been deleted (because it was an # experimental grouping config) we fall back to the default. try: hashes = event.get_hashes() except GroupingConfigNotFound: data["grouping_config"] = get_grouping_config_dict_for_project( project) hashes = event.get_hashes() data["hashes"] = hashes # we want to freeze not just the metadata and type in but also the # derived attributes. The reason for this is that we push this # data into kafka for snuba processing and our postprocessing # picks up the data right from the snuba topic. For most usage # however the data is dynamically overridden by Event.title and # Event.location (See Event.as_dict) materialized_metadata = self.materialize_metadata() data.update(materialized_metadata) data["culprit"] = culprit received_timestamp = event.data.get("received") or float( event.datetime.strftime("%s")) if not issueless_event: # The group gets the same metadata as the event when it's flushed but # additionally the `last_received` key is set. This key is used by # _save_aggregate. 
group_metadata = dict(materialized_metadata) group_metadata["last_received"] = received_timestamp kwargs = { "platform": platform, "message": event.search_message, "culprit": culprit, "logger": logger_name, "level": LOG_LEVELS_MAP.get(level), "last_seen": date, "first_seen": date, "active_at": date, "data": group_metadata, } if release: kwargs["first_release"] = release try: group, is_new, is_regression = self._save_aggregate( event=event, hashes=hashes, release=release, **kwargs) except HashDiscarded: event_discarded.send_robust(project=project, sender=EventManager) metrics.incr( "events.discarded", skip_internal=True, tags={ "organization_id": project.organization_id, "platform": platform }, ) raise else: event_saved.send_robust(project=project, event_size=event.size, sender=EventManager) event.group = group else: group = None is_new = False is_regression = False event_saved.send_robust(project=project, event_size=event.size, sender=EventManager) # store a reference to the group id to guarantee validation of isolation event.data.bind_ref(event) environment = Environment.get_or_create(project=project, name=environment) if group: group_environment, is_new_group_environment = GroupEnvironment.get_or_create( group_id=group.id, environment_id=environment.id, defaults={"first_release": release if release else None}, ) else: is_new_group_environment = False if release: ReleaseEnvironment.get_or_create(project=project, release=release, environment=environment, datetime=date) ReleaseProjectEnvironment.get_or_create(project=project, release=release, environment=environment, datetime=date) if group: grouprelease = GroupRelease.get_or_create( group=group, release=release, environment=environment, datetime=date) counters = [(tsdb.models.project, project.id)] if group: counters.append((tsdb.models.group, group.id)) if release: counters.append((tsdb.models.release, release.id)) tsdb.incr_multi(counters, timestamp=event.datetime, environment_id=environment.id) frequencies = [] if group: frequencies.append((tsdb.models.frequent_environments_by_group, { group.id: { environment.id: 1 } })) if release: frequencies.append((tsdb.models.frequent_releases_by_group, { group.id: { grouprelease.id: 1 } })) if frequencies: tsdb.record_frequency_multi(frequencies, timestamp=event.datetime) if group: UserReport.objects.filter(project=project, event_id=event_id).update( group=group, environment=environment) # Enusre the _metrics key exists. This is usually created during # and prefilled with ingestion sizes. event_metrics = event.data.get("_metrics") or {} event.data["_metrics"] = event_metrics # Capture the actual size that goes into node store. event_metrics["bytes.stored.event"] = len( json.dumps(dict(event.data.items()))) # Load attachments first, but persist them at the very last after # posting to eventstream to make sure all counters and eventstream are # incremented for sure. 
attachments = self.get_attachments(cache_key, event) for attachment in attachments: key = "bytes.stored.%s" % (attachment.type, ) event_metrics[key] = (event_metrics.get(key) or 0) + len( attachment.data) # Write the event to Nodestore event.data.save() if event_user: counters = [(tsdb.models.users_affected_by_project, project.id, (event_user.tag_value, ))] if group: counters.append((tsdb.models.users_affected_by_group, group.id, (event_user.tag_value, ))) tsdb.record_multi(counters, timestamp=event.datetime, environment_id=environment.id) if release: if is_new: buffer.incr( ReleaseProject, {"new_groups": 1}, { "release_id": release.id, "project_id": project.id }, ) if is_new_group_environment: buffer.incr( ReleaseProjectEnvironment, {"new_issues_count": 1}, { "project_id": project.id, "release_id": release.id, "environment_id": environment.id, }, ) if not raw: if not project.first_event: project.update(first_event=date) first_event_received.send_robust(project=project, event=event, sender=Project) eventstream.insert( group=group, event=event, is_new=is_new, is_regression=is_regression, is_new_group_environment=is_new_group_environment, primary_hash=hashes[0], # We are choosing to skip consuming the event back # in the eventstream if it's flagged as raw. # This means that we want to publish the event # through the event stream, but we don't care # about post processing and handling the commit. skip_consume=raw, ) # Do this last to ensure signals get emitted even if connection to the # file store breaks temporarily. self.save_attachments(attachments, event) metric_tags = {"from_relay": "_relay_processed" in self._data} metrics.timing("events.latency", received_timestamp - recorded_timestamp, tags=metric_tags) metrics.timing("events.size.data.post_save", event.size, tags=metric_tags) metrics.incr( "events.post_save.normalize.errors", amount=len(self._data.get("errors") or ()), tags=metric_tags, ) return event
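# Illustrative sketch (not from the original source): the `_metrics` bookkeeping above
# records the serialized event size plus one `bytes.stored.<type>` counter per
# attachment before the event is posted to the eventstream. A standalone version over
# hypothetical attachment objects exposing `.type` and `.data`:
import json


def stored_bytes_metrics(event_data, attachments):
    event_metrics = event_data.get("_metrics") or {}
    event_data["_metrics"] = event_metrics
    # size of the payload that goes into node store
    event_metrics["bytes.stored.event"] = len(json.dumps(dict(event_data.items())))
    for attachment in attachments:
        key = "bytes.stored.%s" % (attachment.type,)
        event_metrics[key] = (event_metrics.get(key) or 0) + len(attachment.data)
    return event_metrics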
def timer(name, prefix="snuba.client"):
    t = time.time()
    try:
        yield
    finally:
        metrics.timing(u"{}.{}".format(prefix, name), time.time() - t)
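# Usage sketch (assumption, not visible in this excerpt): `timer` only works as a
# `with`-block if it is wrapped with `contextlib.contextmanager` where it is defined.
# It would then time any Snuba client call and emit e.g. a `snuba.client.query` metric:
#
#     with timer("query"):
#         result = run_snuba_query(...)  # hypothetical call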
def save_event(cache_key=None, data=None, start_time=None, event_id=None, project_id=None, **kwargs): """ Saves an event to the database. """ from sentry.event_manager import HashDiscarded, EventManager from sentry import quotas, tsdb from sentry.models import ProjectKey if cache_key: data = default_cache.get(cache_key) if data is not None: data = CanonicalKeyDict(data) if event_id is None and data is not None: event_id = data['event_id'] # only when we come from reprocessing we get a project_id sent into # the task. if project_id is None: project_id = data.pop('project') delete_raw_event(project_id, event_id, allow_hint_clear=True) # This covers two cases: where data is None because we did not manage # to fetch it from the default cache or the empty dictionary was # stored in the default cache. The former happens if the event # expired while being on the queue, the second happens on reprocessing # if the raw event was deleted concurrently while we held on to # it. This causes the node store to delete the data and we end up # fetching an empty dict. We could in theory not invoke `save_event` # in those cases but it's important that we always clean up the # reprocessing reports correctly or they will screw up the UI. So # to future proof this correctly we just handle this case here. if not data: metrics.incr('events.failed', tags={ 'reason': 'cache', 'stage': 'post' }) return Raven.tags_context({ 'project': project_id, }) try: manager = EventManager(data) event = manager.save(project_id) # Always load attachments from the cache so we can later prune them. # Only save them if the event-attachments feature is active, though. if features.has('organizations:event-attachments', event.project.organization, actor=None): attachments = attachment_cache.get(cache_key) or [] for attachment in attachments: save_attachment(event, attachment) except HashDiscarded: increment_list = [ (tsdb.models.project_total_received_discarded, project_id), ] try: project = Project.objects.get_from_cache(id=project_id) except Project.DoesNotExist: pass else: increment_list.extend([ (tsdb.models.project_total_blacklisted, project.id), (tsdb.models.organization_total_blacklisted, project.organization_id), ]) project_key = None if data.get('key_id') is not None: try: project_key = ProjectKey.objects.get_from_cache( id=data['key_id']) except ProjectKey.DoesNotExist: pass else: increment_list.append( (tsdb.models.key_total_blacklisted, project_key.id)) quotas.refund( project, key=project_key, timestamp=start_time, ) tsdb.incr_multi( increment_list, timestamp=to_datetime(start_time) if start_time is not None else None, ) finally: if cache_key: default_cache.delete(cache_key) attachment_cache.delete(cache_key) if start_time: metrics.timing('events.time-to-process', time() - start_time, instance=data['platform'])
def timer(name, prefix='snuba.client'):
    t = time.time()
    try:
        yield
    finally:
        metrics.timing(u'{}.{}'.format(prefix, name), time.time() - t)
def cleanup(days, project, concurrency, silent, model, router, timed): """Delete a portion of trailing data based on creation date. All data that is older than `--days` will be deleted. The default for this is 30 days. In the default setting all projects will be truncated but if you have a specific project you want to limit this to this can be done with the `--project` flag which accepts a project ID or a string with the form `org/project` where both are slugs. """ if concurrency < 1: click.echo('Error: Minimum concurrency is 1', err=True) raise click.Abort() os.environ['_SENTRY_CLEANUP'] = '1' # Make sure we fork off multiprocessing pool # before we import or configure the app from multiprocessing import Process, JoinableQueue as Queue pool = [] task_queue = Queue(1000) for _ in xrange(concurrency): p = Process(target=multiprocess_worker, args=(task_queue, )) p.daemon = True p.start() pool.append(p) from sentry.runner import configure configure() from django.db import router as db_router from sentry.app import nodestore from sentry.db.deletion import BulkDeleteQuery from sentry import models if timed: import time from sentry.utils import metrics start_time = time.time() # list of models which this query is restricted to model_list = {m.lower() for m in model} def is_filtered(model): if router is not None and db_router.db_for_write(model) != router: return True if not model_list: return False return model.__name__.lower() not in model_list # Deletions that use `BulkDeleteQuery` (and don't need to worry about child relations) # (model, datetime_field, order_by) BULK_QUERY_DELETES = [ (models.EventMapping, 'date_added', '-date_added'), (models.EventAttachment, 'date_added', None), (models.UserReport, 'date_added', None), (models.GroupEmailThread, 'date', None), (models.GroupRuleStatus, 'date_added', None), ] + EXTRA_BULK_QUERY_DELETES # Deletions that use the `deletions` code path (which handles their child relations) # (model, datetime_field, order_by) DELETES = ( (models.Event, 'datetime', 'datetime'), (models.Group, 'last_seen', 'last_seen'), ) if not silent: click.echo('Removing expired values for LostPasswordHash') if is_filtered(models.LostPasswordHash): if not silent: click.echo('>> Skipping LostPasswordHash') else: models.LostPasswordHash.objects.filter(date_added__lte=timezone.now() - timedelta(hours=48)).delete() if is_filtered(models.OrganizationMember) and not silent: click.echo('>> Skipping OrganizationMember') else: if not silent: click.echo('Removing expired values for OrganizationMember') expired_threshold = timezone.now() - timedelta(days=days) models.OrganizationMember.delete_expired(expired_threshold) for model in [models.ApiGrant, models.ApiToken]: if not silent: click.echo(u'Removing expired values for {}'.format( model.__name__)) if is_filtered(model): if not silent: click.echo(u'>> Skipping {}'.format(model.__name__)) else: model.objects.filter(expires_at__lt=( timezone.now() - timedelta(days=API_TOKEN_TTL_IN_DAYS)), ).delete() project_id = None if project: click.echo( "Bulk NodeStore deletion not available for project selection", err=True) project_id = get_project(project) if project_id is None: click.echo('Error: Project not found', err=True) raise click.Abort() else: if not silent: click.echo("Removing old NodeStore values") cutoff = timezone.now() - timedelta(days=days) try: nodestore.cleanup(cutoff) except NotImplementedError: click.echo("NodeStore backend does not support cleanup operation", err=True) for bqd in BULK_QUERY_DELETES: if len(bqd) == 4: model, 
dtfield, order_by, chunk_size = bqd else: chunk_size = 10000 model, dtfield, order_by = bqd if not silent: click.echo( u"Removing {model} for days={days} project={project}".format( model=model.__name__, days=days, project=project or '*', )) if is_filtered(model): if not silent: click.echo('>> Skipping %s' % model.__name__) else: BulkDeleteQuery( model=model, dtfield=dtfield, days=days, project_id=project_id, order_by=order_by, ).execute(chunk_size=chunk_size) for model, dtfield, order_by in DELETES: if not silent: click.echo( u"Removing {model} for days={days} project={project}".format( model=model.__name__, days=days, project=project or '*', )) if is_filtered(model): if not silent: click.echo('>> Skipping %s' % model.__name__) else: imp = '.'.join((model.__module__, model.__name__)) q = BulkDeleteQuery( model=model, dtfield=dtfield, days=days, project_id=project_id, order_by=order_by, ) for chunk in q.iterator(chunk_size=100): task_queue.put((imp, chunk)) task_queue.join() # Clean up FileBlob instances which are no longer used and aren't super # recent (as there could be a race between blob creation and reference) if not silent: click.echo("Cleaning up unused FileBlob references") if is_filtered(models.FileBlob): if not silent: click.echo('>> Skipping FileBlob') else: cleanup_unused_files(silent) # Shut down our pool for _ in pool: task_queue.put(_STOP_WORKER) # And wait for it to drain for p in pool: p.join() if timed: duration = int(time.time() - start_time) metrics.timing('cleanup.duration', duration, instance=router, sample_rate=1.0) click.echo("Clean up took %s second(s)." % duration)
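# Illustrative sketch (not from the original source): the multiprocess cleanup above
# fans deletion chunks out over a JoinableQueue and shuts the pool down by sending one
# sentinel per worker. A minimal self-contained version of that pattern; the sentinel
# value and `handle` callable are hypothetical stand-ins (the real module defines its
# own `_STOP_WORKER` and `multiprocess_worker`).
from multiprocessing import Process, JoinableQueue

_STOP_WORKER = "__stop_worker__"


def worker(task_queue, handle):
    while True:
        task = task_queue.get()
        try:
            if task == _STOP_WORKER:
                return
            handle(task)
        finally:
            task_queue.task_done()


def run(tasks, handle, concurrency=4):
    # `handle` must be a module-level function so it pickles under the spawn start method
    task_queue = JoinableQueue(1000)
    pool = []
    for _ in range(concurrency):
        p = Process(target=worker, args=(task_queue, handle))
        p.daemon = True
        p.start()
        pool.append(p)
    for task in tasks:
        task_queue.put(task)
    task_queue.join()                 # wait for every queued task to be processed
    for _ in pool:
        task_queue.put(_STOP_WORKER)  # one sentinel per worker drains the pool
    for p in pool:
        p.join()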
def set_commits(self, commit_list):
    """
    Bind a list of commits to this release.

    This will clear any existing commit log and replace it with the given
    commits.
    """
    # Sort commit list in reverse order
    commit_list.sort(key=lambda commit: commit.get('timestamp'), reverse=True)

    # TODO(dcramer): this function could use some cleanup/refactoring as it's a bit unwieldy
    from sentry.models import (
        Commit, CommitAuthor, CommitFileChange, Group, GroupLink, GroupResolution,
        GroupStatus, ReleaseCommit, ReleaseHeadCommit, Repository, PullRequest
    )
    from sentry.plugins.providers.repository import RepositoryProvider
    from sentry.tasks.integrations import kick_off_status_syncs

    # TODO(meredith): implement for IntegrationRepositoryProvider
    commit_list = [
        c for c in commit_list
        if not RepositoryProvider.should_ignore_commit(c.get('message', ''))
    ]

    lock_key = type(self).get_lock_key(self.organization_id, self.id)
    lock = locks.get(lock_key, duration=10)
    with TimedRetryPolicy(10)(lock.acquire):
        start = time()
        with transaction.atomic():
            # TODO(dcramer): would be good to optimize the logic to avoid these
            # deletes but not overly important
            ReleaseCommit.objects.filter(release=self).delete()

            authors = {}
            repos = {}
            commit_author_by_commit = {}
            head_commit_by_repo = {}
            latest_commit = None
            for idx, data in enumerate(commit_list):
                repo_name = data.get('repository') or u'organization-{}'.format(
                    self.organization_id)
                if repo_name not in repos:
                    repos[repo_name] = repo = Repository.objects.get_or_create(
                        organization_id=self.organization_id,
                        name=repo_name,
                    )[0]
                else:
                    repo = repos[repo_name]

                author_email = data.get('author_email')
                if author_email is None and data.get('author_name'):
                    author_email = (
                        re.sub(r'[^a-zA-Z0-9\-_\.]*', '', data['author_name']).lower() +
                        '@localhost'
                    )

                if not author_email:
                    author = None
                elif author_email not in authors:
                    author_data = {'name': data.get('author_name')}
                    author, created = CommitAuthor.objects.create_or_update(
                        organization_id=self.organization_id,
                        email=author_email,
                        values=author_data,
                    )
                    if not created:
                        author = CommitAuthor.objects.get(
                            organization_id=self.organization_id,
                            email=author_email,
                        )
                    authors[author_email] = author
                else:
                    author = authors[author_email]

                commit_data = {}
                defaults = {}

                # Update/set message and author if they are provided.
                if author is not None:
                    commit_data['author'] = author
                if 'message' in data:
                    commit_data['message'] = data['message']
                if 'timestamp' in data:
                    commit_data['date_added'] = data['timestamp']
                else:
                    defaults['date_added'] = timezone.now()

                commit, created = Commit.objects.create_or_update(
                    organization_id=self.organization_id,
                    repository_id=repo.id,
                    key=data['id'],
                    defaults=defaults,
                    values=commit_data,
                )
                if not created:
                    commit = Commit.objects.get(
                        organization_id=self.organization_id,
                        repository_id=repo.id,
                        key=data['id'],
                    )

                if author is None:
                    author = commit.author

                commit_author_by_commit[commit.id] = author

                patch_set = data.get('patch_set', [])
                for patched_file in patch_set:
                    try:
                        with transaction.atomic():
                            CommitFileChange.objects.create(
                                organization_id=self.organization.id,
                                commit=commit,
                                filename=patched_file['path'],
                                type=patched_file['type'],
                            )
                    except IntegrityError:
                        pass

                try:
                    with transaction.atomic():
                        ReleaseCommit.objects.create(
                            organization_id=self.organization_id,
                            release=self,
                            commit=commit,
                            order=idx,
                        )
                except IntegrityError:
                    pass

                if latest_commit is None:
                    latest_commit = commit

                head_commit_by_repo.setdefault(repo.id, commit.id)

            self.update(
                commit_count=len(commit_list),
                authors=[
                    six.text_type(a_id)
                    for a_id in ReleaseCommit.objects.filter(
                        release=self,
                        commit__author_id__isnull=False,
                    ).values_list('commit__author_id', flat=True).distinct()
                ],
                last_commit_id=latest_commit.id if latest_commit else None,
            )
            metrics.timing('release.set_commits.duration', time() - start)

    # fill any missing ReleaseHeadCommit entries
    for repo_id, commit_id in six.iteritems(head_commit_by_repo):
        try:
            with transaction.atomic():
                ReleaseHeadCommit.objects.create(
                    organization_id=self.organization_id,
                    release_id=self.id,
                    repository_id=repo_id,
                    commit_id=commit_id,
                )
        except IntegrityError:
            pass

    release_commits = list(
        ReleaseCommit.objects.filter(release=self)
        .select_related('commit')
        .values('commit_id', 'commit__key')
    )

    commit_resolutions = list(
        GroupLink.objects.filter(
            linked_type=GroupLink.LinkedType.commit,
            linked_id__in=[rc['commit_id'] for rc in release_commits],
        ).values_list('group_id', 'linked_id')
    )

    commit_group_authors = [
        (cr[0],  # group_id
         commit_author_by_commit.get(cr[1]))
        for cr in commit_resolutions
    ]

    pr_ids_by_merge_commit = list(
        PullRequest.objects.filter(
            merge_commit_sha__in=[rc['commit__key'] for rc in release_commits],
            organization_id=self.organization_id,
        ).values_list('id', flat=True)
    )

    pull_request_resolutions = list(
        GroupLink.objects.filter(
            relationship=GroupLink.Relationship.resolves,
            linked_type=GroupLink.LinkedType.pull_request,
            linked_id__in=pr_ids_by_merge_commit,
        ).values_list('group_id', 'linked_id')
    )

    pr_authors = list(
        PullRequest.objects.filter(
            id__in=[prr[1] for prr in pull_request_resolutions],
        ).select_related('author')
    )
    pr_authors_dict = {pra.id: pra.author for pra in pr_authors}

    pull_request_group_authors = [
        (prr[0], pr_authors_dict.get(prr[1]))
        for prr in pull_request_resolutions
    ]

    user_by_author = {None: None}

    commits_and_prs = list(
        itertools.chain(commit_group_authors, pull_request_group_authors),
    )

    group_project_lookup = dict(
        Group.objects.filter(
            id__in=[group_id for group_id, _ in commits_and_prs],
        ).values_list('id', 'project_id')
    )

    for group_id, author in commits_and_prs:
        if author not in user_by_author:
            try:
                user_by_author[author] = author.find_users()[0]
            except IndexError:
                user_by_author[author] = None
        actor = user_by_author[author]

        with transaction.atomic():
            GroupResolution.objects.create_or_update(
                group_id=group_id,
                values={
                    'release': self,
                    'type': GroupResolution.Type.in_release,
                    'status': GroupResolution.Status.resolved,
                    'actor_id': actor.id if actor else None,
                },
            )
            group = Group.objects.get(id=group_id)
            group.update(status=GroupStatus.RESOLVED)
            metrics.incr('group.resolved', instance='in_commit', skip_internal=True)

        issue_resolved.send_robust(
            organization_id=self.organization_id,
            user=actor,
            group=group,
            project=group.project,
            resolution_type='with_commit',
            sender=type(self),
        )

        kick_off_status_syncs.apply_async(
            kwargs={
                'project_id': group_project_lookup[group_id],
                'group_id': group_id,
            }
        )
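# Illustrative payload sketch (not part of the original module): the commit
# dict shape that set_commits() reads above -- 'id', 'repository',
# 'author_email', 'author_name', 'message', 'timestamp' and 'patch_set'.
# The repository name and all values below are hypothetical.
example_commit_list = [
    {
        'id': '8371445ab8a9facd271df17038ff295a48accae7',
        'repository': 'example-org/example-repo',  # falls back to u'organization-<id>' when omitted
        'author_email': 'jane@example.com',
        'author_name': 'Jane Doe',
        'message': 'fix: handle missing release metadata',
        'timestamp': '2018-06-01T12:00:00Z',
        'patch_set': [{'path': 'src/app/views.py', 'type': 'M'}],
    },
]
# release.set_commits(example_commit_list)  # `release` is a hypothetical Release instance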
def save(self, project, raw=False):
    from sentry.tasks.post_process import index_event_tags

    data = self.data

    project = Project.objects.get_from_cache(id=project)

    # Check to make sure we're not about to do a bunch of work that's
    # already been done if we've processed an event with this ID. (This
    # isn't a perfect solution -- this doesn't handle ``EventMapping`` and
    # there's a race condition between here and when the event is actually
    # saved, but it's an improvement. See GH-7677.)
    try:
        event = Event.objects.get(
            project_id=project.id,
            event_id=data['event_id'],
        )
    except Event.DoesNotExist:
        pass
    else:
        self.logger.info(
            'duplicate.found',
            exc_info=True,
            extra={
                'event_uuid': data['event_id'],
                'project_id': project.id,
                'model': Event.__name__,
            }
        )
        return event

    # First we pull out our top-level (non-data attr) kwargs
    event_id = data.pop('event_id')
    level = data.pop('level')
    culprit = data.pop('transaction', None)
    if not culprit:
        culprit = data.pop('culprit', None)
    logger_name = data.pop('logger', None)
    server_name = data.pop('server_name', None)
    site = data.pop('site', None)
    checksum = data.pop('checksum', None)
    fingerprint = data.pop('fingerprint', None)
    platform = data.pop('platform', None)
    release = data.pop('release', None)
    dist = data.pop('dist', None)
    environment = data.pop('environment', None)

    # unused
    time_spent = data.pop('time_spent', None)
    message = data.pop('message', '')

    if not culprit:
        # if we generate an implicit culprit, let's not call it a
        # transaction
        transaction_name = None
        culprit = generate_culprit(data, platform=platform)
    else:
        transaction_name = culprit

    culprit = force_text(culprit)

    recorded_timestamp = data.pop('timestamp')
    date = datetime.fromtimestamp(recorded_timestamp)
    date = date.replace(tzinfo=timezone.utc)

    kwargs = {
        'platform': platform,
    }

    event = Event(
        project_id=project.id,
        event_id=event_id,
        data=data,
        time_spent=time_spent,
        datetime=date,
        **kwargs
    )
    event._project_cache = project

    # convert this to a dict to ensure we're only storing one value per key
    # as most parts of Sentry don't currently play well with multiple values
    tags = dict(data.get('tags') or [])
    tags['level'] = LOG_LEVELS[level]
    if logger_name:
        tags['logger'] = logger_name
    if server_name:
        tags['server_name'] = server_name
    if site:
        tags['site'] = site
    if environment:
        tags['environment'] = environment
    if transaction_name:
        tags['transaction'] = transaction_name

    if release:
        # don't allow a conflicting 'release' tag
        if 'release' in tags:
            del tags['release']
        release = Release.get_or_create(
            project=project,
            version=release,
            date_added=date,
        )
        tags['sentry:release'] = release.version

    if dist and release:
        dist = release.add_dist(dist, date)
        tags['sentry:dist'] = dist.name
    else:
        dist = None

    event_user = self._get_event_user(project, data)
    if event_user:
        # don't allow a conflicting 'user' tag
        if 'user' in tags:
            del tags['user']
        tags['sentry:user'] = event_user.tag_value

    # At this point we want to normalize the in_app values in case the
    # clients did not set this appropriately so far.
    normalize_in_app(data)

    for plugin in plugins.for_project(project, version=None):
        added_tags = safe_execute(plugin.get_tags, event, _with_transaction=False)
        if added_tags:
            # plugins should not override user provided tags
            for key, value in added_tags:
                tags.setdefault(key, value)

    for path, iface in six.iteritems(event.interfaces):
        for k, v in iface.iter_tags():
            tags[k] = v
        # Get rid of ephemeral interface data
        if iface.ephemeral:
            data.pop(iface.get_path(), None)

    # tags are stored as a tuple
    tags = tags.items()

    data['tags'] = tags
    data['fingerprint'] = fingerprint or ['{{ default }}']

    # prioritize fingerprint over checksum as it's likely the client defaulted
    # a checksum whereas the fingerprint was explicit
    if fingerprint:
        hashes = [
            md5_from_hash(h) for h in get_hashes_from_fingerprint(event, fingerprint)
        ]
    elif checksum:
        if HASH_RE.match(checksum):
            hashes = [checksum]
        else:
            hashes = [md5_from_hash([checksum]), checksum]
        data['checksum'] = checksum
    else:
        hashes = [md5_from_hash(h) for h in get_hashes_for_event(event)]

    # TODO(dcramer): temp workaround for complexity
    data['message'] = message
    event_type = eventtypes.get(data.get('type', 'default'))(data)
    event_metadata = event_type.get_metadata()
    # TODO(dcramer): temp workaround for complexity
    del data['message']

    data['type'] = event_type.key
    data['metadata'] = event_metadata

    # index components into ``Event.message``
    # See GH-3248
    if event_type.key != 'default':
        if 'sentry.interfaces.Message' in data and \
                data['sentry.interfaces.Message']['message'] != message:
            message = u'{} {}'.format(
                message,
                data['sentry.interfaces.Message']['message'],
            )

    if not message:
        message = ''
    elif not isinstance(message, six.string_types):
        message = force_text(message)

    for value in six.itervalues(event_metadata):
        value_u = force_text(value, errors='replace')
        if value_u not in message:
            message = u'{} {}'.format(message, value_u)

    if culprit and culprit not in message:
        culprit_u = force_text(culprit, errors='replace')
        message = u'{} {}'.format(message, culprit_u)

    message = trim(message.strip(), settings.SENTRY_MAX_MESSAGE_LENGTH)

    event.message = message
    kwargs['message'] = message

    received_timestamp = event.data.get('received') or float(
        event.datetime.strftime('%s'))
    group_kwargs = kwargs.copy()
    group_kwargs.update({
        'culprit': culprit,
        'logger': logger_name,
        'level': level,
        'last_seen': date,
        'first_seen': date,
        'active_at': date,
        'data': {
            'last_received': received_timestamp,
            'type': event_type.key,
            # we cache the event's metadata on the group to ensure it's
            # accessible in the stream
            'metadata': event_metadata,
        },
    })

    if release:
        group_kwargs['first_release'] = release

    try:
        group, is_new, is_regression, is_sample = self._save_aggregate(
            event=event, hashes=hashes, release=release, **group_kwargs
        )
    except HashDiscarded:
        event_discarded.send_robust(
            project=project,
            sender=EventManager,
        )
        metrics.incr(
            'events.discarded',
            skip_internal=True,
            tags={
                'organization_id': project.organization_id,
                'platform': platform,
            },
        )
        raise
    else:
        event_saved.send_robust(
            project=project,
            sender=EventManager,
        )

    event.group = group
    # store a reference to the group id to guarantee validation of isolation
    event.data.bind_ref(event)

    # When an event was sampled, the canonical source of truth
    # is the EventMapping table since we aren't going to be writing out an actual
    # Event row. Otherwise, if the Event isn't being sampled, we can safely
    # rely on the Event table itself as the source of truth and ignore
    # EventMapping since it's redundant information.
    if is_sample:
        try:
            with transaction.atomic(using=router.db_for_write(EventMapping)):
                EventMapping.objects.create(project=project, group=group, event_id=event_id)
        except IntegrityError:
            self.logger.info(
                'duplicate.found',
                exc_info=True,
                extra={
                    'event_uuid': event_id,
                    'project_id': project.id,
                    'group_id': group.id,
                    'model': EventMapping.__name__,
                }
            )
            return event

    environment = Environment.get_or_create(
        project=project,
        name=environment,
    )

    group_environment, is_new_group_environment = GroupEnvironment.get_or_create(
        group_id=group.id,
        environment_id=environment.id,
        defaults={
            'first_release_id': release.id if release else None,
        },
    )

    if release:
        ReleaseEnvironment.get_or_create(
            project=project,
            release=release,
            environment=environment,
            datetime=date,
        )

        ReleaseProjectEnvironment.get_or_create(
            project=project,
            release=release,
            environment=environment,
            datetime=date,
        )

        grouprelease = GroupRelease.get_or_create(
            group=group,
            release=release,
            environment=environment,
            datetime=date,
        )

    counters = [
        (tsdb.models.group, group.id),
        (tsdb.models.project, project.id),
    ]

    if release:
        counters.append((tsdb.models.release, release.id))

    tsdb.incr_multi(counters, timestamp=event.datetime, environment_id=environment.id)

    frequencies = [
        # (tsdb.models.frequent_projects_by_organization, {
        #     project.organization_id: {
        #         project.id: 1,
        #     },
        # }),
        # (tsdb.models.frequent_issues_by_project, {
        #     project.id: {
        #         group.id: 1,
        #     },
        # })
        (tsdb.models.frequent_environments_by_group, {
            group.id: {
                environment.id: 1,
            },
        })
    ]

    if release:
        frequencies.append(
            (tsdb.models.frequent_releases_by_group, {
                group.id: {
                    grouprelease.id: 1,
                },
            })
        )

    tsdb.record_frequency_multi(frequencies, timestamp=event.datetime)

    UserReport.objects.filter(
        project=project,
        event_id=event_id,
    ).update(
        group=group,
        environment=environment,
    )

    # save the event unless it's been sampled
    if not is_sample:
        try:
            with transaction.atomic(using=router.db_for_write(Event)):
                event.save()
        except IntegrityError:
            self.logger.info(
                'duplicate.found',
                exc_info=True,
                extra={
                    'event_uuid': event_id,
                    'project_id': project.id,
                    'group_id': group.id,
                    'model': Event.__name__,
                }
            )
            return event

        index_event_tags.delay(
            organization_id=project.organization_id,
            project_id=project.id,
            group_id=group.id,
            environment_id=environment.id,
            event_id=event.id,
            tags=tags,
            date_added=event.datetime,
        )

    if event_user:
        tsdb.record_multi(
            (
                (tsdb.models.users_affected_by_group, group.id, (event_user.tag_value, )),
                (tsdb.models.users_affected_by_project, project.id, (event_user.tag_value, )),
            ),
            timestamp=event.datetime,
            environment_id=environment.id,
        )

    if release:
        if is_new:
            buffer.incr(
                ReleaseProject, {'new_groups': 1}, {
                    'release_id': release.id,
                    'project_id': project.id,
                }
            )
        if is_new_group_environment:
            buffer.incr(
                ReleaseProjectEnvironment, {'new_issues_count': 1}, {
                    'project_id': project.id,
                    'release_id': release.id,
                    'environment_id': environment.id,
                }
            )

    safe_execute(Group.objects.add_tags, group, environment, tags, _with_transaction=False)

    if not raw:
        if not project.first_event:
            project.update(first_event=date)
            first_event_received.send(project=project, group=group, sender=Project)

        post_process_group.delay(
            group=group,
            event=event,
            is_new=is_new,
            is_sample=is_sample,
            is_regression=is_regression,
            is_new_group_environment=is_new_group_environment,
            primary_hash=hashes[0],
        )
    else:
        self.logger.info('post_process.skip.raw_event', extra={'event_id': event.id})

    metrics.timing(
        'events.latency',
        received_timestamp - recorded_timestamp,
        tags={
            'project_id': project.id,
        },
    )

    return event
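# Illustrative sketch (not part of the original module): the hash-selection
# precedence applied in save() above. An explicit fingerprint wins over a
# client-supplied checksum, which in turn wins over the default per-event
# hashing. `event` is a hypothetical Event instance; the helpers are the same
# ones called in save().
def _select_hashes(event, fingerprint, checksum):
    if fingerprint:
        return [md5_from_hash(h) for h in get_hashes_from_fingerprint(event, fingerprint)]
    if checksum:
        # a value that already looks like a hash passes through untouched;
        # anything else is hashed but kept alongside the raw value
        if HASH_RE.match(checksum):
            return [checksum]
        return [md5_from_hash([checksum]), checksum]
    return [md5_from_hash(h) for h in get_hashes_for_event(event)]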
# NOTE: the yield-inside-try/finally pattern implies this function is wrapped
# with contextlib.contextmanager at its definition site so it can be used in a
# `with` statement; the decorator is not shown in this excerpt.
def timer(name, prefix='snuba.client'):
    t = time.time()
    try:
        yield
    finally:
        metrics.timing('{}.{}'.format(prefix, name), time.time() - t)
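# Illustrative usage sketch (not part of the original module). Assumes timer()
# is exposed as a context manager as noted above; the wrapped call is
# hypothetical.
#
#     with timer('query'):
#         rows = run_snuba_query(...)  # any work to be measured
#
# This emits a 'snuba.client.query' timing with the elapsed wall-clock seconds.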
def query(
    self,
    projects,
    retention_window_start,
    group_queryset,
    environments,
    sort_by,
    limit,
    cursor,
    count_hits,
    paginator_options,
    search_filters,
    date_from,
    date_to,
    max_hits=None,
):
    now = timezone.now()
    end = None
    end_params = [
        _f for _f in [date_to, get_search_filter(search_filters, "date", "<")] if _f
    ]
    if end_params:
        end = min(end_params)

    if not end:
        end = now + ALLOWED_FUTURE_DELTA

        metrics.incr("snuba.search.postgres_only")

        # This search is for some time window that ends with "now",
        # so if the requested sort is `date` (`last_seen`) and there
        # are no other Snuba-based search predicates, we can simply
        # return the results from Postgres.
        if (
            cursor is None
            and sort_by == "date"
            and
            # This handles tags and date parameters for search filters.
            not [
                sf
                for sf in search_filters
                if sf.key.name not in self.postgres_only_fields.union(["date"])
            ]
        ):
            group_queryset = group_queryset.order_by("-last_seen")
            paginator = DateTimePaginator(group_queryset, "-last_seen", **paginator_options)
            # When it's a simple django-only search, we count_hits like normal
            return paginator.get_result(limit, cursor, count_hits=count_hits, max_hits=max_hits)

    # TODO: Presumably we only want to search back to the project's max
    # retention date, which may be closer than 90 days in the past, but
    # apparently `retention_window_start` can be None(?), so we need a
    # fallback.
    retention_date = max(
        _f for _f in [retention_window_start, now - timedelta(days=90)] if _f
    )
    start_params = [
        date_from, retention_date, get_search_filter(search_filters, "date", ">")
    ]
    start = max(_f for _f in start_params if _f)
    end = max([retention_date, end])

    if start == retention_date and end == retention_date:
        # Both `start` and `end` must have been trimmed to `retention_date`,
        # so this entire search was against a time range that is outside of
        # retention. We'll return empty results to maintain backwards compatibility
        # with Django search (for now).
        return self.empty_result

    if start >= end:
        # TODO: This maintains backwards compatibility with Django search, but
        # in the future we should find a way to notify the user that their search
        # is invalid.
        return self.empty_result

    # Here we check if all the django filters reduce the set of groups down
    # to something that we can send down to Snuba in a `group_id IN (...)`
    # clause.
    max_candidates = options.get("snuba.search.max-pre-snuba-candidates")

    with sentry_sdk.start_span(op="snuba_group_query") as span:
        group_ids = list(
            group_queryset.values_list("id", flat=True)[:max_candidates + 1]
        )
        span.set_data("Max Candidates", max_candidates)
        span.set_data("Result Size", len(group_ids))
    metrics.timing("snuba.search.num_candidates", len(group_ids))

    too_many_candidates = False
    if not group_ids:
        # no matches could possibly be found from this point on
        metrics.incr("snuba.search.no_candidates", skip_internal=False)
        return self.empty_result
    elif len(group_ids) > max_candidates:
        # If the pre-filter query didn't include anything to significantly
        # filter down the number of results (from 'first_release', 'status',
        # 'bookmarked_by', 'assigned_to', 'unassigned', or 'subscribed_by')
        # then it might have surpassed the `max_candidates`. In this case,
        # we *don't* want to pass candidates down to Snuba, and instead we
        # want Snuba to do all the filtering/sorting it can and *then* apply
        # this queryset to the results from Snuba, which we call
        # post-filtering.
        metrics.incr("snuba.search.too_many_candidates", skip_internal=False)
        too_many_candidates = True
        group_ids = []

    sort_field = self.sort_strategies[sort_by]
    chunk_growth = options.get("snuba.search.chunk-growth-rate")
    max_chunk_size = options.get("snuba.search.max-chunk-size")
    chunk_limit = limit
    offset = 0
    num_chunks = 0
    hits = self.calculate_hits(
        group_ids,
        too_many_candidates,
        sort_field,
        projects,
        retention_window_start,
        group_queryset,
        environments,
        sort_by,
        limit,
        cursor,
        count_hits,
        paginator_options,
        search_filters,
        start,
        end,
    )
    if count_hits and hits == 0:
        return self.empty_result

    paginator_results = self.empty_result
    result_groups = []
    result_group_ids = set()

    max_time = options.get("snuba.search.max-total-chunk-time-seconds")
    time_start = time.time()

    # Do smaller searches in chunks until we have enough results
    # to answer the query (or hit the end of possible results). We do
    # this because a common case for search is to return 100 groups
    # sorted by `last_seen`, and we want to avoid returning all of
    # a project's groups and then post-sorting them all in Postgres
    # when typically the first N results will do.
    while (time.time() - time_start) < max_time:
        num_chunks += 1

        # grow the chunk size on each iteration to account for huge projects
        # and weird queries, up to a max size
        chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
        # but if we have group_ids always query for at least that many items
        chunk_limit = max(chunk_limit, len(group_ids))

        # {group_id: group_score, ...}
        snuba_groups, total = self.snuba_search(
            start=start,
            end=end,
            project_ids=[p.id for p in projects],
            environment_ids=environments and [environment.id for environment in environments],
            organization_id=projects[0].organization_id,
            sort_field=sort_field,
            cursor=cursor,
            group_ids=group_ids,
            limit=chunk_limit,
            offset=offset,
            search_filters=search_filters,
        )
        metrics.timing("snuba.search.num_snuba_results", len(snuba_groups))
        count = len(snuba_groups)
        more_results = count >= limit and (offset + limit) < total
        offset += len(snuba_groups)

        if not snuba_groups:
            break

        if group_ids:
            # pre-filtered candidates were passed down to Snuba, so we're
            # finished with filtering and these are the only results. Note
            # that because we set the chunk size to at least the size of
            # the group_ids, we know we got all of them (ie there are
            # no more chunks after the first)
            result_groups = snuba_groups
            if count_hits and hits is None:
                hits = len(snuba_groups)
        else:
            # pre-filtered candidates were *not* passed down to Snuba,
            # so we need to do post-filtering to verify Sentry DB predicates
            filtered_group_ids = group_queryset.filter(
                id__in=[gid for gid, _ in snuba_groups]
            ).values_list("id", flat=True)

            group_to_score = dict(snuba_groups)
            for group_id in filtered_group_ids:
                if group_id in result_group_ids:
                    # because we're doing multiple Snuba queries, which
                    # happen outside of a transaction, there is a small possibility
                    # of groups moving around in the sort scoring underneath us,
                    # so we at least want to protect against duplicates
                    continue

                group_score = group_to_score[group_id]
                result_group_ids.add(group_id)
                result_groups.append((group_id, group_score))

        # break the query loop for one of three reasons:
        # * we started with Postgres candidates and so only do one Snuba query max
        # * the paginator is returning enough results to satisfy the query (>= the limit)
        # * there are no more groups in Snuba to post-filter
        # TODO do we actually have to rebuild this SequencePaginator every time
        # or can we just make it after we've broken out of the loop?
        paginator_results = SequencePaginator(
            [(score, id) for (id, score) in result_groups],
            reverse=True,
            **paginator_options
        ).get_result(limit, cursor, known_hits=hits, max_hits=max_hits)

        if group_ids or len(paginator_results.results) >= limit or not more_results:
            break

    # HACK: We're using the SequencePaginator to mask the complexities of going
    # back and forth between two databases. This causes a problem with pagination
    # because we're 'lying' to the SequencePaginator (it thinks it has the entire
    # result set in memory when it does not). For this reason we need to make some
    # best guesses as to whether the `prev` and `next` cursors have more results.

    if len(paginator_results.results) == limit and more_results:
        # Because we are going back and forth between DBs there is a small
        # chance that we will hand the SequencePaginator exactly `limit`
        # items. In this case the paginator will assume there are no more
        # results, so we need to override the `next` cursor's results.
        paginator_results.next.has_results = True

    if cursor is not None and (not cursor.is_prev or len(paginator_results.results) > 0):
        # If the user passed a cursor, and it isn't already a 0 result `is_prev`
        # cursor, then it's worth allowing them to go back a page to check for
        # more results.
        paginator_results.prev.has_results = True

    metrics.timing("snuba.search.num_chunks", num_chunks)

    groups = Group.objects.in_bulk(paginator_results.results)
    paginator_results.results = [
        groups[k] for k in paginator_results.results if k in groups
    ]

    return paginator_results
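# Illustrative sketch (hypothetical values, not part of the original module):
# how the chunk size grows across iterations of the search loop above. With
# limit=100, 'snuba.search.chunk-growth-rate'=1.5 and
# 'snuba.search.max-chunk-size'=10000, successive Snuba queries request roughly
# 150, 225, 337, ... rows until the paginator is satisfied, results run out, or
# 'snuba.search.max-total-chunk-time-seconds' elapses.
def preview_chunk_schedule(limit, chunk_growth, max_chunk_size, num_chunks=5):
    chunk_limit = limit
    schedule = []
    for _ in range(num_chunks):
        chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
        schedule.append(chunk_limit)
    return schedule

# preview_chunk_schedule(100, 1.5, 10000) == [150, 225, 337, 505, 757]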