def insert_data_to_database_legacy(
    data, start_time=None, from_reprocessing=False, attachments=None
):
    """
    Yet another "fast path" to ingest an event without making it go through
    Relay.

    Please consider using functions from the ingest consumer instead, or, if
    you're within tests, to use `TestCase.store_event`.
    """

    # XXX(markus): Delete this function and merge with ingest consumer logic.
    if start_time is None:
        start_time = time()

    # we might be passed some subclasses of dict that fail dumping
    if isinstance(data, CANONICAL_TYPES):
        data = dict(data.items())

    cache_timeout = 3600
    cache_key = cache_key_for_event(data)
    default_cache.set(cache_key, data, cache_timeout)

    # Attachments will be empty or None if the "event-attachments" feature
    # is turned off. For native crash reports it will still contain the
    # crash dump (e.g. minidump) so we can load it during processing.
    if attachments is not None:
        attachment_cache.set(cache_key, attachments, cache_timeout)

    task = from_reprocessing and preprocess_event_from_reprocessing or preprocess_event
    task.delay(cache_key=cache_key, start_time=start_time, event_id=data["event_id"])
def insert_data_to_database(self, data, start_time=None, from_reprocessing=False, attachments=None):
    if start_time is None:
        start_time = time()

    # we might be passed some subclasses of dict that fail dumping
    if isinstance(data, CANONICAL_TYPES):
        data = dict(data.items())

    cache_timeout = 3600
    cache_key = cache_key_for_event(data)
    default_cache.set(cache_key, data, cache_timeout)

    # Attachments will be empty or None if the "event-attachments" feature
    # is turned off. For native crash reports it will still contain the
    # crash dump (e.g. minidump) so we can load it during processing.
    if attachments is not None:
        attachment_cache.set(cache_key, attachments, cache_timeout)

    task = from_reprocessing and preprocess_event_from_reprocessing or preprocess_event
    task.delay(cache_key=cache_key, start_time=start_time, event_id=data['event_id'])
def dispatch_task(cache_key: str) -> None:
    if attachments:
        with sentry_sdk.start_span(op="ingest_consumer.set_attachment_cache"):
            attachment_objects = [
                CachedAttachment(type=attachment.pop("attachment_type"), **attachment)
                for attachment in attachments
            ]

            attachment_cache.set(
                cache_key, attachments=attachment_objects, timeout=CACHE_TIMEOUT
            )

    # Preprocess this event, which spawns either process_event or
    # save_event. Pass data explicitly to avoid fetching it again from the
    # cache.
    with sentry_sdk.start_span(op="ingest_consumer.process_event.preprocess_event"):
        preprocess_event(
            cache_key=cache_key,
            data=data,
            start_time=start_time,
            event_id=event_id,
            project=project,
        )

    # remember for 1 hour that we saved this event (deduplication protection)
    cache.set(deduplication_key, "", CACHE_TIMEOUT)

    # emit event_accepted once everything is done
    event_accepted.send_robust(ip=remote_addr, data=data, project=project, sender=process_event)
def test_attachment_outcomes(self):
    manager = EventManager(make_event(message="foo"), project=self.project)
    manager.normalize()

    a1 = CachedAttachment(name="a1", data=b"hello")
    a2 = CachedAttachment(name="a2", data=b"limited", rate_limited=True)
    a3 = CachedAttachment(name="a3", data=b"world")

    cache_key = cache_key_for_event(manager.get_data())
    attachment_cache.set(cache_key, attachments=[a1, a2, a3])

    mock_track_outcome = mock.Mock()
    with mock.patch("sentry.event_manager.track_outcome", mock_track_outcome):
        with self.feature("organizations:event-attachments"):
            manager.save(1, cache_key=cache_key)

    assert mock_track_outcome.call_count == 3

    for o in mock_track_outcome.mock_calls:
        assert o.kwargs["outcome"] == Outcome.ACCEPTED

    for o in mock_track_outcome.mock_calls[:2]:
        assert o.kwargs["category"] == DataCategory.ATTACHMENT
        assert o.kwargs["quantity"] == 5

    final = mock_track_outcome.mock_calls[2]
    assert final.kwargs["category"] == DataCategory.DEFAULT
def insert_data_to_database(self, data, start_time=None, from_reprocessing=False, attachments=None):
    if start_time is None:
        start_time = time()

    # we might be passed some subclasses of dict that fail dumping
    if isinstance(data, CANONICAL_TYPES):
        data = dict(data.items())

    cache_timeout = 3600
    cache_key = cache_key_for_event(data)
    default_cache.set(cache_key, data, cache_timeout)

    # Attachments will be empty or None if the "event-attachments" feature
    # is turned off. For native crash reports it will still contain the
    # crash dump (e.g. minidump) so we can load it during processing.
    if attachments is not None:
        attachment_cache.set(cache_key, attachments, cache_timeout)

    # NOTE: Project is bound to the context in most cases in production, which
    # is enough for us to do `projects:kafka-ingest` testing.
    project = self.context and self.context.project

    if project and features.has('projects:kafka-ingest', project=project):
        kafka.produce_sync(
            settings.KAFKA_PREPROCESS,
            value=json.dumps({
                'cache_key': cache_key,
                'start_time': start_time,
                'from_reprocessing': from_reprocessing,
                'data': data,
            }),
        )
    else:
        task = from_reprocessing and preprocess_event_from_reprocessing or preprocess_event
        task.delay(cache_key=cache_key, start_time=start_time, event_id=data['event_id'])
def insert_data_to_database(self, data, start_time=None, from_reprocessing=False, attachments=None):
    if start_time is None:
        start_time = time()

    # we might be passed some subclasses of dict that fail dumping
    if isinstance(data, DOWNGRADE_DATA_TYPES):
        data = dict(data.items())

    cache_timeout = 3600
    cache_key = u'e:{1}:{0}'.format(data['project'], data['event_id'])
    default_cache.set(cache_key, data, cache_timeout)

    # Attachments will be empty or None if the "event-attachments" feature
    # is turned off. For native crash reports it will still contain the
    # crash dump (e.g. minidump) so we can load it during processing.
    if attachments is not None:
        attachment_cache.set(cache_key, attachments, cache_timeout)

    task = from_reprocessing and preprocess_event_from_reprocessing or preprocess_event
    task.delay(cache_key=cache_key, start_time=start_time, event_id=data['event_id'])
def reprocess_event(project_id, event_id, start_time):
    from sentry.event_manager import set_tag
    from sentry.tasks.store import preprocess_event_from_reprocessing
    from sentry.ingest.ingest_consumer import CACHE_TIMEOUT

    # Take unprocessed data from old event and save it as unprocessed data
    # under a new event ID. The second step happens in pre-process. We could
    # save the "original event ID" instead and get away with writing less to
    # nodestore, but doing it this way makes the logic slightly simpler.
    node_id = _generate_unprocessed_event_node_id(project_id=project_id, event_id=event_id)

    with sentry_sdk.start_span(op="reprocess_events.nodestore.get"):
        data = nodestore.get(node_id)

    with sentry_sdk.start_span(op="reprocess_events.eventstore.get"):
        event = eventstore.get_event_by_id(project_id, event_id)

    if event is None:
        logger.error(
            "reprocessing2.event.not_found",
            extra={"project_id": project_id, "event_id": event_id},
        )
        return

    if data is None:
        logger.error(
            "reprocessing2.reprocessing_nodestore.not_found",
            extra={"project_id": project_id, "event_id": event_id},
        )

        # We have no real data for reprocessing. We assume this event goes
        # straight to save_event, and hope that the event data can be
        # reingested like that. It's better than data loss.
        #
        # XXX: Ideally we would run a "save-lite" for this that only updates
        # the group ID in-place. Like a snuba merge message.
        data = dict(event.data)

    # Step 1: Fix up the event payload for reprocessing and put it in event
    # cache/event_processing_store
    set_tag(data, "original_group_id", event.group_id)
    cache_key = event_processing_store.store(data)

    # Step 2: Copy attachments into attachment cache
    queryset = models.EventAttachment.objects.filter(
        project_id=project_id, event_id=event_id
    ).select_related("file")

    attachment_objects = []

    for attachment_id, attachment in enumerate(queryset):
        with sentry_sdk.start_span(op="reprocess_event._copy_attachment_into_cache") as span:
            span.set_data("attachment_id", attachment.id)
            attachment_objects.append(
                _copy_attachment_into_cache(
                    attachment_id=attachment_id,
                    attachment=attachment,
                    cache_key=cache_key,
                    cache_timeout=CACHE_TIMEOUT,
                )
            )

    if attachment_objects:
        with sentry_sdk.start_span(op="reprocess_event.set_attachment_meta"):
            attachment_cache.set(cache_key, attachments=attachment_objects, timeout=CACHE_TIMEOUT)

    preprocess_event_from_reprocessing(cache_key=cache_key, start_time=start_time, event_id=event_id)
def process_event(message, projects):
    payload = message["payload"]
    start_time = float(message["start_time"])
    event_id = message["event_id"]
    project_id = int(message["project_id"])
    remote_addr = message.get("remote_addr")
    attachments = message.get("attachments") or ()

    # check that we haven't already processed this event (a previous instance of the forwarder
    # died before it could commit the event queue offset)
    deduplication_key = "ev:{}:{}".format(project_id, event_id)
    if cache.get(deduplication_key) is not None:
        logger.warning(
            "pre-process-forwarder detected a duplicated event with id:%s for project:%s.",
            event_id,
            project_id,
        )
        return  # message already processed, do not reprocess

    try:
        project = projects[project_id]
    except KeyError:
        logger.error("Project for ingested event does not exist: %s", project_id)
        return

    # Parse the JSON payload. This is required to compute the cache key and
    # call process_event. The payload will be put into Kafka raw, to avoid
    # serializing it again.
    # XXX: Do not use CanonicalKeyDict here. This may break preprocess_event
    # which assumes that data passed in is a raw dictionary.
    data = json.loads(payload)

    cache_key = cache_key_for_event(data)
    default_cache.set(cache_key, data, CACHE_TIMEOUT)

    if attachments:
        attachment_objects = [
            CachedAttachment(type=attachment.pop("attachment_type"), **attachment)
            for attachment in attachments
        ]

        attachment_cache.set(cache_key, attachments=attachment_objects, timeout=CACHE_TIMEOUT)

    # Preprocess this event, which spawns either process_event or
    # save_event. Pass data explicitly to avoid fetching it again from the
    # cache.
    preprocess_event(
        cache_key=cache_key, data=data, start_time=start_time, event_id=event_id, project=project
    )

    # remember for 1 hour that we saved this event (deduplication protection)
    cache.set(deduplication_key, "", CACHE_TIMEOUT)

    # emit event_accepted once everything is done
    event_accepted.send_robust(ip=remote_addr, data=data, project=project, sender=process_event)
def _do_process_event(message, projects):
    payload = message["payload"]
    start_time = float(message["start_time"])
    event_id = message["event_id"]
    project_id = int(message["project_id"])
    remote_addr = message.get("remote_addr")
    attachments = message.get("attachments") or ()

    # check that we haven't already processed this event (a previous instance of the forwarder
    # died before it could commit the event queue offset)
    #
    # XXX(markus): I believe this code is extremely broken:
    #
    # * it practically uses memcached in prod which has no consistency
    #   guarantees (no idea how we don't run into issues there)
    #
    # * a TTL of 1h basically doesn't guarantee any deduplication at all. It
    #   just guarantees a good error message... for one hour.
    #
    # This code has been ripped from the old python store endpoint. We're
    # keeping it around because it does provide some protection against
    # reprocessing good events if a single consumer is in a restart loop.
    deduplication_key = f"ev:{project_id}:{event_id}"
    if cache.get(deduplication_key) is not None:
        logger.warning(
            "pre-process-forwarder detected a duplicated event with id:%s for project:%s.",
            event_id,
            project_id,
        )
        return  # message already processed, do not reprocess

    try:
        project = projects[project_id]
    except KeyError:
        logger.error("Project for ingested event does not exist: %s", project_id)
        return

    # Parse the JSON payload. This is required to compute the cache key and
    # call process_event. The payload will be put into Kafka raw, to avoid
    # serializing it again.
    # XXX: Do not use CanonicalKeyDict here. This may break preprocess_event
    # which assumes that data passed in is a raw dictionary.
    data = json.loads(payload)

    cache_key = event_processing_store.store(data)

    if attachments:
        attachment_objects = [
            CachedAttachment(type=attachment.pop("attachment_type"), **attachment)
            for attachment in attachments
        ]

        attachment_cache.set(cache_key, attachments=attachment_objects, timeout=CACHE_TIMEOUT)

    # Preprocess this event, which spawns either process_event or
    # save_event. Pass data explicitly to avoid fetching it again from the
    # cache.
    with sentry_sdk.start_span(op="ingest_consumer.process_event.preprocess_event"):
        preprocess_event(
            cache_key=cache_key,
            data=data,
            start_time=start_time,
            event_id=event_id,
            project=project,
        )

    # remember for 1 hour that we saved this event (deduplication protection)
    cache.set(deduplication_key, "", CACHE_TIMEOUT)

    # emit event_accepted once everything is done
    event_accepted.send_robust(ip=remote_addr, data=data, project=project, sender=process_event)
def reprocess_event(project_id, event_id, start_time):
    from sentry.tasks.store import preprocess_event_from_reprocessing
    from sentry.ingest.ingest_consumer import CACHE_TIMEOUT

    with sentry_sdk.start_span(op="reprocess_events.nodestore.get"):
        node_id = Event.generate_node_id(project_id, event_id)
        data = nodestore.get(node_id, subkey="unprocessed")
        if data is None:
            node_id = _generate_unprocessed_event_node_id(project_id=project_id, event_id=event_id)
            data = nodestore.get(node_id)

    with sentry_sdk.start_span(op="reprocess_events.eventstore.get"):
        event = eventstore.get_event_by_id(project_id, event_id)

    if event is None:
        logger.error(
            "reprocessing2.event.not_found",
            extra={"project_id": project_id, "event_id": event_id},
        )
        return

    if data is None:
        logger.error(
            "reprocessing2.reprocessing_nodestore.not_found",
            extra={"project_id": project_id, "event_id": event_id},
        )

        # We have no real data for reprocessing. We assume this event goes
        # straight to save_event, and hope that the event data can be
        # reingested like that. It's better than data loss.
        #
        # XXX: Ideally we would run a "save-lite" for this that only updates
        # the group ID in-place. Like a snuba merge message.
        data = dict(event.data)

    # Step 1: Fix up the event payload for reprocessing and put it in event
    # cache/event_processing_store
    set_path(data, "contexts", "reprocessing", "original_issue_id", value=event.group_id)
    cache_key = event_processing_store.store(data)

    # Step 2: Copy attachments into attachment cache
    queryset = models.EventAttachment.objects.filter(project_id=project_id, event_id=event_id)
    files = {f.id: f for f in models.File.objects.filter(id__in=[ea.file_id for ea in queryset])}

    attachment_objects = []

    for attachment_id, attachment in enumerate(queryset):
        with sentry_sdk.start_span(op="reprocess_event._copy_attachment_into_cache") as span:
            span.set_data("attachment_id", attachment.id)
            attachment_objects.append(
                _copy_attachment_into_cache(
                    attachment_id=attachment_id,
                    attachment=attachment,
                    file=files[attachment.file_id],
                    cache_key=cache_key,
                    cache_timeout=CACHE_TIMEOUT,
                )
            )

    if attachment_objects:
        with sentry_sdk.start_span(op="reprocess_event.set_attachment_meta"):
            attachment_cache.set(cache_key, attachments=attachment_objects, timeout=CACHE_TIMEOUT)

    preprocess_event_from_reprocessing(
        cache_key=cache_key, start_time=start_time, event_id=event_id
    )
def reprocess_event(project_id, event_id, start_time):
    from sentry.ingest.ingest_consumer import CACHE_TIMEOUT
    from sentry.lang.native.processing import get_required_attachment_types
    from sentry.tasks.store import preprocess_event_from_reprocessing

    with sentry_sdk.start_span(op="reprocess_events.nodestore.get"):
        node_id = Event.generate_node_id(project_id, event_id)
        data = nodestore.get(node_id, subkey="unprocessed")
        if data is None:
            node_id = _generate_unprocessed_event_node_id(project_id=project_id, event_id=event_id)
            data = nodestore.get(node_id)

    if data is None:
        raise CannotReprocess("reprocessing_nodestore.not_found")

    with sentry_sdk.start_span(op="reprocess_events.eventstore.get"):
        event = eventstore.get_event_by_id(project_id, event_id)

    if event is None:
        raise CannotReprocess("event.not_found")

    required_attachment_types = get_required_attachment_types(data)
    attachments = list(
        models.EventAttachment.objects.filter(
            project_id=project_id, event_id=event_id, type__in=list(required_attachment_types)
        )
    )
    missing_attachment_types = required_attachment_types - {ea.type for ea in attachments}

    if missing_attachment_types:
        raise CannotReprocess(
            f"attachment.not_found.{'_and_'.join(sorted(missing_attachment_types))}"
        )

    # Step 1: Fix up the event payload for reprocessing and put it in event
    # cache/event_processing_store
    set_path(data, "contexts", "reprocessing", "original_issue_id", value=event.group_id)
    set_path(
        data, "contexts", "reprocessing", "original_primary_hash", value=event.get_primary_hash()
    )
    cache_key = event_processing_store.store(data)

    # Step 2: Copy attachments into attachment cache. Note that we can only
    # consider minidumps because filestore just stays as-is after reprocessing
    # (we simply update group_id on the EventAttachment models in post_process)
    attachment_objects = []

    files = {f.id: f for f in models.File.objects.filter(id__in=[ea.file_id for ea in attachments])}

    for attachment_id, attachment in enumerate(attachments):
        with sentry_sdk.start_span(op="reprocess_event._copy_attachment_into_cache") as span:
            span.set_data("attachment_id", attachment.id)
            attachment_objects.append(
                _copy_attachment_into_cache(
                    attachment_id=attachment_id,
                    attachment=attachment,
                    file=files[attachment.file_id],
                    cache_key=cache_key,
                    cache_timeout=CACHE_TIMEOUT,
                )
            )

    if attachment_objects:
        with sentry_sdk.start_span(op="reprocess_event.set_attachment_meta"):
            attachment_cache.set(cache_key, attachments=attachment_objects, timeout=CACHE_TIMEOUT)

    preprocess_event_from_reprocessing(
        cache_key=cache_key,
        start_time=start_time,
        event_id=event_id,
        data=data,
    )
def reprocess_event(project_id, event_id, start_time):
    node_id = _generate_unprocessed_event_node_id(project_id=project_id, event_id=event_id)

    with sentry_sdk.start_span(op="reprocess_events.nodestore.get"):
        data = nodestore.get(node_id)

    if data is None:
        return

    from sentry.event_manager import set_tag
    from sentry.tasks.store import preprocess_event_from_reprocessing
    from sentry.ingest.ingest_consumer import CACHE_TIMEOUT

    # Take unprocessed data from old event and save it as unprocessed data
    # under a new event ID. The second step happens in pre-process. We could
    # save the "original event ID" instead and get away with writing less to
    # nodestore, but doing it this way makes the logic slightly simpler.

    # Step 1: Fix up the event payload for reprocessing and put it in event
    # cache/event_processing_store
    orig_event_id = data["event_id"]
    set_tag(data, "original_event_id", orig_event_id)

    event = eventstore.get_event_by_id(project_id, orig_event_id)
    if event is None:
        return

    set_tag(data, "original_group_id", event.group_id)

    # XXX: reuse event IDs
    event_id = data["event_id"] = uuid.uuid4().hex

    cache_key = event_processing_store.store(data)

    # Step 2: Copy attachments into attachment cache
    queryset = models.EventAttachment.objects.filter(
        project_id=project_id, event_id=orig_event_id
    ).select_related("file")

    attachment_objects = []

    for attachment_id, attachment in enumerate(queryset):
        with sentry_sdk.start_span(op="reprocess_event._copy_attachment_into_cache") as span:
            span.set_data("attachment_id", attachment.id)
            attachment_objects.append(
                _copy_attachment_into_cache(
                    attachment_id=attachment_id,
                    attachment=attachment,
                    cache_key=cache_key,
                    cache_timeout=CACHE_TIMEOUT,
                )
            )

    if attachment_objects:
        with sentry_sdk.start_span(op="reprocess_event.set_attachment_meta"):
            attachment_cache.set(cache_key, attachments=attachment_objects, timeout=CACHE_TIMEOUT)

    preprocess_event_from_reprocessing(cache_key=cache_key, start_time=start_time, event_id=event_id)
def reprocess_event(project_id, event_id, start_time):
    from sentry.ingest.ingest_consumer import CACHE_TIMEOUT
    from sentry.tasks.store import preprocess_event_from_reprocessing

    reprocessable_event = pull_event_data(project_id, event_id)

    data = reprocessable_event.data
    event = reprocessable_event.event
    attachments = reprocessable_event.attachments

    # Step 1: Fix up the event payload for reprocessing and put it in event
    # cache/event_processing_store
    set_path(data, "contexts", "reprocessing", "original_issue_id", value=event.group_id)
    set_path(
        data, "contexts", "reprocessing", "original_primary_hash", value=event.get_primary_hash()
    )
    cache_key = event_processing_store.store(data)

    # Step 2: Copy attachments into attachment cache. Note that we can only
    # consider minidumps because filestore just stays as-is after reprocessing
    # (we simply update group_id on the EventAttachment models in post_process)
    attachment_objects = []

    files = {f.id: f for f in models.File.objects.filter(id__in=[ea.file_id for ea in attachments])}

    for attachment_id, attachment in enumerate(attachments):
        with sentry_sdk.start_span(op="reprocess_event._copy_attachment_into_cache") as span:
            span.set_data("attachment_id", attachment.id)
            attachment_objects.append(
                _copy_attachment_into_cache(
                    attachment_id=attachment_id,
                    attachment=attachment,
                    file=files[attachment.file_id],
                    cache_key=cache_key,
                    cache_timeout=CACHE_TIMEOUT,
                )
            )

    if attachment_objects:
        with sentry_sdk.start_span(op="reprocess_event.set_attachment_meta"):
            attachment_cache.set(cache_key, attachments=attachment_objects, timeout=CACHE_TIMEOUT)

    preprocess_event_from_reprocessing(
        cache_key=cache_key,
        start_time=start_time,
        event_id=event_id,
        data=data,
    )