def test_simple(self):
    """End-to-end: deleting a group purges related models and nodestore payloads."""
    first_event_id = "a" * 32
    second_event_id = "b" * 32
    project = self.create_project()

    first_node_id = Event.generate_node_id(project.id, first_event_id)
    second_node_id = Event.generate_node_id(project.id, second_event_id)

    event = self.store_event(
        data={
            "event_id": first_event_id,
            "timestamp": iso_format(before_now(minutes=1)),
            "fingerprint": ["group1"],
        },
        project_id=project.id,
    )
    self.store_event(
        data={
            "event_id": second_event_id,
            "timestamp": iso_format(before_now(minutes=1)),
            "fingerprint": ["group1"],
        },
        project_id=project.id,
    )

    group = event.group
    group.update(status=GroupStatus.PENDING_DELETION)

    # Attach every related model so the cleanup can be verified below.
    GroupAssignee.objects.create(group=group, project=project, user=self.user)
    GroupHash.objects.create(project=project, group=group, hash=uuid4().hex)
    GroupMeta.objects.create(group=group, key="foo", value="bar")
    GroupRedirect.objects.create(group_id=group.id, previous_group_id=1)

    # Both payloads must exist in nodestore before deletion runs.
    assert nodestore.get(first_node_id)
    assert nodestore.get(second_node_id)

    with self.tasks():
        delete_groups(object_ids=[group.id])

    assert not GroupRedirect.objects.filter(group_id=group.id).exists()
    assert not GroupHash.objects.filter(group_id=group.id).exists()
    assert not Group.objects.filter(id=group.id).exists()
    assert not nodestore.get(first_node_id)
    assert not nodestore.get(second_node_id)
def tombstone_events(project_id, group_id, event_ids):
    """
    Delete associated per-event data: nodestore, event attachments, user
    reports. Mark the event as "tombstoned" in Snuba.

    This is not full event deletion. Snuba can still only delete entire
    groups, however we must only run this task for event IDs that we don't
    intend to reuse for reprocessed events. An event ID that is once
    tombstoned cannot be inserted over in eventstream.

    See doc comment in sentry.reprocessing2.
    """
    # NOTE(review): `group_id` is accepted but not used in this body —
    # presumably kept for task-signature compatibility; confirm with callers.
    from sentry.reprocessing2 import delete_unprocessed_events

    # Rows keyed by (project_id, event_id) rather than by group.
    models.EventAttachment.objects.filter(project_id=project_id, event_id__in=event_ids).delete()
    models.UserReport.objects.filter(project_id=project_id, event_id__in=event_ids).delete()

    # Remove from nodestore
    node_ids = [Event.generate_node_id(project_id, event_id) for event_id in event_ids]
    nodestore.delete_multi(node_ids)

    # Drop the "unprocessed" payload copies kept for reprocessing.
    delete_unprocessed_events(project_id, event_ids)

    # Tell Snuba to delete the event data.
    eventstream.tombstone_events(project_id, event_ids)
def _process_snuba_results(query_res, group: Group, user):
    """Serialize Snuba rows, attaching each row's latest event from nodestore
    when its payload is still available."""
    project_id = group.project_id

    # Map each latest_event_id to its nodestore key; fetch all payloads in one batch.
    node_id_by_event = {
        row["latest_event_id"]: Event.generate_node_id(project_id, row["latest_event_id"])
        for row in query_res
    }
    payloads = nodestore.get_multi(list(node_id_by_event.values()))

    items = []
    for row in query_res:
        item = {
            "hash": row["new_materialized_hash"],
            "eventCount": row["event_count"],
        }
        latest_id = row["latest_event_id"]
        data = payloads.get(node_id_by_event[latest_id])
        if data is not None:
            item["latestEvent"] = serialize(
                Event(project_id, latest_id, group_id=group.id, data=data),
                user,
                EventSerializer(),
            )
        items.append(item)

    return items
def handle_remaining_events(project_id, new_group_id, event_ids, remaining_events, from_timestamp, to_timestamp):
    """
    Delete or merge/move associated per-event data: nodestore, event
    attachments, user reports. Mark the event as "tombstoned" in Snuba.

    This is not full event deletion. Snuba can still only delete entire
    groups, however we must only run this task for event IDs that we don't
    intend to reuse for reprocessed events. An event ID that is once
    tombstoned cannot be inserted over in eventstream.

    See doc comment in sentry.reprocessing2.
    """
    from sentry import buffer
    from sentry.models.group import Group
    from sentry.reprocessing2 import EVENT_MODELS_TO_MIGRATE

    assert remaining_events in ("delete", "keep")

    if remaining_events == "delete":
        # Drop every per-event model row, then the nodestore payloads, then
        # tombstone in Snuba so the IDs cannot be re-inserted via eventstream.
        for cls in EVENT_MODELS_TO_MIGRATE:
            cls.objects.filter(project_id=project_id, event_id__in=event_ids).delete()

        # Remove from nodestore
        node_ids = [Event.generate_node_id(project_id, event_id) for event_id in event_ids]
        nodestore.delete_multi(node_ids)

        # Tell Snuba to delete the event data.
        eventstream.tombstone_events_unsafe(
            project_id, event_ids, from_timestamp=from_timestamp, to_timestamp=to_timestamp
        )
    elif remaining_events == "keep":
        # Re-point per-event model rows at the new group and mirror the move
        # in Snuba.
        for cls in EVENT_MODELS_TO_MIGRATE:
            cls.objects.filter(project_id=project_id, event_id__in=event_ids).update(
                group_id=new_group_id
            )

        eventstream.replace_group_unsafe(
            project_id,
            event_ids,
            new_group_id=new_group_id,
            from_timestamp=from_timestamp,
            to_timestamp=to_timestamp,
        )

        # Account for the moved events on the target group via the write-back
        # buffer rather than a direct UPDATE.
        buffer.incr(Group, {"times_seen": len(event_ids)}, {"id": new_group_id})
    else:
        # Unreachable given the assert above; kept as a defensive guard.
        raise ValueError(f"Invalid value for remaining_events: {remaining_events}")
def fetch_and_store(line):
    """Resolve one TSV line ("project_id<TAB>event_id") against nodestore and
    persist the payload; warn on stderr when the node is missing."""
    project_id, event_id = line.strip().split("\t")
    payload = nodestore.get(Event.generate_node_id(project_id, event_id))  # pylint: disable=no-member
    if payload is None:
        print("WARNING: Got None from nodestore for project / event", project_id, event_id, file=sys.stderr)
        return
    store(project_id, event_id, payload, global_output_dir)
def test_simple(self):
    """Internal SDK round trip: a captured message lands in nodestore under SENTRY_PROJECT."""
    configure_sdk()
    Hub.current.bind_client(Hub.main.client)

    with self.tasks():
        captured_id = raven.captureMessage("internal client test")

    stored = nodestore.get(Event.generate_node_id(settings.SENTRY_PROJECT, captured_id))

    assert stored["project"] == settings.SENTRY_PROJECT
    assert stored["event_id"] == captured_id
    assert stored["logentry"]["formatted"] == "internal client test"
def chunk(self):
    """
    Delete up to DEFAULT_CHUNK_SIZE of this group's events plus their
    per-event data (nodestore payloads, unprocessed payloads, attachments,
    user reports).

    Returns False once no more events are found, True when a batch was
    processed and the caller should invoke chunk() again.
    """
    conditions = []
    if self.last_event is not None:
        # Resume strictly "before" the last event seen, matching the
        # (-timestamp, -event_id) ordering below; equal timestamps are
        # disambiguated by event_id.
        conditions.extend(
            [
                ["timestamp", "<=", self.last_event.timestamp],
                [
                    ["timestamp", "<", self.last_event.timestamp],
                    ["event_id", "<", self.last_event.event_id],
                ],
            ]
        )

    events = eventstore.get_unfetched_events(
        filter=eventstore.Filter(
            conditions=conditions, project_ids=[self.project_id], group_ids=[self.group_id]
        ),
        limit=self.DEFAULT_CHUNK_SIZE,
        referrer="deletions.group",
        orderby=["-timestamp", "-event_id"],
    )

    if not events:
        return False

    # Remember the cursor for the next batch.
    self.last_event = events[-1]

    # Remove from nodestore
    node_ids = [Event.generate_node_id(self.project_id, event.event_id) for event in events]
    nodestore.delete_multi(node_ids)

    from sentry.reprocessing2 import delete_unprocessed_events

    delete_unprocessed_events(events)

    # Remove EventAttachment and UserReport *again* as those may not have a
    # group ID, therefore there may be dangling ones after "regular" model
    # deletion.
    event_ids = [event.event_id for event in events]
    models.EventAttachment.objects.filter(
        event_id__in=event_ids, project_id=self.project_id
    ).delete()
    models.UserReport.objects.filter(
        event_id__in=event_ids, project_id=self.project_id
    ).delete()

    return True
def test_encoding(self):
    """A non-JSON-serializable `extra` value is stringified, not dropped."""
    configure_sdk()
    Hub.current.bind_client(Hub.main.client)

    class NotJSONSerializable:
        pass

    with self.tasks():
        captured_id = raven.captureMessage(
            "check the req", extra={"request": NotJSONSerializable()}
        )

    stored = nodestore.get(Event.generate_node_id(settings.SENTRY_PROJECT, captured_id))

    assert stored["project"] == settings.SENTRY_PROJECT
    assert stored["logentry"]["formatted"] == "check the req"
    # The repr of the class name survives serialization.
    assert "NotJSONSerializable" in stored["extra"]["request"]
def test_recursion_breaker(self):
    # Regression test: when saving an internal event itself fails, the SDK
    # must not recurse by trying to report that failure as yet another
    # internal event.
    configure_sdk()
    Hub.current.bind_client(Hub.main.client)

    # If this test terminates at all then we avoided recursion.
    with self.tasks():
        with mock.patch(
            "sentry.event_manager.EventManager.save", side_effect=ValueError("oh no!")
        ) as save:
            event_id = raven.captureMessage("internal client test")

    # Nothing may have been persisted since save() raised.
    event = nodestore.get(Event.generate_node_id(settings.SENTRY_PROJECT, event_id))
    assert event is None

    # save() was attempted exactly once — no retry, no recursion.
    assert_mock_called_once_with_partial(
        save, settings.SENTRY_PROJECT, cache_key=u"e:{}:1".format(event_id)
    )
def test_dupe_message_id(self, eventstream_insert):
    # Saving the same event ID twice: the newest payload wins in nodestore,
    # and eventstream still receives one insert per save.
    project_id = 1
    event_id = "a" * 32
    node_id = Event.generate_node_id(project_id, event_id)

    for expected_message in ("first", "second"):
        manager = EventManager(make_event(event_id=event_id, message=expected_message))
        manager.normalize()
        manager.save(project_id)
        assert nodestore.get(node_id)["logentry"]["formatted"] == expected_message

    assert eventstream_insert.call_count == 2
def get(self, request: Request, organization) -> Response:
    """
    Generate a list of data scrubbing selectors from existing event data.

    This list is used to auto-complete settings in "Data Scrubbing" /
    "Security and Privacy" settings.
    """
    event_id = request.GET.get("eventId", None)

    # `get_projects` enforces permissions: at the org level it yields every
    # project the user can access, at the project level a single one.
    # The org-wide search for the event ID is quite slow, but we cannot fix
    # that without product redesign.
    project_ids = [project.id for project in self.get_projects(request, organization)]

    suggestions = {}

    if event_id:
        # Go to nodestore directly instead of eventstore.get_events, which
        # would not return transaction events.
        node_ids = [Event.generate_node_id(p, event_id) for p in project_ids]
        for data in filter(None, nodestore.get_multi(node_ids).values()):
            for selector in pii_selector_suggestions_from_event(data):
                bucket = suggestions.setdefault(selector["path"], [])
                if selector["value"]:
                    bucket.append(selector["value"])

    return Response(
        {
            "suggestions": [
                {"type": "value", "value": value, "examples": examples}
                for value, examples in suggestions.items()
            ]
        }
    )
def handle_remaining_events(project_id, new_group_id, event_ids, remaining_events, from_timestamp, to_timestamp):
    """
    Delete or merge/move associated per-event data: nodestore, event
    attachments, user reports. Mark the event as "tombstoned" in Snuba.

    This is not full event deletion. Snuba can still only delete entire
    groups, however we must only run this task for event IDs that we don't
    intend to reuse for reprocessed events. An event ID that is once
    tombstoned cannot be inserted over in eventstream.

    See doc comment in sentry.reprocessing2.
    """
    assert remaining_events in ("delete", "keep")

    if remaining_events == "delete":
        # Per-event rows keyed by (project_id, event_id) rather than by group.
        models.EventAttachment.objects.filter(
            project_id=project_id, event_id__in=event_ids
        ).delete()
        models.UserReport.objects.filter(project_id=project_id, event_id__in=event_ids).delete()

        # Remove from nodestore
        node_ids = [Event.generate_node_id(project_id, event_id) for event_id in event_ids]
        nodestore.delete_multi(node_ids)

        # Tell Snuba to delete the event data.
        eventstream.tombstone_events_unsafe(
            project_id, event_ids, from_timestamp=from_timestamp, to_timestamp=to_timestamp
        )
    elif remaining_events == "keep":
        # NOTE(review): unlike the "delete" branch, attachment/user-report
        # rows are not re-pointed at the new group here; only Snuba is
        # updated. Confirm whether model rows are migrated elsewhere.
        eventstream.replace_group_unsafe(
            project_id,
            event_ids,
            new_group_id=new_group_id,
            from_timestamp=from_timestamp,
            to_timestamp=to_timestamp,
        )
    else:
        # Unreachable given the assert above; kept as a defensive guard.
        raise ValueError(f"Invalid value for remaining_events: {remaining_events}")
def _process_snuba_results(query_res, group: Group, id: int, user):
    """
    Build serialized response items from Snuba grouping-breakdown rows,
    attaching each row's latest event (from nodestore) together with its
    tree-label metadata and recomputed title.

    NOTE(review): the `id` parameter shadows the builtin and is used only as
    an index into `hierarchical_tree_labels` — presumably a hierarchy level;
    renaming would change the keyword interface, so it is left as-is.
    """
    # One nodestore key per latest_event_id; payloads fetched in a single batch.
    event_ids = {
        row["latest_event_id"]: Event.generate_node_id(group.project_id, row["latest_event_id"])
        for row in query_res
    }
    node_data = nodestore.get_multi(list(event_ids.values()))

    response = []

    for row in query_res:
        response_item = {
            "hash": row["new_materialized_hash"],
            "eventCount": row["event_count"],
        }
        event_id = row["latest_event_id"]
        event_data = node_data.get(event_ids[event_id], None)

        if event_data is not None:
            event = Event(group.project_id, event_id, group_id=group.id, data=event_data)
            response_item["latestEvent"] = serialize(event, user, EventSerializer())

            # Tree label at the requested level; fall back to the last level.
            tree_label = get_path(event_data, "hierarchical_tree_labels", id) or get_path(
                event_data, "hierarchical_tree_labels", -1
            )

            # Rough approximation of what happens with Group title
            event_type = get_event_type(event.data)
            metadata = dict(event.get_event_metadata())
            metadata["current_tree_label"] = tree_label
            # Force rendering of grouping tree labels irrespective of platform
            metadata["display_title_with_tree_label"] = True

            title = event_type.get_title(metadata)
            response_item["title"] = title or event.title
            response_item["metadata"] = metadata

        response.append(response_item)

    return response
def chunk(self):
    """
    Delete up to DEFAULT_CHUNK_SIZE of this group's events together with
    their nodestore payloads, unprocessed payloads, attachments and user
    reports.

    Returns False once no more events are found, True when a batch was
    processed and the caller should invoke chunk() again.
    """
    conditions = []
    if self.last_event is not None:
        # Resume strictly "before" the last event seen, matching the
        # (-timestamp, -event_id) ordering below; equal timestamps are
        # disambiguated by event_id.
        conditions.extend([
            ["timestamp", "<=", self.last_event.timestamp],
            [
                ["timestamp", "<", self.last_event.timestamp],
                ["event_id", "<", self.last_event.event_id],
            ],
        ])

    events = eventstore.get_unfetched_events(
        filter=eventstore.Filter(
            conditions=conditions, project_ids=[self.project_id], group_ids=[self.group_id]
        ),
        limit=self.DEFAULT_CHUNK_SIZE,
        referrer="deletions.group",
        orderby=["-timestamp", "-event_id"],
    )

    if not events:
        return False

    # Remember the cursor for the next batch.
    self.last_event = events[-1]

    # Remove from nodestore
    node_ids = [Event.generate_node_id(self.project_id, event.event_id) for event in events]
    nodestore.delete_multi(node_ids)

    delete_unprocessed_events(events)

    # Remove EventAttachment and UserReport
    event_ids = [event.event_id for event in events]
    EventAttachment.objects.filter(event_id__in=event_ids, project_id=self.project_id).delete()
    UserReport.objects.filter(event_id__in=event_ids, project_id=self.project_id).delete()

    return True
def pull_event_data(project_id, event_id) -> ReprocessableEvent:
    """
    Gather everything needed to reprocess one event: the stored event, its
    original ("unprocessed") payload from nodestore, and all attachments of
    the types its payload requires.

    Raises CannotReprocess with a reason code when any piece is missing.
    """
    from sentry.lang.native.processing import get_required_attachment_types

    with sentry_sdk.start_span(op="reprocess_events.eventstore.get"):
        event = eventstore.get_event_by_id(project_id, event_id)

    if event is None:
        raise CannotReprocess("event.not_found")

    with sentry_sdk.start_span(op="reprocess_events.nodestore.get"):
        node_id = Event.generate_node_id(project_id, event_id)
        data = nodestore.get(node_id, subkey="unprocessed")
        if data is None:
            # Fall back to the legacy node ID for unprocessed payloads.
            node_id = _generate_unprocessed_event_node_id(
                project_id=project_id, event_id=event_id
            )
            data = nodestore.get(node_id)

    # Check data after checking presence of event to avoid too many instances.
    if data is None:
        raise CannotReprocess("unprocessed_event.not_found")

    required_attachment_types = get_required_attachment_types(data)
    attachments = list(
        models.EventAttachment.objects.filter(
            project_id=project_id, event_id=event_id, type__in=list(required_attachment_types)
        )
    )
    missing_attachment_types = required_attachment_types - {ea.type for ea in attachments}

    if missing_attachment_types:
        raise CannotReprocess("attachment.not_found")

    return ReprocessableEvent(event=event, data=data, attachments=attachments)
def reprocess_event(project_id, event_id, start_time):
    """
    Fetch an event's unprocessed payload and re-submit it through the
    preprocessing pipeline, copying its attachments into the attachment
    cache first.
    """
    from sentry.tasks.store import preprocess_event_from_reprocessing
    from sentry.ingest.ingest_consumer import CACHE_TIMEOUT

    with sentry_sdk.start_span(op="reprocess_events.nodestore.get"):
        node_id = Event.generate_node_id(project_id, event_id)
        data = nodestore.get(node_id, subkey="unprocessed")
        if data is None:
            # Fall back to the legacy node ID for unprocessed payloads.
            node_id = _generate_unprocessed_event_node_id(project_id=project_id, event_id=event_id)
            data = nodestore.get(node_id)

    with sentry_sdk.start_span(op="reprocess_events.eventstore.get"):
        event = eventstore.get_event_by_id(project_id, event_id)

    if event is None:
        logger.error(
            "reprocessing2.event.not_found",
            extra={"project_id": project_id, "event_id": event_id},
        )
        return

    if data is None:
        logger.error(
            "reprocessing2.reprocessing_nodestore.not_found",
            extra={"project_id": project_id, "event_id": event_id},
        )
        # We have no real data for reprocessing. We assume this event goes
        # straight to save_event, and hope that the event data can be
        # reingested like that. It's better than data loss.
        #
        # XXX: Ideally we would run a "save-lite" for this that only updates
        # the group ID in-place. Like a snuba merge message.
        data = dict(event.data)

    # Step 1: Fix up the event payload for reprocessing and put it in event
    # cache/event_processing_store
    set_path(data, "contexts", "reprocessing", "original_issue_id", value=event.group_id)
    cache_key = event_processing_store.store(data)

    # Step 2: Copy attachments into attachment cache
    queryset = models.EventAttachment.objects.filter(project_id=project_id, event_id=event_id)
    files = {f.id: f for f in models.File.objects.filter(id__in=[ea.file_id for ea in queryset])}

    attachment_objects = []

    for attachment_id, attachment in enumerate(queryset):
        with sentry_sdk.start_span(op="reprocess_event._copy_attachment_into_cache") as span:
            span.set_data("attachment_id", attachment.id)
            attachment_objects.append(
                _copy_attachment_into_cache(
                    attachment_id=attachment_id,
                    attachment=attachment,
                    file=files[attachment.file_id],
                    cache_key=cache_key,
                    cache_timeout=CACHE_TIMEOUT,
                )
            )

    if attachment_objects:
        with sentry_sdk.start_span(op="reprocess_event.set_attachment_meta"):
            attachment_cache.set(cache_key, attachments=attachment_objects, timeout=CACHE_TIMEOUT)

    preprocess_event_from_reprocessing(
        cache_key=cache_key, start_time=start_time, event_id=event_id
    )
def setUp(self):
    """Seed one project with three events (two sharing group1, one in group2)
    and attach every group-related model, so deletion tests can assert full
    cleanup afterwards."""
    super(DeleteGroupTest, self).setUp()

    self.event_id = "a" * 32
    self.event_id2 = "b" * 32
    self.event_id3 = "c" * 32
    self.project = self.create_project()

    self.event = self.store_event(
        data={
            "event_id": self.event_id,
            "tags": {"foo": "bar"},
            "timestamp": iso_format(before_now(minutes=1)),
            "fingerprint": ["group1"],
        },
        project_id=self.project.id,
    )
    self.store_event(
        data={
            "event_id": self.event_id2,
            "timestamp": iso_format(before_now(minutes=1)),
            "fingerprint": ["group1"],
        },
        project_id=self.project.id,
    )
    self.store_event(
        data={
            "event_id": self.event_id3,
            "timestamp": iso_format(before_now(minutes=1)),
            "fingerprint": ["group2"],
        },
        project_id=self.project.id,
    )

    group = self.event.group

    # One report attached via group, one via bare event ID.
    UserReport.objects.create(
        group_id=group.id, project_id=self.event.project_id, name="With group id"
    )
    UserReport.objects.create(
        event_id=self.event.event_id, project_id=self.event.project_id, name="With event id"
    )
    EventAttachment.objects.create(
        event_id=self.event.event_id,
        project_id=self.event.project_id,
        file=File.objects.create(name="hello.png", type="image/png"),
        name="hello.png",
    )

    GroupAssignee.objects.create(group=group, project=self.project, user=self.user)
    GroupHash.objects.create(project=self.project, group=group, hash=uuid4().hex)
    GroupMeta.objects.create(group=group, key="foo", value="bar")
    GroupRedirect.objects.create(group_id=group.id, previous_group_id=1)

    self.node_id = Event.generate_node_id(self.project.id, self.event_id)
    self.node_id2 = Event.generate_node_id(self.project.id, self.event_id2)
    self.node_id3 = Event.generate_node_id(self.project.id, self.event_id3)
def reprocess_event(project_id, event_id, start_time):
    """
    Pull an event's unprocessed payload, validate that all required
    attachments are present, and re-submit everything through preprocessing.

    Raises CannotReprocess with a reason code when the payload, the stored
    event, or a required attachment is missing.
    """
    from sentry.ingest.ingest_consumer import CACHE_TIMEOUT
    from sentry.lang.native.processing import get_required_attachment_types
    from sentry.tasks.store import preprocess_event_from_reprocessing

    with sentry_sdk.start_span(op="reprocess_events.nodestore.get"):
        node_id = Event.generate_node_id(project_id, event_id)
        data = nodestore.get(node_id, subkey="unprocessed")
        if data is None:
            # Fall back to the legacy node ID for unprocessed payloads.
            node_id = _generate_unprocessed_event_node_id(project_id=project_id, event_id=event_id)
            data = nodestore.get(node_id)

    if data is None:
        raise CannotReprocess("reprocessing_nodestore.not_found")

    with sentry_sdk.start_span(op="reprocess_events.eventstore.get"):
        event = eventstore.get_event_by_id(project_id, event_id)

    if event is None:
        raise CannotReprocess("event.not_found")

    required_attachment_types = get_required_attachment_types(data)
    attachments = list(
        models.EventAttachment.objects.filter(
            project_id=project_id, event_id=event_id, type__in=list(required_attachment_types)
        )
    )
    missing_attachment_types = required_attachment_types - {ea.type for ea in attachments}

    if missing_attachment_types:
        raise CannotReprocess(
            f"attachment.not_found.{'_and_'.join(sorted(missing_attachment_types))}"
        )

    # Step 1: Fix up the event payload for reprocessing and put it in event
    # cache/event_processing_store
    set_path(data, "contexts", "reprocessing", "original_issue_id", value=event.group_id)
    set_path(
        data, "contexts", "reprocessing", "original_primary_hash", value=event.get_primary_hash()
    )
    cache_key = event_processing_store.store(data)

    # Step 2: Copy attachments into attachment cache. Note that we can only
    # consider minidumps because filestore just stays as-is after reprocessing
    # (we simply update group_id on the EventAttachment models in post_process)
    attachment_objects = []

    files = {f.id: f for f in models.File.objects.filter(id__in=[ea.file_id for ea in attachments])}

    for attachment_id, attachment in enumerate(attachments):
        with sentry_sdk.start_span(op="reprocess_event._copy_attachment_into_cache") as span:
            span.set_data("attachment_id", attachment.id)
            attachment_objects.append(
                _copy_attachment_into_cache(
                    attachment_id=attachment_id,
                    attachment=attachment,
                    file=files[attachment.file_id],
                    cache_key=cache_key,
                    cache_timeout=CACHE_TIMEOUT,
                )
            )

    if attachment_objects:
        with sentry_sdk.start_span(op="reprocess_event.set_attachment_meta"):
            attachment_cache.set(cache_key, attachments=attachment_objects, timeout=CACHE_TIMEOUT)

    preprocess_event_from_reprocessing(
        cache_key=cache_key,
        start_time=start_time,
        event_id=event_id,
        data=data,
    )
def handle_remaining_events(
    project_id,
    new_group_id,
    remaining_events,
    # TODO(markus): Should be mandatory arguments.
    event_ids_redis_key=None,
    old_group_id=None,
    # TODO(markus): Deprecated arguments, can remove in next version.
    event_ids=None,
    from_timestamp=None,
    to_timestamp=None,
):
    """
    Delete or merge/move associated per-event data: nodestore, event
    attachments, user reports. Mark the event as "tombstoned" in Snuba.

    This is not full event deletion. Snuba can still only delete entire
    groups, however we must only run this task for event IDs that we don't
    intend to reuse for reprocessed events. An event ID that is once
    tombstoned cannot be inserted over in eventstream.

    See doc comment in sentry.reprocessing2.
    """
    from sentry import buffer
    from sentry.models.group import Group
    from sentry.reprocessing2 import EVENT_MODELS_TO_MIGRATE, pop_batched_events_from_redis

    if event_ids_redis_key is not None:
        # Newer callers hand over the batch via Redis rather than task args;
        # this overrides the deprecated event_ids/timestamp parameters.
        event_ids, from_timestamp, to_timestamp = pop_batched_events_from_redis(
            event_ids_redis_key
        )

    metrics.timing(
        "events.reprocessing.handle_remaining_events.batch_size",
        len(event_ids),
        sample_rate=1.0,
    )

    assert remaining_events in ("delete", "keep")

    if remaining_events == "delete":
        # Drop every per-event model row, then the nodestore payloads, then
        # tombstone in Snuba so the IDs cannot be re-inserted via eventstream.
        for cls in EVENT_MODELS_TO_MIGRATE:
            cls.objects.filter(project_id=project_id, event_id__in=event_ids).delete()

        # Remove from nodestore
        node_ids = [Event.generate_node_id(project_id, event_id) for event_id in event_ids]
        nodestore.delete_multi(node_ids)

        # Tell Snuba to delete the event data.
        eventstream.tombstone_events_unsafe(
            project_id, event_ids, from_timestamp=from_timestamp, to_timestamp=to_timestamp
        )
    elif remaining_events == "keep":
        # Re-point per-event model rows at the new group and mirror the move
        # in Snuba.
        for cls in EVENT_MODELS_TO_MIGRATE:
            cls.objects.filter(project_id=project_id, event_id__in=event_ids).update(
                group_id=new_group_id
            )

        eventstream.replace_group_unsafe(
            project_id,
            event_ids,
            new_group_id=new_group_id,
            from_timestamp=from_timestamp,
            to_timestamp=to_timestamp,
        )

        # Account for the moved events on the target group via the write-back
        # buffer rather than a direct UPDATE.
        buffer.incr(Group, {"times_seen": len(event_ids)}, {"id": new_group_id})
    else:
        # Unreachable given the assert above; kept as a defensive guard.
        raise ValueError(f"Invalid value for remaining_events: {remaining_events}")

    if old_group_id is not None:
        from sentry.reprocessing2 import mark_event_reprocessed

        # Count this batch toward the old group's reprocessing progress.
        mark_event_reprocessed(
            group_id=old_group_id, project_id=project_id, num_events=len(event_ids)
        )
def capture_nodestore_stats(cache_key, project_id, event_id):
    """
    Emit size metrics for an event's nodestore payload under two strategies:
    concatenating the unprocessed payload, and deduplicating via the
    eventstore compressor.

    Observability-only: the event payload is never modified; the only writes
    are empty dedup-marker nodes (`nodestore.set(key, {})`) used to measure
    the compressor's hit rate without storing real data.
    """
    set_current_project(project_id)

    from sentry.eventstore.compressor import deduplicate
    from sentry.eventstore.models import Event

    node_id = Event.generate_node_id(project_id, event_id)
    data = nodestore.get(node_id)

    if not data:
        metrics.incr("eventstore.compressor.error", tags={"reason": "no_data"})
        return

    old_event_size = _json_size(data)

    # Consume (read + delete) the unprocessed copy kept for reprocessing.
    unprocessed_data = event_processing_store.get(_get_unprocessed_key(cache_key))
    event_processing_store.delete_by_key(_get_unprocessed_key(cache_key))

    tags = {
        "with_reprocessing": bool(unprocessed_data),
        "platform": data.get("platform") or "none",
        "is_minidump": is_minidump_event(data),
    }

    if unprocessed_data:
        metrics.incr("nodestore_stats.with_reprocessing")

        # Strategy A: both payloads serialized side by side.
        concatenated_size = _json_size(data, unprocessed_data)
        metrics.timing("events.size.concatenated", concatenated_size, tags=tags)
        metrics.timing(
            "events.size.concatenated.ratio", concatenated_size / old_event_size, tags=tags
        )

        # Strategy B: unprocessed payload embedded in a copy of the event.
        _data = dict(data)
        _data["__nodestore_reprocessing"] = unprocessed_data
        simple_concatenated_size = _json_size(_data)
        metrics.timing("events.size.simple_concatenated", simple_concatenated_size, tags=tags)
        metrics.timing(
            "events.size.simple_concatenated.ratio",
            simple_concatenated_size / old_event_size,
            tags=tags,
        )
    else:
        metrics.incr("nodestore_stats.without_reprocessing")

    new_data, extra_keys = deduplicate(dict(data))
    total_size = event_size = _json_size(new_data)

    # Fixed: iterate dict.items() directly — the six.iteritems() Python 2
    # shim is unnecessary on a Python 3 codebase.
    for key, value in extra_keys.items():
        if nodestore.get(key) is not None:
            metrics.incr("eventstore.compressor.hits", tags=tags)
            # do not continue, nodestore.set() should bump TTL
        else:
            metrics.incr("eventstore.compressor.misses", tags=tags)
            total_size += _json_size(value)

        # key is md5sum of content
        # do not store actual value to keep prod impact to a minimum
        nodestore.set(key, {})

    metrics.timing("events.size.deduplicated", event_size, tags=tags)
    metrics.timing("events.size.deduplicated.total_written", total_size, tags=tags)
    metrics.timing("events.size.deduplicated.ratio", event_size / old_event_size, tags=tags)
    metrics.timing(
        "events.size.deduplicated.total_written.ratio", total_size / old_event_size, tags=tags
    )

    if total_size > old_event_size:
        nodestore_stats_logger.info(
            "events.size.deduplicated.details",
            extra={
                "project_id": project_id,
                "event_id": event_id,
                "total_size": total_size,
                "old_event_size": old_event_size,
            },
        )