def test_messages(self) -> None:
    processor = GroupAssigneeProcessor("sentry_groupasignee")
    metadata = KafkaMessageMetadata(
        offset=42, partition=0, timestamp=datetime(1970, 1, 1)
    )

    ret = processor.process_message(self.INSERT_MSG, metadata)
    assert ret == InsertBatch([self.PROCESSED])
    self.write_processed_messages([ret])
    ret = (
        get_cluster(StorageSetKey.EVENTS)
        .get_query_connection(ClickhouseClientSettings.QUERY)
        .execute("SELECT * FROM groupassignee_local;")
    )
    assert ret[0] == (
        42,  # offset
        0,  # deleted
        2,  # project_id
        1359,  # group_id
        datetime(2019, 9, 19, 0, 17, 55),
        1,  # user_id
        None,  # team_id
    )

    ret = processor.process_message(self.UPDATE_MSG_NO_KEY_CHANGE, metadata)
    assert ret == InsertBatch([self.PROCESSED])

    # Tests an update with a key change, which becomes two inserts:
    # a deletion and the insertion of the new row.
    ret = processor.process_message(self.UPDATE_MSG_WITH_KEY_CHANGE, metadata)
    assert ret == InsertBatch([self.DELETED, self.PROCESSED_UPDATE])

    ret = processor.process_message(self.DELETE_MSG, metadata)
    assert ret == InsertBatch([self.DELETED])

def test_messages(self) -> None:
    processor = GroupedMessageProcessor("sentry_groupedmessage")
    metadata = KafkaMessageMetadata(
        offset=42, partition=0, timestamp=datetime(1970, 1, 1)
    )

    ret = processor.process_message(self.INSERT_MSG, metadata)
    assert ret == InsertBatch([self.PROCESSED])
    write_processed_messages(self.storage, [ret])
    ret = (
        get_cluster(StorageSetKey.EVENTS)
        .get_query_connection(ClickhouseClientSettings.INSERT)
        .execute("SELECT * FROM groupedmessage_local;")
    )
    assert ret[0] == (
        42,  # offset
        0,  # deleted
        2,  # project_id
        74,  # id
        0,  # status
        datetime(2019, 6, 19, 6, 46, 28),
        datetime(2019, 6, 19, 6, 45, 32),
        datetime(2019, 6, 19, 6, 45, 32),
        None,
    )

    ret = processor.process_message(self.UPDATE_MSG, metadata)
    assert ret == InsertBatch([self.PROCESSED])

    ret = processor.process_message(self.DELETE_MSG, metadata)
    assert ret == InsertBatch([self.DELETED])

def test_metrics_processor(
    message: Mapping[str, Any],
    expected_set: Optional[Sequence[Mapping[str, Any]]],
    expected_counter: Optional[Sequence[Mapping[str, Any]]],
    expected_distributions: Optional[Sequence[Mapping[str, Any]]],
) -> None:
    settings.DISABLED_DATASETS = set()

    meta = KafkaMessageMetadata(
        offset=100, partition=1, timestamp=datetime(1970, 1, 1)
    )

    expected_set_result = (
        InsertBatch(expected_set, None) if expected_set is not None else None
    )
    assert SetsMetricsProcessor().process_message(message, meta) == expected_set_result

    expected_counter_result = (
        InsertBatch(expected_counter, None) if expected_counter is not None else None
    )
    assert (
        CounterMetricsProcessor().process_message(message, meta)
        == expected_counter_result
    )

    expected_distributions_result = (
        InsertBatch(expected_distributions, None)
        if expected_distributions is not None
        else None
    )
    assert (
        DistributionsMetricsProcessor().process_message(message, meta)
        == expected_distributions_result
    )

def test_messages(self):
    processor = GroupedMessageProcessor("sentry_groupedmessage")
    message_filter = CdcTableNameMessageFilter(postgres_table=POSTGRES_TABLE)
    metadata = KafkaMessageMetadata(
        offset=42, partition=0, timestamp=datetime(1970, 1, 1)
    )

    assert message_filter.should_drop(self.__make_msg(0, 42, self.BEGIN_MSG, []))

    assert message_filter.should_drop(self.__make_msg(0, 42, self.COMMIT_MSG, []))

    assert not message_filter.should_drop(
        self.__make_msg(
            0, 42, self.INSERT_MSG, [("table", "sentry_groupedmessage".encode())]
        )
    )
    insert_msg = json.loads(self.INSERT_MSG)
    ret = processor.process_message(insert_msg, metadata)
    assert ret == InsertBatch([self.PROCESSED])
    self.write_processed_messages([ret])
    ret = (
        get_cluster(StorageSetKey.EVENTS)
        .get_query_connection(ClickhouseClientSettings.INSERT)
        .execute("SELECT * FROM groupedmessage_local;")
    )
    assert ret[0] == (
        42,  # offset
        0,  # deleted
        2,  # project_id
        74,  # id
        0,  # status
        datetime(2019, 6, 19, 6, 46, 28),
        datetime(2019, 6, 19, 6, 45, 32),
        datetime(2019, 6, 19, 6, 45, 32),
        None,
    )

    assert not message_filter.should_drop(
        self.__make_msg(
            0, 42, self.UPDATE_MSG, [("table", "sentry_groupedmessage".encode())]
        )
    )
    update_msg = json.loads(self.UPDATE_MSG)
    ret = processor.process_message(update_msg, metadata)
    assert ret == InsertBatch([self.PROCESSED])

    assert not message_filter.should_drop(
        self.__make_msg(
            0, 42, self.DELETE_MSG, [("table", "sentry_groupedmessage".encode())]
        )
    )
    delete_msg = json.loads(self.DELETE_MSG)
    ret = processor.process_message(delete_msg, metadata)
    assert ret == InsertBatch([self.DELETED])

def process_message(
    self, value: Mapping[str, Any], metadata: KafkaMessageMetadata
) -> Optional[ProcessedMessage]:
    assert isinstance(value, dict)

    # Only record outcomes from traditional error tracking events, which
    # excludes transactions, attachments and sessions. Once TSDB defines
    # models for these, we can start recording again.
    category = value.get("category")
    if category is not None and category not in DataCategory.error_categories():
        return None

    v_uuid = value.get("event_id")
    message = {
        "org_id": value.get("org_id", 0),
        "project_id": value.get("project_id", 0),
        "key_id": value.get("key_id"),
        "timestamp": _ensure_valid_date(
            datetime.strptime(value["timestamp"], settings.PAYLOAD_DATETIME_FORMAT),
        ),
        "outcome": value["outcome"],
        "reason": _unicodify(value.get("reason")),
        "event_id": str(uuid.UUID(v_uuid)) if v_uuid is not None else None,
    }

    return InsertBatch([message], None)

def process_message(
    self, message: Tuple[int, str, Dict[Any, Any]], metadata: KafkaMessageMetadata
) -> Optional[ProcessedMessage]:
    event_dict, retention_days = self._structure_and_validate_message(message) or (
        None,
        None,
    )
    if not event_dict:
        return None

    processed: MutableMapping[str, Any] = {
        "deleted": 0,
        "retention_days": retention_days,
    }

    # The following helper functions should be able to be applied in any order.
    # At time of writing, there are no reads of the values in the `processed`
    # dictionary to inform values in other functions.
    # Ideally we continue to follow that rule.
    self._process_base_event_values(processed, event_dict)
    self._process_tags(processed, event_dict)
    self._process_measurements(processed, event_dict)
    self._process_breakdown(processed, event_dict)
    self._process_spans(processed, event_dict)
    self._process_request_data(processed, event_dict)
    self._process_sdk_data(processed, event_dict)
    processed["partition"] = metadata.partition
    processed["offset"] = metadata.offset

    # The following operation modifies the event_dict and is therefore *not* order-independent.
    self._process_contexts_and_user(processed, event_dict)

    return InsertBatch([processed], None)

def test_process_message(self) -> None:
    meta = KafkaMessageMetadata(
        offset=0, partition=0, timestamp=datetime(1970, 1, 1)
    )
    message = ReplayEvent(
        replay_id="e5e062bf2e1d4afd96fd2f90b6770431",
        title="/organizations/:orgId/issues/",
        trace_ids=[
            "36e980a9-c602-4cde-9f5d-089f15b83b5f",
            "8bea4461-d8b9-44f3-93c1-5a3cb1c4169a",
        ],
        sequence_id=0,
        timestamp=datetime.now(tz=timezone.utc).timestamp(),
        platform="python",
        dist="",
        user_name="me",
        user_id="232",
        user_email="*****@*****.**",
        ipv4="127.0.0.1",
        ipv6=None,
        environment="prod",
        release="34a554c14b68285d8a8eb6c5c4c56dfc1db9a83a",
        sdk_name="sentry.python",
        sdk_version="0.9.0",
    )

    assert ReplaysProcessor().process_message(
        message.serialize(), meta
    ) == InsertBatch([message.build_result(meta)], None)

def test_bulk_load(self) -> None:
    row = GroupAssigneeRow.from_bulk(
        {
            "project_id": "2",
            "group_id": "1359",
            "date_added": "2019-09-19 00:17:55+00",
            "user_id": "1",
            "team_id": "",
        }
    )
    write_processed_messages(
        self.storage, [InsertBatch([row.to_clickhouse()], None)]
    )
    ret = (
        self.storage.get_cluster()
        .get_query_connection(ClickhouseClientSettings.QUERY)
        .execute("SELECT * FROM groupassignee_local;")
        .results
    )
    assert ret[0] == (
        0,  # offset
        0,  # deleted
        2,  # project_id
        1359,  # group_id
        datetime(2019, 9, 19, 0, 17, 55),
        1,  # user_id
        None,  # team_id
    )

def test_bulk_load(self) -> None:
    row = GroupedMessageRow.from_bulk(
        {
            "project_id": "2",
            "id": "10",
            "status": "0",
            "last_seen": "2019-06-28 17:57:32+00",
            "first_seen": "2019-06-28 06:40:17+00",
            "active_at": "2019-06-28 06:40:17+00",
            "first_release_id": "26",
        }
    )
    write_processed_messages(self.storage, [InsertBatch([row.to_clickhouse()])])
    ret = (
        get_cluster(StorageSetKey.EVENTS)
        .get_query_connection(ClickhouseClientSettings.QUERY)
        .execute("SELECT * FROM groupedmessage_local;")
    )
    assert ret[0] == (
        0,  # offset
        0,  # deleted
        2,  # project_id
        10,  # id
        0,  # status
        datetime(2019, 6, 28, 17, 57, 32),
        datetime(2019, 6, 28, 6, 40, 17),
        datetime(2019, 6, 28, 6, 40, 17),
        26,
    )

def process_message(
    self,
    message: Tuple[int, str, InsertEvent, Any],
    metadata: KafkaMessageMetadata,
) -> Optional[ProcessedMessage]:
    """\
    Process a raw message into an insertion or replacement batch. Returns
    `None` if the event is too old to be written.
    """
    version = message[0]
    if version != 2:
        raise InvalidMessageVersion(f"Unsupported message version: {version}")

    # version 2: (2, type, data, [state])
    type_, event = message[1:3]
    if type_ == "insert":
        try:
            row = self.process_insert(event, metadata)
        except EventTooOld:
            return None
        if row is None:  # the processor cannot/does not handle this input
            return None
        return InsertBatch([row], None)
    elif type_ in REPLACEMENT_EVENT_TYPES:
        # pass raw events along to republish
        return ReplacementBatch(str(event["project_id"]), [message])
    else:
        raise InvalidMessageType(f"Invalid message type: {type_}")

def test_base_process(self):
    start, finish = self.__get_timestamps()
    message = TransactionEvent(
        event_id="e5e062bf2e1d4afd96fd2f90b6770431",
        trace_id="7400045b25c443b885914600aa83ad04",
        span_id="8841662216cc598b",
        transaction_name="/organizations/:orgId/issues/",
        status="cancelled",
        op="navigation",
        timestamp=finish,
        start_timestamp=start,
        platform="python",
        dist="",
        user_name="me",
        user_id="myself",
        user_email="*****@*****.**",
        ipv4="127.0.0.1",
        ipv6=None,
        environment="prod",
        release="34a554c14b68285d8a8eb6c5c4c56dfc1db9a83a",
        sdk_name="sentry.python",
        sdk_version="0.9.0",
        http_method="POST",
        http_referer="tagstore.something",
        geo={"country_code": "XY", "region": "fake_region", "city": "fake_city"},
    )
    meta = KafkaMessageMetadata(
        offset=1, partition=2, timestamp=datetime(1970, 1, 1)
    )

    assert TransactionsMessageProcessor().process_message(
        message.serialize(), meta
    ) == InsertBatch([message.build_result(meta)])

def process_message(
    self, message: Mapping[Any, Any], metadata: KafkaMessageMetadata
) -> Optional[ProcessedMessage]:
    try:
        event_dict, retention_days = self._structure_and_validate_message(
            message
        ) or (
            None,
            None,
        )
        if not event_dict:
            return None

        processed: MutableMapping[str, Any] = {
            "retention_days": retention_days,
        }

        # The following helper functions should be able to be applied in any order.
        # At time of writing, there are no reads of the values in the `processed`
        # dictionary to inform values in other functions.
        # Ideally we continue to follow that rule.
        self._process_base_event_values(processed, event_dict)
        self._process_tags(processed, event_dict)
        self._process_sdk_data(processed, event_dict)
        processed["partition"] = metadata.partition
        processed["offset"] = metadata.offset

        # The following operation modifies the event_dict and is therefore *not* order-independent.
        self._process_user(processed, event_dict)
        return InsertBatch([processed], None)
    except Exception as e:
        metrics.increment("consumer_error")
        capture_exception(e)
        return None

def test_ingest_session_event_abnormal(self):
    timestamp = datetime.now(timezone.utc)
    started = timestamp - timedelta(hours=1)

    payload = {
        "device_family": "iPhone12,3",
        "distinct_id": "b3ef3211-58a4-4b36-a9a1-5a55df0d9aaf",
        "duration": 1947.49,
        "environment": "production",
        "org_id": 1,
        "os": "iOS",
        "os_version": "13.3.1",
        "project_id": 42,
        "release": "[email protected]",
        "retention_days": 90,
        "seq": 42,
        "errors": 0,
        "session_id": "8333339f-5675-4f89-a9a0-1c935255ab58",
        "started": started.timestamp(),
        "status": "abnormal",
        "received": timestamp.timestamp(),
    }

    meta = KafkaMessageMetadata(
        offset=1, partition=2, timestamp=datetime(1970, 1, 1)
    )
    assert SessionsProcessor().process_message(payload, meta) == InsertBatch(
        [
            {
                "distinct_id": "b3ef3211-58a4-4b36-a9a1-5a55df0d9aaf",
                "quantity": 1,
                "duration": 1947490,
                "environment": "production",
                "org_id": 1,
                "project_id": 42,
                "release": "[email protected]",
                "retention_days": 90,
                "seq": 42,
                # abnormal counts as at least one error
                "errors": 1,
                "session_id": "8333339f-5675-4f89-a9a0-1c935255ab58",
                "started": started.replace(tzinfo=None),
                "status": 3,
                "received": timestamp.replace(tzinfo=None),
            }
        ]
    )

def create_event_row_for_date(self, dt: datetime) -> InsertBatch:
    return InsertBatch(
        [
            {
                "event_id": uuid.uuid4().hex,
                "project_id": 1,
                "group_id": 1,
                "deleted": 0,
                "timestamp": dt,
                "retention_days": settings.DEFAULT_RETENTION_DAYS,
            }
        ]
    )

def process_message(self, message, metadata) -> Optional[ProcessedMessage]:
    if not (isinstance(message, (list, tuple)) and len(message) >= 2):
        return None
    version = message[0]
    if version not in (0, 1, 2):
        return None
    type_, event = message[1:3]
    if type_ != "insert":
        return None

    data = event["data"]
    event_type = data.get("type")
    if event_type != "transaction":
        return None

    ret: List[MutableMapping[str, Any]] = []

    # Add the transaction span
    transaction_ctx = data["contexts"].get("trace")
    if not transaction_ctx:
        metrics.increment("missing_trace_ctx")
        return None

    # Add the transaction root span
    processed = self.__init_span(event)
    processed["span_id"] = int(transaction_ctx["span_id"], 16)
    processed["transaction_name"] = _unicodify(data.get("transaction") or "")
    processed["parent_span_id"] = (
        int(transaction_ctx["parent_span_id"], 16)
        if "parent_span_id" in transaction_ctx
        else None
    )
    processed["description"] = _unicodify(data.get("transaction") or "")
    processed["op"] = _unicodify(transaction_ctx.get("op") or "")
    status = transaction_ctx.get("status", None)
    self.__fill_status(processed, status)
    self.__fill_common(processed, event["data"])
    ret.append(processed)

    spans = data.get("spans", [])
    for span in spans:
        processed = self.__init_span(event)
        processed["span_id"] = int(span["span_id"], 16)
        processed["parent_span_id"] = int(span["parent_span_id"], 16)
        processed["description"] = span.get("description", "") or ""
        processed["op"] = span["op"]
        status = span.get("status", None)
        self.__fill_status(processed, status)
        self.__fill_common(processed, span)
        ret.append(processed)

    if ret:
        return InsertBatch(ret)
    else:
        return None

def process_message(
    self, message: Mapping[str, Any], metadata: KafkaMessageMetadata
) -> Optional[ProcessedMessage]:
    # Some old relays accidentally emit rows without release.
    if message["release"] is None:
        return None
    if message["duration"] is None:
        duration = None
    else:
        duration = _collapse_uint32(int(message["duration"] * 1000))

    # Since duration is not nullable, the max duration means no duration.
    if duration is None:
        duration = MAX_UINT32

    errors = _collapse_uint16(message["errors"]) or 0
    quantity = _collapse_uint32(message.get("quantity")) or 1

    # If a session ends in crashed or abnormal we want to make sure that
    # it counts as errored too, so we can get the number of healthy and
    # errored sessions correctly.
    if message["status"] in ("crashed", "abnormal"):
        errors = max(errors, 1)

    received = _ensure_valid_date(datetime.utcfromtimestamp(message["received"]))
    started = _ensure_valid_date(datetime.utcfromtimestamp(message["started"]))

    if started is None:
        metrics.increment("empty_started_date")
    if received is None:
        metrics.increment("empty_received_date")

    processed = {
        "session_id": str(uuid.UUID(message["session_id"])),
        "distinct_id": str(uuid.UUID(message.get("distinct_id") or NIL_UUID)),
        "quantity": quantity,
        "seq": message["seq"],
        "org_id": message["org_id"],
        "project_id": message["project_id"],
        "retention_days": message["retention_days"],
        "duration": duration,
        "status": STATUS_MAPPING[message["status"]],
        "errors": errors,
        "received": received if received is not None else datetime.now(),
        "started": started if started is not None else datetime.now(),
        "release": message["release"],
        "environment": message.get("environment") or "",
    }
    return InsertBatch([processed], None)

def process_message( self, value: Mapping[str, Any], metadata: KafkaMessageMetadata) -> Optional[ProcessedMessage]: assert isinstance(value, dict) v_uuid = value.get("event_id") reason = value.get("reason") # relays let arbitrary outcome reasons through do the topic. We # reject undesired values only in the processor so that we can # add new ones without having to update relays through the entire # chain. if value["outcome"] == OUTCOME_CLIENT_DISCARD: if reason is not None and reason not in CLIENT_DISCARD_REASONS: reason = None if (value["outcome"] != OUTCOME_ABUSE ): # we dont care about abuse outcomes for these metrics if "category" not in value: metrics.increment("missing_category") if "quantity" not in value: metrics.increment("missing_quantity") message = None try: timestamp = _ensure_valid_date( datetime.strptime(value["timestamp"], settings.PAYLOAD_DATETIME_FORMAT), ) except Exception: metrics.increment("bad_outcome_timestamp") timestamp = _ensure_valid_date(datetime.utcnow()) try: message = { "org_id": value.get("org_id", 0), "project_id": value.get("project_id", 0), "key_id": value.get("key_id"), "timestamp": timestamp, "outcome": value["outcome"], "category": value.get("category", DataCategory.ERROR), "quantity": value.get("quantity", 1), "reason": _unicodify(reason), "event_id": str(uuid.UUID(v_uuid)) if v_uuid is not None else None, } except Exception: metrics.increment("bad_outcome") return None return InsertBatch([message], None)
def process_message( self, message: Mapping[str, Any], metadata: KafkaMessageMetadata) -> Optional[ProcessedMessage]: # TODO: Support messages with multiple buckets if not self._should_process(message): return None timestamp = _ensure_valid_date( datetime.utcfromtimestamp(message["timestamp"])) assert timestamp is not None, "Invalid timestamp" keys = [] values = [] tags = message["tags"] assert isinstance(tags, Mapping), "Invalid tags type" for key, value in sorted(tags.items()): assert key.isdigit() and isinstance(value, int), "Tag key/value invalid" keys.append(int(key)) values.append(value) mat_version = (DISABLED_MATERIALIZATION_VERSION if settings.WRITE_METRICS_AGG_DIRECTLY else settings.ENABLED_MATERIALIZATION_VERSION) try: retention_days = enforce_retention(message["retention_days"], timestamp) except EventTooOld: return None processed = { "org_id": message["org_id"], "project_id": message["project_id"], "metric_id": message["metric_id"], "timestamp": timestamp, "tags.key": keys, "tags.value": values, **self._process_values(message), "materialization_version": mat_version, "retention_days": retention_days, "partition": metadata.partition, "offset": metadata.offset, } return InsertBatch([processed], None)
def process_message(
    self, message: Mapping[str, Any], metadata: KafkaMessageMetadata
) -> Optional[ProcessedMessage]:
    projects = message["request"]["body"].get("project", [])
    if not isinstance(projects, (list, tuple)):
        projects = [projects]

    processed = {
        "request_id": str(uuid.UUID(message["request"]["id"])),
        "request_body": self.__to_json_string(message["request"]["body"]),
        "referrer": message["request"]["referrer"] or "",
        "dataset": message["dataset"],
        "projects": projects,
        # TODO: This column is empty for now, we plan to use it soon as we
        # will start to write org IDs into events and allow querying by org.
        "organization": None,
        **self.__extract_query_list(message["query_list"]),
    }

    # These fields are sometimes missing from the payload. If they are missing,
    # don't add them to processed so Clickhouse sets a default value for them.
    missing_fields = {}
    timing = message.get("timing") or {}
    if timing.get("timestamp") is not None:
        missing_fields["timestamp"] = timing["timestamp"]
    if timing.get("duration_ms") is not None:
        missing_fields["duration_ms"] = timing["duration_ms"]
    if message.get("status") is not None:
        missing_fields["status"] = message["status"]

    missing_keys = set(["timestamp", "duration_ms", "status"])
    for key, val in missing_fields.items():
        if key in processed:
            missing_keys.remove(key)
        elif val is not None:
            processed[key] = val
            missing_keys.remove(key)

    if missing_keys:
        metrics.increment(
            "process.missing_fields",
            tags={"fields": ",".join(sorted(missing_keys))},
        )

    return InsertBatch([processed], None)

def process_message( self, value: Mapping[str, Any], metadata: KafkaMessageMetadata) -> Optional[ProcessedMessage]: assert isinstance(value, dict) offset = metadata.offset event = value["event"] timestamp: Optional[datetime] = None if event == "begin": messages = self._process_begin(offset) elif event == "commit": messages = self._process_commit(offset) elif event == "change": if "timestamp" in value: timestamp = parse_postgres_datetime(value["timestamp"]) table_name = value["table"] if table_name != self.pg_table: return None operation = value["kind"] if operation == "insert": messages = self._process_insert(offset, value["columnnames"], value["columnvalues"]) elif operation == "update": messages = self._process_update( offset, value["oldkeys"], value["columnnames"], value["columnvalues"], ) elif operation == "delete": messages = self._process_delete(offset, value["oldkeys"]) else: raise ValueError( "Invalid value for operation in replication log: %s" % value["kind"]) else: raise ValueError("Invalid value for event in replication log: %s" % value["event"]) if not messages: return None return InsertBatch(messages, timestamp)
def process_message(
    self, message: Mapping[str, Any], metadata: KafkaMessageMetadata
) -> Optional[ProcessedMessage]:
    try:
        retention_days = message["retention_days"]
        if retention_days not in RETENTION_DAYS_ALLOWED:
            retention_days = 30

        processed = {
            "organization_id": message["organization_id"],
            "project_id": message["project_id"],
            "transaction_id": str(UUID(message["transaction_id"])),
            "profile_id": str(UUID(message["profile_id"])),
            "received": datetime.utcfromtimestamp(message["received"]),
            "profile": message["profile"],
            "android_api_level": message.get("android_api_level"),
            "device_classification": message["device_classification"],
            "device_locale": message["device_locale"],
            "device_manufacturer": message["device_manufacturer"],
            "device_model": message["device_model"],
            "device_os_build_number": message.get("device_os_build_number"),
            "device_os_name": message["device_os_name"],
            "device_os_version": message["device_os_version"],
            "duration_ns": message["duration_ns"],
            "environment": message.get("environment"),
            "platform": message["platform"],
            "trace_id": str(UUID(message["trace_id"])),
            "transaction_name": message["transaction_name"],
            "version_name": message["version_name"],
            "version_code": message["version_code"],
            "retention_days": retention_days,
            "offset": metadata.offset,
            "partition": metadata.partition,
        }
    except ValueError:
        metrics.increment("invalid_uuid")
        return None
    except KeyError:
        metrics.increment("missing_field")
        return None
    return InsertBatch([processed], None)

def process_message(self, message, metadata) -> Optional[ProcessedMessage]:
    projects = message["request"]["body"].get("project", [])
    if not isinstance(projects, (list, tuple)):
        projects = [projects]

    processed = {
        "request_id": str(uuid.UUID(message["request"]["id"])),
        "request_body": self.__to_json_string(message["request"]["body"]),
        "referrer": message["request"]["referrer"] or "",
        "dataset": message["dataset"],
        "projects": projects,
        # TODO: This column is empty for now, we plan to use it soon as we
        # will start to write org IDs into events and allow querying by org.
        "organization": None,
        "timestamp": message["timing"]["timestamp"],
        "duration_ms": message["timing"]["duration_ms"],
        "status": message["status"],
        **self.__extract_query_list(message["query_list"]),
    }
    return InsertBatch([processed])

def process_message(self, value, metadata) -> Optional[ProcessedMessage]:
    assert isinstance(value, dict)
    offset = metadata.offset
    event = value["event"]

    if event == "begin":
        messages = self._process_begin(offset)
    elif event == "commit":
        messages = self._process_commit(offset)
    elif event == "change":
        table_name = value["table"]
        if table_name != self.pg_table:
            return None

        operation = value["kind"]
        if operation == "insert":
            messages = self._process_insert(
                offset, value["columnnames"], value["columnvalues"]
            )
        elif operation == "update":
            messages = self._process_update(
                offset,
                value["oldkeys"],
                value["columnnames"],
                value["columnvalues"],
            )
        elif operation == "delete":
            messages = self._process_delete(offset, value["oldkeys"])
        else:
            raise ValueError(
                "Invalid value for operation in replication log: %s" % value["kind"]
            )
    else:
        raise ValueError(
            "Invalid value for event in replication log: %s" % value["event"]
        )

    if not messages:
        return None

    return InsertBatch(messages)

def process_message( self, message: Mapping[str, Any], metadata: KafkaMessageMetadata) -> Optional[ProcessedMessage]: # TODO: Support messages with multiple buckets if not self._should_process(message): return None timestamp = _ensure_valid_date( datetime.utcfromtimestamp(message["timestamp"])) assert timestamp is not None keys = [] values = [] tags = message["tags"] assert isinstance(tags, Mapping) for key, value in sorted(tags.items()): assert key.isdigit() keys.append(int(key)) assert isinstance(value, int) values.append(value) processed = { "org_id": message["org_id"], "project_id": message["project_id"], "metric_id": message["metric_id"], "timestamp": timestamp, "tags.key": keys, "tags.value": values, **self._process_values(message), "materialization_version": 0, "retention_days": message["retention_days"], "partition": metadata.partition, "offset": metadata.offset, } return InsertBatch([processed], None)
def process_message( self, value: Mapping[str, Any], metadata: KafkaMessageMetadata) -> Optional[ProcessedMessage]: assert isinstance(value, dict) v_uuid = value.get("event_id") if value["outcome"] != 4: # we dont care about abuse outcomes for these metrics if "category" not in value: metrics.increment("missing_category") if "quantity" not in value: metrics.increment("missing_quantity") message = { "org_id": value.get("org_id", 0), "project_id": value.get("project_id", 0), "key_id": value.get("key_id"), "timestamp": _ensure_valid_date( datetime.strptime(value["timestamp"], settings.PAYLOAD_DATETIME_FORMAT), ), "outcome": value["outcome"], "category": value.get("category", DataCategory.ERROR), "quantity": value.get("quantity", 1), "reason": _unicodify(value.get("reason")), "event_id": str(uuid.UUID(v_uuid)) if v_uuid is not None else None, } return InsertBatch([message], None)
def process_message(self, message, metadata) -> Optional[ProcessedMessage]:
    processed = {"deleted": 0}
    if not (isinstance(message, (list, tuple)) and len(message) >= 2):
        return None
    version = message[0]
    if version not in (0, 1, 2):
        return None
    type_, event = message[1:3]
    if type_ != "insert":
        return None

    data = event["data"]
    event_type = data.get("type")
    if event_type != "transaction":
        return None
    extract_base(processed, event)
    processed["retention_days"] = enforce_retention(
        event,
        datetime.fromtimestamp(data["timestamp"]),
    )
    if not data.get("contexts", {}).get("trace"):
        return None

    transaction_ctx = data["contexts"]["trace"]
    trace_id = transaction_ctx["trace_id"]
    try:
        processed["event_id"] = str(uuid.UUID(processed["event_id"]))
        processed["trace_id"] = str(uuid.UUID(trace_id))
        processed["span_id"] = int(transaction_ctx["span_id"], 16)
        processed["transaction_op"] = _unicodify(transaction_ctx.get("op") or "")
        processed["transaction_name"] = _unicodify(data.get("transaction") or "")
        processed["start_ts"], processed["start_ms"] = self.__extract_timestamp(
            data["start_timestamp"],
        )

        status = transaction_ctx.get("status", None)
        if status:
            int_status = SPAN_STATUS_NAME_TO_CODE.get(status, UNKNOWN_SPAN_STATUS)
        else:
            int_status = UNKNOWN_SPAN_STATUS
        processed["transaction_status"] = int_status

        if data["timestamp"] - data["start_timestamp"] < 0:
            # Seems we have some negative durations in the DB
            metrics.increment("negative_duration")
    except Exception:
        # all these fields are required but we saw some events go through here
        # in the past. For now bail.
        return
    processed["finish_ts"], processed["finish_ms"] = self.__extract_timestamp(
        data["timestamp"],
    )

    duration_secs = (processed["finish_ts"] - processed["start_ts"]).total_seconds()
    processed["duration"] = max(int(duration_secs * 1000), 0)

    processed["platform"] = _unicodify(event["platform"])

    tags = _as_dict_safe(data.get("tags", None))
    processed["tags.key"], processed["tags.value"] = extract_extra_tags(tags)
    processed["_tags_flattened"] = flatten_nested_field(
        processed["tags.key"], processed["tags.value"]
    )

    promoted_tags = {col: tags[col] for col in self.PROMOTED_TAGS if col in tags}
    processed["release"] = promoted_tags.get(
        "sentry:release",
        event.get("release"),
    )
    processed["environment"] = promoted_tags.get("environment")

    contexts = _as_dict_safe(data.get("contexts", None))

    user_dict = data.get("user", data.get("sentry.interfaces.User", None)) or {}
    geo = user_dict.get("geo", None) or {}
    if "geo" not in contexts and isinstance(geo, dict):
        contexts["geo"] = geo

    measurements = data.get("measurements")
    if measurements is not None:
        try:
            (
                processed["measurements.key"],
                processed["measurements.value"],
            ) = extract_nested(measurements, lambda value: float(value["value"]))
        except Exception:
            # Not failing the event in this case just yet, because we are still
            # developing this feature.
            logger.error(
                "Invalid measurements field.",
                extra={"measurements": measurements},
                exc_info=True,
            )
    request = data.get("request", data.get("sentry.interfaces.Http", None)) or {}
    http_data: MutableMapping[str, Any] = {}
    extract_http(http_data, request)
    processed["http_method"] = http_data["http_method"]
    processed["http_referer"] = http_data["http_referer"]

    processed["contexts.key"], processed["contexts.value"] = extract_extra_contexts(
        contexts
    )
    processed["_contexts_flattened"] = flatten_nested_field(
        processed["contexts.key"], processed["contexts.value"]
    )

    processed["dist"] = _unicodify(
        promoted_tags.get("sentry:dist", data.get("dist")),
    )

    user_data = {}
    extract_user(user_data, user_dict)
    processed["user"] = promoted_tags.get("sentry:user", "")
    processed["user_name"] = user_data["username"]
    processed["user_id"] = user_data["user_id"]
    processed["user_email"] = user_data["email"]
    ip_address = _ensure_valid_ip(user_data["ip_address"])

    if ip_address:
        if ip_address.version == 4:
            processed["ip_address_v4"] = str(ip_address)
        elif ip_address.version == 6:
            processed["ip_address_v6"] = str(ip_address)

    processed["partition"] = metadata.partition
    processed["offset"] = metadata.offset

    sdk = data.get("sdk", None) or {}
    processed["sdk_name"] = _unicodify(sdk.get("name") or "")
    processed["sdk_version"] = _unicodify(sdk.get("version") or "")

    if processed["sdk_name"] == "":
        metrics.increment("missing_sdk_name")
    if processed["sdk_version"] == "":
        metrics.increment("missing_sdk_version")

    return InsertBatch([processed])

def test_error_processor() -> None:
    received_timestamp = datetime.now() - timedelta(minutes=1)
    error_timestamp = received_timestamp - timedelta(minutes=1)
    trace_id = str(uuid.uuid4())
    span_id = "deadbeef"
    error = (
        2,
        "insert",
        InsertEvent(
            {
                "organization_id": 1,
                "retention_days": 58,
                "event_id": "dcb9d002cac548c795d1c9adbfc68040",
                "group_id": 100,
                "project_id": 300688,
                "platform": "python",
                "message": "",
                "datetime": error_timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                "primary_hash": "04233d08ac90cf6fc015b1be5932e7e2",
                "data": {
                    "event_id": "dcb9d002cac548c795d1c9adbfc68040",
                    "project_id": 300688,
                    "release": None,
                    "dist": None,
                    "platform": "python",
                    "message": "",
                    "datetime": error_timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    "tags": [
                        ["handled", "no"],
                        ["level", "error"],
                        ["mechanism", "excepthook"],
                        ["runtime", "CPython 3.7.6"],
                        ["runtime.name", "CPython"],
                        ["server_name", "snuba"],
                        ["environment", "dev"],
                        ["sentry:user", "this_is_me"],
                        ["sentry:release", "4d23338017cdee67daf25f2c"],
                    ],
                    "user": {
                        "username": "******",
                        "ip_address": "127.0.0.1",
                        "id": "still_me",
                        "email": "*****@*****.**",
                        "geo": {
                            "country_code": "XY",
                            "region": "fake_region",
                            "city": "fake_city",
                        },
                    },
                    "request": {
                        "url": "http://127.0.0.1:/query",
                        "headers": [
                            ["Accept-Encoding", "identity"],
                            ["Content-Length", "398"],
                            ["Host", "127.0.0.1:"],
                            ["Referer", "tagstore.something"],
                            ["Trace", "8fa73032d-1"],
                        ],
                        "data": "",
                        "method": "POST",
                        "env": {"SERVER_PORT": "1010", "SERVER_NAME": "snuba"},
                    },
                    "_relay_processed": True,
                    "breadcrumbs": {
                        "values": [
                            {
                                "category": "snuba.utils.streams.batching",
                                "level": "info",
                                "timestamp": error_timestamp.timestamp(),
                                "data": {
                                    "asctime": error_timestamp.strftime(
                                        PAYLOAD_DATETIME_FORMAT
                                    )
                                },
                                "message": "New partitions assigned: {}",
                                "type": "default",
                            },
                            {
                                "category": "snuba.utils.streams.batching",
                                "level": "info",
                                "timestamp": error_timestamp.timestamp(),
                                "data": {
                                    "asctime": error_timestamp.strftime(
                                        PAYLOAD_DATETIME_FORMAT
                                    )
                                },
                                "message": "Flushing ",
                                "type": "default",
                            },
                            {
                                "category": "httplib",
                                "timestamp": error_timestamp.timestamp(),
                                "type": "http",
                                "data": {
                                    "url": "http://127.0.0.1:8123/",
                                    "status_code": 500,
                                    "reason": "Internal Server Error",
                                    "method": "POST",
                                },
                                "level": "info",
                            },
                        ]
                    },
                    "contexts": {
                        "runtime": {
                            "version": "3.7.6",
                            "type": "runtime",
                            "name": "CPython",
                            "build": "3.7.6",
                        },
                        "trace": {"trace_id": trace_id, "span_id": span_id},
                    },
                    "culprit": "snuba.clickhouse.http in write",
                    "exception": {
                        "values": [
                            {
                                "stacktrace": {
                                    "frames": [
                                        {
                                            "function": "<module>",
                                            "abs_path": "/usr/local/bin/snuba",
                                            "pre_context": [
                                                "from pkg_resources import load_entry_point",
                                                "",
                                                "if __name__ == '__main__':",
                                                " sys.argv[0] = re.sub(r'(-script\\.pyw?|\\.exe)?$', '', sys.argv[0])",
                                                " sys.exit(",
                                            ],
                                            "post_context": [" )"],
                                            "vars": {
                                                "__spec__": "None",
                                                "__builtins__": "<module 'builtins' (built-in)>",
                                                "__annotations__": {},
                                                "__file__": "'/usr/local/bin/snuba'",
                                                "__loader__": "<_frozen_importlib_external.SourceFileLoader object at 0x7fbbc3a36ed0>",
                                                "__requires__": "'snuba'",
                                                "__cached__": "None",
                                                "__name__": "'__main__'",
                                                "__package__": "None",
                                                "__doc__": "None",
                                            },
                                            "module": "__main__",
                                            "filename": "snuba",
                                            "lineno": 11,
                                            "in_app": False,
                                            "data": {"orig_in_app": 1},
                                            "context_line": " load_entry_point('snuba', 'console_scripts', 'snuba')()",
                                        },
                                    ]
                                },
                                "type": "ClickHouseError",
                                "module": "snuba.clickhouse.http",
                                "value": "[171] DB::Exception: Block structure mismatch",
                                "mechanism": {"type": "excepthook", "handled": False},
                            }
                        ]
                    },
                    "extra": {
                        "sys.argv": [
                            "/usr/local/bin/snuba",
                            "consumer",
                            "--dataset",
                            "transactions",
                        ]
                    },
                    "fingerprint": ["{{ default }}"],
                    "hashes": ["c8b21c571231e989060b9110a2ade7d3"],
                    "hierarchical_hashes": [
                        "04233d08ac90cf6fc015b1be5932e7e3",
                        "04233d08ac90cf6fc015b1be5932e7e4",
                    ],
                    "key_id": "537125",
                    "level": "error",
                    "location": "snuba/clickhouse/http.py",
                    "logger": "",
                    "metadata": {
                        "function": "write",
                        "type": "ClickHouseError",
                        "value": "[171] DB::Exception: Block structure mismatch",
                        "filename": "snuba/something.py",
                    },
                    "modules": {
                        "cffi": "1.13.2",
                        "ipython-genutils": "0.2.0",
                        "isodate": "0.6.0",
                    },
                    "received": received_timestamp.timestamp(),
                    "sdk": {
                        "version": "0.0.0.0.1",
                        "name": "sentry.python",
                        "packages": [
                            {"version": "0.0.0.0.1", "name": "pypi:sentry-sdk"}
                        ],
                        "integrations": [
                            "argv",
                            "atexit",
                            "dedupe",
                            "excepthook",
                            "logging",
                            "modules",
                            "stdlib",
                            "threading",
                        ],
                    },
                    "timestamp": error_timestamp.timestamp(),
                    "title": "ClickHouseError: [171] DB::Exception: Block structure mismatch",
                    "type": "error",
                    "version": "7",
                },
            }
        ),
        None,
    )
    expected_result = {
        "project_id": 300688,
        "timestamp": error_timestamp,
        "event_id": str(UUID("dcb9d002cac548c795d1c9adbfc68040")),
        "platform": "python",
        "dist": None,
        "environment": "dev",
        "release": "4d23338017cdee67daf25f2c",
        "ip_address_v4": "127.0.0.1",
        "user": "******",
        "user_name": "me",
        "user_id": "still_me",
        "user_email": "*****@*****.**",
        "sdk_name": "sentry.python",
        "sdk_version": "0.0.0.0.1",
        "http_method": "POST",
        "http_referer": "tagstore.something",
        "trace_id": trace_id,
        "span_id": int(span_id, 16),
        "tags.key": [
            "environment",
            "handled",
            "level",
            "mechanism",
            "runtime",
            "runtime.name",
            "sentry:release",
            "sentry:user",
            "server_name",
        ],
        "tags.value": [
            "dev",
            "no",
            "error",
            "excepthook",
            "CPython 3.7.6",
            "CPython",
            "4d23338017cdee67daf25f2c",
            "this_is_me",
            "snuba",
        ],
        "contexts.key": [
            "runtime.version",
            "runtime.name",
            "runtime.build",
            "trace.trace_id",
            "trace.span_id",
            "geo.country_code",
            "geo.region",
            "geo.city",
        ],
        "contexts.value": [
            "3.7.6",
            "CPython",
            "3.7.6",
            trace_id,
            span_id,
            "XY",
            "fake_region",
            "fake_city",
        ],
        "partition": 1,
        "offset": 2,
        "message_timestamp": datetime(1970, 1, 1),
        "retention_days": 90,
        "deleted": 0,
        "group_id": 100,
        "primary_hash": str(UUID("04233d08ac90cf6fc015b1be5932e7e2")),
        "hierarchical_hashes": [
            str(UUID("04233d08ac90cf6fc015b1be5932e7e3")),
            str(UUID("04233d08ac90cf6fc015b1be5932e7e4")),
        ],
        "received": received_timestamp.astimezone(pytz.utc).replace(
            tzinfo=None, microsecond=0
        ),
        "message": "",
        "title": "ClickHouseError: [171] DB::Exception: Block structure mismatch",
        "culprit": "snuba.clickhouse.http in write",
        "level": "error",
        "location": "snuba/clickhouse/http.py",
        "version": "7",
        "type": "error",
        "exception_stacks.type": ["ClickHouseError"],
        "exception_stacks.value": ["[171] DB::Exception: Block structure mismatch"],
        "exception_stacks.mechanism_type": ["excepthook"],
        "exception_stacks.mechanism_handled": [False],
        "exception_frames.abs_path": ["/usr/local/bin/snuba"],
        "exception_frames.colno": [None],
        "exception_frames.filename": ["snuba"],
        "exception_frames.lineno": [11],
        "exception_frames.in_app": [False],
        "exception_frames.package": [None],
        "exception_frames.module": ["__main__"],
        "exception_frames.function": ["<module>"],
        "exception_frames.stack_level": [0],
        "sdk_integrations": [
            "argv",
            "atexit",
            "dedupe",
            "excepthook",
            "logging",
            "modules",
            "stdlib",
            "threading",
        ],
        "modules.name": ["cffi", "ipython-genutils", "isodate"],
        "modules.version": ["1.13.2", "0.2.0", "0.6.0"],
        "transaction_name": "",
    }

    meta = KafkaMessageMetadata(
        offset=2, partition=1, timestamp=datetime(1970, 1, 1)
    )
    processor = ErrorsProcessor(
        {
            "environment": "environment",
            "sentry:release": "release",
            "sentry:dist": "dist",
            "sentry:user": "******",
            "transaction": "transaction_name",
            "level": "level",
        }
    )

    processed_message = processor.process_message(error, meta)
    expected_message = InsertBatch([expected_result], None)
    # assert on the rows first so we get a nice diff from pytest
    assert processed_message.rows[0] == expected_message.rows[0]
    assert processed_message == expected_message

from snuba.clusters.cluster import ClickhouseClientSettings
from snuba.datasets.storages import StorageKey
from snuba.datasets.storages.factory import get_writable_storage
from snuba.processor import InsertBatch
from tests.helpers import write_processed_messages

test_data = [
    pytest.param(
        StorageKey.EVENTS,
        lambda dt: InsertBatch(
            [
                {
                    "event_id": uuid.uuid4().hex,
                    "project_id": 1,
                    "group_id": 1,
                    "deleted": 0,
                    "timestamp": dt,
                    "retention_days": settings.DEFAULT_RETENTION_DAYS,
                }
            ],
            None,
        ),
        id="events",
    ),
    pytest.param(
        StorageKey.ERRORS,
        lambda dt: InsertBatch(
            [
                {
                    "event_id": str(uuid.uuid4()),
                    "project_id": 1,

"offset": 1, "project_id": 2, "id": 74, "record_deleted": 0, "status": 0, "last_seen": datetime(2019, 6, 19, 6, 46, 28, tzinfo=pytz.UTC), "first_seen": datetime(2019, 6, 19, 6, 45, 32, tzinfo=pytz.UTC), "active_at": datetime(2019, 6, 19, 6, 45, 32, tzinfo=pytz.UTC), "first_release_id": None, } test_data = [ (90, None), (100, None), (110, None), (120, InsertBatch([PROCESSED])), (210, InsertBatch([PROCESSED])), ] @pytest.mark.parametrize("xid, expected", test_data) def test_send_message(xid: int, expected: Optional[ProcessedMessage]) -> None: processor = (get_writable_storage(StorageKey.GROUPEDMESSAGES). get_table_writer().get_stream_loader().get_processor()) worker = SnapshotProcessor( processor=processor, snapshot_id=SnapshotId(str(uuid1())), transaction_data=TransactionData(xmin=Xid(100), xmax=Xid(200), xip_list=[Xid(120),
def test_streaming_consumer_strategy() -> None:
    messages = (
        Message(
            Partition(Topic("events"), 0),
            i,
            KafkaPayload(None, b"{}", None),
            datetime.now(),
        )
        for i in itertools.count()
    )

    replacements_producer = FakeConfluentKafkaProducer()

    processor = Mock()
    processor.process_message.side_effect = [
        None,
        InsertBatch([{}]),
        ReplacementBatch("key", [{}]),
    ]

    writer = Mock()
    metrics = TestingMetricsBackend()

    factory = StreamingConsumerStrategyFactory(
        None,
        processor,
        writer,
        metrics,
        max_batch_size=10,
        max_batch_time=60,
        processes=None,
        input_block_size=None,
        output_block_size=None,
        replacements_producer=replacements_producer,
        replacements_topic=Topic("replacements"),
    )

    commit_function = Mock()
    strategy = factory.create(commit_function)

    for i in range(3):
        strategy.poll()
        strategy.submit(next(messages))

    assert metrics.calls == []

    processor.process_message.side_effect = [{}]

    with pytest.raises(TypeError):
        strategy.poll()
        strategy.submit(next(messages))

    def get_number_of_insertion_metrics() -> int:
        count = 0
        for call in metrics.calls:
            if isinstance(call, Timing) and call.name == "insertions.latency_ms":
                count += 1
        return count

    expected_write_count = 1

    with assert_changes(
        get_number_of_insertion_metrics, 0, expected_write_count
    ), assert_changes(
        lambda: writer.write.call_count, 0, expected_write_count
    ), assert_changes(
        lambda: len(replacements_producer.messages), 0, 1
    ):
        strategy.close()
        strategy.join()