def test_extract_required(self):
    now = datetime.utcnow()
    event = {
        "event_id": "1" * 32,
        "project_id": 100,
        "group_id": 10,
        "datetime": now.strftime("%Y-%m-%dT%H:%M:%S.%fZ"),
    }
    output = {}

    extract_base(output, event)
    output["retention_days"] = enforce_retention(
        event,
        datetime.strptime(event["datetime"], settings.PAYLOAD_DATETIME_FORMAT),
    )
    enforce_table_writer(self.dataset).get_stream_loader().get_processor().extract_required(
        output, event
    )
    assert output == {
        "event_id": "11111111111111111111111111111111",
        "project_id": 100,
        "group_id": 10,
        "timestamp": now,
        "retention_days": settings.DEFAULT_RETENTION_DAYS,
    }
def __init_span(self, event: Mapping[str, Any]) -> MutableMapping[str, Any]:
    """
    Initializes the fields that are the same for all spans within a
    transaction.
    """
    data = event["data"]
    transaction_ctx = data["contexts"]["trace"]
    return {
        "deleted": 0,
        "project_id": event["project_id"],
        "transaction_id": str(uuid.UUID(event["event_id"])),
        "retention_days": enforce_retention(
            event, datetime.fromtimestamp(data["timestamp"])
        ),
        "transaction_span_id": int(transaction_ctx["span_id"], 16),
        "trace_id": str(uuid.UUID(transaction_ctx["trace_id"])),
        "transaction_name": _unicodify(data.get("transaction") or ""),
    }
def process_insert(
    self,
    event: Mapping[str, Any],
    metadata: Optional[KafkaMessageMetadata] = None,
) -> Optional[Mapping[str, Any]]:
    if not self._should_process(event):
        return None

    processed = {"deleted": 0}
    extract_project_id(processed, event)
    self._extract_event_id(processed, event)
    processed["retention_days"] = enforce_retention(
        event,
        datetime.strptime(event["datetime"], settings.PAYLOAD_DATETIME_FORMAT),
    )

    self.extract_required(processed, event)

    data = event.get("data", {})
    # HACK: https://sentry.io/sentry/snuba/issues/802102397/
    if not data:
        logger.error("No data for event: %s", event, exc_info=True)
        return None
    self.extract_common(processed, event, metadata)
    self.extract_custom(processed, event, metadata)

    sdk = data.get("sdk", None) or {}
    self.extract_sdk(processed, sdk)

    tags = _as_dict_safe(data.get("tags", None))
    self.extract_promoted_tags(processed, tags)
    self.extract_tags_custom(processed, event, tags, metadata)

    contexts = data.get("contexts", None) or {}
    self.extract_promoted_contexts(processed, contexts, tags)
    self.extract_contexts_custom(processed, event, contexts, metadata)

    processed["contexts.key"], processed["contexts.value"] = extract_extra_contexts(
        contexts
    )
    processed["tags.key"], processed["tags.value"] = extract_extra_tags(tags)
    processed["_tags_flattened"] = flatten_nested_field(
        processed["tags.key"], processed["tags.value"]
    )

    exception = (
        data.get("exception", data.get("sentry.interfaces.Exception", None)) or {}
    )
    stacks = exception.get("values", None) or []
    self.extract_stacktraces(processed, stacks)

    if metadata is not None:
        processed["offset"] = metadata.offset
        processed["partition"] = metadata.partition

    return processed
def process_message(
    self, message: Mapping[str, Any], metadata: KafkaMessageMetadata
) -> Optional[ProcessedMessage]:
    if not self._should_process(message):
        return None

    timestamp = _ensure_valid_date(datetime.utcfromtimestamp(message["timestamp"]))
    assert timestamp is not None

    keys = []
    values = []
    tags = message["tags"]
    assert isinstance(tags, Mapping)
    for key, value in sorted(tags.items()):
        assert key.isdigit()
        keys.append(int(key))
        assert isinstance(value, int)
        values.append(value)

    try:
        retention_days = enforce_retention(message["retention_days"], timestamp)
    except EventTooOld:
        return None

    processed = [
        {
            "org_id": _literal(message["org_id"]),
            "project_id": _literal(message["project_id"]),
            "metric_id": _literal(message["metric_id"]),
            "timestamp": _call(
                "toDateTime",
                (_literal(timestamp_to_bucket(timestamp, granularity).isoformat()),),
            ),
            "tags.key": _array_literal(keys),
            "tags.value": _array_literal(values),
            **self._process_values(message),
            "retention_days": _literal(retention_days),
            "granularity": _literal(granularity),
        }
        for granularity in self.GRANULARITIES_SECONDS
    ]
    return AggregateInsertBatch(processed, None)
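# Hedged sketch (an assumption, not the actual implementation, which is not
# shown here): the snippet above suggests timestamp_to_bucket floors a
# naive-UTC datetime to the start of its granularity window, so each
# granularity in GRANULARITIES_SECONDS yields one aggregate row.
from datetime import datetime, timedelta


def timestamp_to_bucket(timestamp: datetime, granularity: int) -> datetime:
    # Floor the seconds-since-epoch to a multiple of the granularity.
    epoch = datetime(1970, 1, 1)
    seconds = int((timestamp - epoch).total_seconds())
    return epoch + timedelta(seconds=seconds - (seconds % granularity))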
def process_message(
    self, message: Mapping[str, Any], metadata: KafkaMessageMetadata
) -> Optional[ProcessedMessage]:
    # TODO: Support messages with multiple buckets
    if not self._should_process(message):
        return None

    timestamp = _ensure_valid_date(datetime.utcfromtimestamp(message["timestamp"]))
    assert timestamp is not None, "Invalid timestamp"

    keys = []
    values = []
    tags = message["tags"]
    assert isinstance(tags, Mapping), "Invalid tags type"
    for key, value in sorted(tags.items()):
        assert key.isdigit() and isinstance(value, int), "Tag key/value invalid"
        keys.append(int(key))
        values.append(value)

    mat_version = (
        DISABLED_MATERIALIZATION_VERSION
        if settings.WRITE_METRICS_AGG_DIRECTLY
        else settings.ENABLED_MATERIALIZATION_VERSION
    )

    try:
        retention_days = enforce_retention(message["retention_days"], timestamp)
    except EventTooOld:
        return None

    processed = {
        "org_id": message["org_id"],
        "project_id": message["project_id"],
        "metric_id": message["metric_id"],
        "timestamp": timestamp,
        "tags.key": keys,
        "tags.value": values,
        **self._process_values(message),
        "materialization_version": mat_version,
        "retention_days": retention_days,
        "partition": metadata.partition,
        "offset": metadata.offset,
    }
    return InsertBatch([processed], None)
def _structure_and_validate_message(
    self, message: Mapping[Any, Any]
) -> Optional[Tuple[EventDict, RetentionDays]]:
    event = message
    data = event["data"]

    try:
        # We are purposely using a naive datetime here to work with the
        # rest of the codebase. We can be confident that clients are only
        # sending UTC dates.
        retention_days = enforce_retention(
            message["retention_days"],
            datetime.utcfromtimestamp(data["timestamp"]),
        )
    except EventTooOld:
        return None

    return event, retention_days
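# Hedged sketch of two names referenced above whose definitions are not shown.
# Both are assumptions drawn only from the annotations and except clauses:
# RetentionDays is assumed to be a plain int alias, and EventTooOld the
# exception enforce_retention raises for timestamps outside the retention
# window.
RetentionDays = int


class EventTooOld(Exception):
    """Raised by enforce_retention when an event falls outside retention."""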
def process_message(self, message, metadata) -> Optional[ProcessedMessage]:
    processed = {"deleted": 0}
    if not (isinstance(message, (list, tuple)) and len(message) >= 2):
        return None
    version = message[0]
    if version not in (0, 1, 2):
        return None
    type_, event = message[1:3]
    if type_ != "insert":
        return None

    data = event["data"]
    event_type = data.get("type")
    if event_type != "transaction":
        return None
    extract_base(processed, event)
    processed["retention_days"] = enforce_retention(
        event,
        datetime.fromtimestamp(data["timestamp"]),
    )
    if not data.get("contexts", {}).get("trace"):
        return None

    transaction_ctx = data["contexts"]["trace"]
    trace_id = transaction_ctx["trace_id"]
    try:
        processed["event_id"] = str(uuid.UUID(processed["event_id"]))
        processed["trace_id"] = str(uuid.UUID(trace_id))
        processed["span_id"] = int(transaction_ctx["span_id"], 16)
        processed["transaction_op"] = _unicodify(transaction_ctx.get("op") or "")
        processed["transaction_name"] = _unicodify(data.get("transaction") or "")
        processed["start_ts"], processed["start_ms"] = self.__extract_timestamp(
            data["start_timestamp"],
        )

        status = transaction_ctx.get("status", None)
        if status:
            int_status = SPAN_STATUS_NAME_TO_CODE.get(status, UNKNOWN_SPAN_STATUS)
        else:
            int_status = UNKNOWN_SPAN_STATUS
        processed["transaction_status"] = int_status

        if data["timestamp"] - data["start_timestamp"] < 0:
            # Seems we have some negative durations in the DB
            metrics.increment("negative_duration")
    except Exception:
        # all these fields are required but we saw some events go through here
        # in the past. For now bail.
        return

    processed["finish_ts"], processed["finish_ms"] = self.__extract_timestamp(
        data["timestamp"],
    )

    duration_secs = (processed["finish_ts"] - processed["start_ts"]).total_seconds()
    processed["duration"] = max(int(duration_secs * 1000), 0)

    processed["platform"] = _unicodify(event["platform"])

    tags = _as_dict_safe(data.get("tags", None))
    processed["tags.key"], processed["tags.value"] = extract_extra_tags(tags)
    processed["_tags_flattened"] = flatten_nested_field(
        processed["tags.key"], processed["tags.value"]
    )

    promoted_tags = {col: tags[col] for col in self.PROMOTED_TAGS if col in tags}
    processed["release"] = promoted_tags.get(
        "sentry:release",
        event.get("release"),
    )
    processed["environment"] = promoted_tags.get("environment")

    contexts = _as_dict_safe(data.get("contexts", None))

    user_dict = data.get("user", data.get("sentry.interfaces.User", None)) or {}
    geo = user_dict.get("geo", None) or {}
    if "geo" not in contexts and isinstance(geo, dict):
        contexts["geo"] = geo

    measurements = data.get("measurements")
    if measurements is not None:
        try:
            (
                processed["measurements.key"],
                processed["measurements.value"],
            ) = extract_nested(measurements, lambda value: float(value["value"]))
        except Exception:
            # Not failing the event in this case just yet, because we are still
            # developing this feature.
            logger.error(
                "Invalid measurements field.",
                extra={"measurements": measurements},
                exc_info=True,
            )

    request = data.get("request", data.get("sentry.interfaces.Http", None)) or {}
    http_data: MutableMapping[str, Any] = {}
    extract_http(http_data, request)
    processed["http_method"] = http_data["http_method"]
    processed["http_referer"] = http_data["http_referer"]

    processed["contexts.key"], processed["contexts.value"] = extract_extra_contexts(
        contexts
    )
    processed["_contexts_flattened"] = flatten_nested_field(
        processed["contexts.key"], processed["contexts.value"]
    )

    processed["dist"] = _unicodify(
        promoted_tags.get("sentry:dist", data.get("dist")),
    )

    user_data = {}
    extract_user(user_data, user_dict)
    processed["user"] = promoted_tags.get("sentry:user", "")
    processed["user_name"] = user_data["username"]
    processed["user_id"] = user_data["user_id"]
    processed["user_email"] = user_data["email"]

    ip_address = _ensure_valid_ip(user_data["ip_address"])
    if ip_address:
        if ip_address.version == 4:
            processed["ip_address_v4"] = str(ip_address)
        elif ip_address.version == 6:
            processed["ip_address_v6"] = str(ip_address)

    processed["partition"] = metadata.partition
    processed["offset"] = metadata.offset

    sdk = data.get("sdk", None) or {}
    processed["sdk_name"] = _unicodify(sdk.get("name") or "")
    processed["sdk_version"] = _unicodify(sdk.get("version") or "")

    if processed["sdk_name"] == "":
        metrics.increment("missing_sdk_name")
    if processed["sdk_version"] == "":
        metrics.increment("missing_sdk_version")

    return InsertBatch([processed])
def process_message(
    self, message: Tuple[int, str, Any], metadata: KafkaMessageMetadata
) -> Optional[ProcessedMessage]:
    processed: MutableMapping[str, Any] = {"deleted": 0}
    if not (isinstance(message, (list, tuple)) and len(message) >= 2):
        return None
    version = message[0]
    if version not in (0, 1, 2):
        return None
    type_, event = message[1:3]
    if type_ != "insert":
        return None

    data = event["data"]
    event_type = data.get("type")
    if event_type != "transaction":
        return None
    extract_base(processed, event)
    try:
        # We are purposely using a naive datetime here to work with the
        # rest of the codebase. We can be confident that clients are only
        # sending UTC dates.
        processed["retention_days"] = enforce_retention(
            event,
            datetime.utcfromtimestamp(data["timestamp"]),
        )
    except EventTooOld:
        return None
    if not data.get("contexts", {}).get("trace"):
        return None

    transaction_ctx = data["contexts"]["trace"]
    trace_id = transaction_ctx["trace_id"]
    processed["event_id"] = str(uuid.UUID(processed["event_id"]))
    processed["trace_id"] = str(uuid.UUID(trace_id))
    processed["span_id"] = int(transaction_ctx["span_id"], 16)
    processed["transaction_op"] = _unicodify(transaction_ctx.get("op") or "")
    processed["transaction_name"] = _unicodify(data.get("transaction") or "")
    processed["start_ts"], processed["start_ms"] = self.__extract_timestamp(
        data["start_timestamp"],
    )

    status = transaction_ctx.get("status", None)
    if status:
        int_status = SPAN_STATUS_NAME_TO_CODE.get(status, UNKNOWN_SPAN_STATUS)
    else:
        int_status = UNKNOWN_SPAN_STATUS
    processed["transaction_status"] = int_status

    if data["timestamp"] - data["start_timestamp"] < 0:
        # Seems we have some negative durations in the DB
        metrics.increment("negative_duration")

    processed["finish_ts"], processed["finish_ms"] = self.__extract_timestamp(
        data["timestamp"],
    )

    duration_secs = (processed["finish_ts"] - processed["start_ts"]).total_seconds()
    processed["duration"] = max(int(duration_secs * 1000), 0)

    processed["platform"] = _unicodify(event["platform"])

    tags: Mapping[str, Any] = _as_dict_safe(data.get("tags", None))
    processed["tags.key"], processed["tags.value"] = extract_extra_tags(tags)

    promoted_tags = {col: tags[col] for col in self.PROMOTED_TAGS if col in tags}
    processed["release"] = promoted_tags.get(
        "sentry:release",
        event.get("release"),
    )
    processed["environment"] = promoted_tags.get("environment")

    contexts: MutableMapping[str, Any] = _as_dict_safe(data.get("contexts", None))

    user_dict = data.get("user", data.get("sentry.interfaces.User", None)) or {}
    geo = user_dict.get("geo", None) or {}
    if "geo" not in contexts and isinstance(geo, dict):
        contexts["geo"] = geo

    measurements = data.get("measurements")
    if measurements is not None:
        try:
            (
                processed["measurements.key"],
                processed["measurements.value"],
            ) = extract_nested(
                measurements,
                lambda value: float(value["value"])
                if (
                    value is not None
                    and isinstance(value.get("value"), numbers.Number)
                )
                else None,
            )
        except Exception:
            # Not failing the event in this case just yet, because we are still
            # developing this feature.
            logger.error(
                "Invalid measurements field.",
                extra={"measurements": measurements},
                exc_info=True,
            )

    breakdowns = data.get("breakdowns")
    if breakdowns is not None:
        span_op_breakdowns = breakdowns.get("span_ops")
        if span_op_breakdowns is not None:
            try:
                (
                    processed["span_op_breakdowns.key"],
                    processed["span_op_breakdowns.value"],
                ) = extract_nested(
                    span_op_breakdowns,
                    lambda value: float(value["value"])
                    if (
                        value is not None
                        and isinstance(value.get("value"), numbers.Number)
                    )
                    else None,
                )
            except Exception:
                # Not failing the event in this case just yet, because we are still
                # developing this feature.
                logger.error(
                    "Invalid breakdowns.span_ops field.",
                    extra={"span_op_breakdowns": span_op_breakdowns},
                    exc_info=True,
                )

    request = data.get("request", data.get("sentry.interfaces.Http", None)) or {}
    http_data: MutableMapping[str, Any] = {}
    extract_http(http_data, request)
    processed["http_method"] = http_data["http_method"]
    processed["http_referer"] = http_data["http_referer"]

    skipped_contexts = settings.TRANSACT_SKIP_CONTEXT_STORE.get(
        processed["project_id"], set()
    )
    for context in skipped_contexts:
        if context in contexts:
            del contexts[context]

    processed["contexts.key"], processed["contexts.value"] = extract_extra_contexts(
        contexts
    )

    processed["dist"] = _unicodify(
        promoted_tags.get("sentry:dist", data.get("dist")),
    )

    user_data: MutableMapping[str, Any] = {}
    extract_user(user_data, user_dict)
    processed["user"] = promoted_tags.get("sentry:user", "")
    processed["user_name"] = user_data["username"]
    processed["user_id"] = user_data["user_id"]
    processed["user_email"] = user_data["email"]

    ip_address = _ensure_valid_ip(user_data["ip_address"])
    if ip_address:
        if ip_address.version == 4:
            processed["ip_address_v4"] = str(ip_address)
        elif ip_address.version == 6:
            processed["ip_address_v6"] = str(ip_address)

    processed["partition"] = metadata.partition
    processed["offset"] = metadata.offset

    sdk = data.get("sdk", None) or {}
    processed["sdk_name"] = _unicodify(sdk.get("name") or "")
    processed["sdk_version"] = _unicodify(sdk.get("version") or "")

    if processed["sdk_name"] == "":
        metrics.increment("missing_sdk_name")
    if processed["sdk_version"] == "":
        metrics.increment("missing_sdk_version")

    return InsertBatch([processed], None)
def process_message(
    self, message: Mapping[str, Any], metadata: KafkaMessageMetadata
) -> Optional[ProcessedMessage]:
    # some old relays accidentally emit rows without release
    if message["release"] is None:
        return None
    if message["duration"] is None:
        duration = None
    else:
        duration = _collapse_uint32(int(message["duration"] * 1000))

    # since duration is not nullable, the max duration means no duration
    if duration is None:
        duration = MAX_UINT32

    errors = _collapse_uint16(message["errors"]) or 0
    quantity = _collapse_uint32(message.get("quantity")) or 1

    # If a session ends in crashed or abnormal we want to make sure that
    # it counts as errored too, so we can get the number of healthy and
    # errored sessions correctly.
    if message["status"] in ("crashed", "abnormal"):
        errors = max(errors, 1)

    received = _ensure_valid_date(datetime.utcfromtimestamp(message["received"]))
    started = _ensure_valid_date(datetime.utcfromtimestamp(message["started"]))

    if started is None:
        metrics.increment("empty_started_date")
    if received is None:
        metrics.increment("empty_received_date")

    processed = {
        "session_id": str(uuid.UUID(message["session_id"])),
        "distinct_id": str(uuid.UUID(message.get("distinct_id") or NIL_UUID)),
        "quantity": quantity,
        "seq": message["seq"],
        "org_id": message["org_id"],
        "project_id": message["project_id"],
        "retention_days": enforce_retention(message["retention_days"], received),
        "duration": duration,
        "status": STATUS_MAPPING[message["status"]],
        "errors": errors,
        "received": received if received is not None else datetime.now(),
        "started": started if started is not None else datetime.now(),
        "release": message["release"],
        "environment": message.get("environment") or "",
    }
    return InsertBatch([processed], None)
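# Hedged sketch of enforce_retention, inferred only from the call sites above;
# the body below is an assumption, not the actual implementation. The call
# sites suggest it accepts either a full event dict or a raw retention_days
# value plus the event timestamp, falls back to a default (the first test
# asserts settings.DEFAULT_RETENTION_DAYS), and in the newer call sites raises
# EventTooOld when the timestamp falls outside the retention window.
from datetime import datetime, timedelta
from typing import Any, Mapping, Optional, Union


class EventTooOld(Exception):
    pass


DEFAULT_RETENTION_DAYS = 90  # stand-in for settings.DEFAULT_RETENTION_DAYS


def enforce_retention(
    event_or_retention: Union[Mapping[str, Any], Optional[int]],
    timestamp: Optional[datetime],
) -> int:
    # Accept both calling conventions seen above: an event dict carrying
    # "retention_days", or the raw retention_days value itself.
    if isinstance(event_or_retention, Mapping):
        retention_days = event_or_retention.get("retention_days")
    else:
        retention_days = event_or_retention
    if not retention_days:
        retention_days = DEFAULT_RETENTION_DAYS
    # Reject events older than their retention window.
    if timestamp is not None and timestamp < datetime.utcnow() - timedelta(
        days=retention_days
    ):
        raise EventTooOld
    return retention_days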