def test_produce_replacement_messages(self):
    producer = FakeConfluentKafkaProducer()
    replacement_topic = enforce_table_writer(
        self.dataset
    ).get_stream_loader().get_replacement_topic_spec()
    test_worker = ConsumerWorker(
        self.dataset, producer, replacement_topic.topic_name, self.metrics
    )

    test_worker.flush_batch([
        ProcessedMessage(
            action=ProcessorAction.REPLACE,
            data=[('1', {'project_id': 1})],
        ),
        ProcessedMessage(
            action=ProcessorAction.REPLACE,
            data=[('2', {'project_id': 2})],
        ),
    ])

    assert [(m._topic, m._key, m._value) for m in producer.messages] == \
        [('event-replacements', b'1', b'{"project_id": 1}'),
         ('event-replacements', b'2', b'{"project_id": 2}')]
def test_produce_replacement_messages(self):
    producer = FakeConfluentKafkaProducer()
    test_worker = ConsumerWorker(
        self.dataset,
        producer=producer,
        replacements_topic=Topic(
            enforce_table_writer(self.dataset)
            .get_stream_loader()
            .get_replacement_topic_spec()
            .topic_name
        ),
        metrics=self.metrics,
    )

    test_worker.flush_batch(
        [
            ProcessedMessage(
                action=ProcessorAction.REPLACE,
                data=[("1", {"project_id": 1})],
            ),
            ProcessedMessage(
                action=ProcessorAction.REPLACE,
                data=[("2", {"project_id": 2})],
            ),
        ]
    )

    assert [(m._topic, m._key, m._value) for m in producer.messages] == [
        ("event-replacements", b"1", b'{"project_id": 1}'),
        ("event-replacements", b"2", b'{"project_id": 2}'),
    ]
def test_v2_start_merge(self):
    project_id = 1
    message = (2, "start_merge", {"project_id": project_id})
    processor = (
        enforce_table_writer(self.dataset).get_stream_loader().get_processor()
    )
    assert processor.process_message(message) == ProcessedMessage(
        action=ProcessorAction.REPLACE,
        data=[(str(project_id), message)],
    )
def process_message(self, message, metadata=None) -> Optional[ProcessedMessage]:
    action_type = ProcessorAction.INSERT
    projects = message["request"]["body"].get("project", [])
    if not isinstance(projects, (list, tuple)):
        projects = [projects]

    processed = {
        "request_id": str(uuid.UUID(message["request"]["id"])),
        "request_body": self.__to_json_string(message["request"]["body"]),
        "referrer": message["request"]["referrer"] or "",
        "dataset": message["dataset"],
        "projects": projects,
        # TODO: This column is empty for now, we plan to use it soon as we
        # will start to write org IDs into events and allow querying by org.
        "organization": None,
        "timestamp": message["timing"]["timestamp"],
        "duration_ms": message["timing"]["duration_ms"],
        "status": message["status"],
        **self.__extract_query_list(message["query_list"]),
    }
    return ProcessedMessage(
        action=action_type,
        data=[processed],
    )
def process_message(self, value, metadata=None) -> Optional[ProcessedMessage]:
    assert isinstance(value, dict)
    v_uuid = value.get("event_id")
    message = {
        "org_id": value.get("org_id", 0),
        "project_id": value.get("project_id", 0),
        "key_id": value.get("key_id"),
        "timestamp": _ensure_valid_date(
            datetime.strptime(value["timestamp"], settings.PAYLOAD_DATETIME_FORMAT),
        ),
        "outcome": value["outcome"],
        "reason": _unicodify(value.get("reason")),
        "event_id": str(uuid.UUID(v_uuid)) if v_uuid is not None else None,
    }

    return ProcessedMessage(
        action=ProcessorAction.INSERT,
        data=[message],
    )
def process_message(self, message, metadata=None) -> Optional[ProcessedMessage]:
    # some old relays accidentally emit rows without release
    if message["release"] is None:
        return None
    if message["duration"] is None:
        duration = None
    else:
        duration = _collapse_uint32(int(message["duration"] * 1000))

    # since duration is not nullable, the max duration means no duration
    if duration is None:
        duration = MAX_UINT32

    processed = {
        "session_id": str(uuid.UUID(message["session_id"])),
        "distinct_id": str(uuid.UUID(message.get("distinct_id") or NIL_UUID)),
        "seq": message["seq"],
        "org_id": message["org_id"],
        "project_id": message["project_id"],
        "retention_days": message["retention_days"],
        "duration": duration,
        "status": STATUS_MAPPING[message["status"]],
        "errors": _collapse_uint16(message["errors"]) or 0,
        "received": _ensure_valid_date(
            datetime.utcfromtimestamp(message["received"])
        ),
        "started": _ensure_valid_date(
            datetime.utcfromtimestamp(message["started"])
        ),
        "release": message["release"],
        "environment": message.get("environment") or "",
    }
    return ProcessedMessage(action=ProcessorAction.INSERT, data=[processed])
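# A minimal, self-contained sketch (not part of the processor above) of the
# distinct_id normalization it relies on, assuming NIL_UUID is the all-zeros
# UUID string used by the sessions schema: a missing distinct_id falls back to
# the nil UUID, and any provided value is canonicalized through uuid.UUID.
import uuid

NIL_UUID = "00000000-0000-0000-0000-000000000000"  # assumed value of the constant

assert str(uuid.UUID(None or NIL_UUID)) == NIL_UUID
assert str(uuid.UUID("a" * 32)) == "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"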
def test_v2_end_delete_tag(self):
    project_id = 1
    message = (2, 'end_delete_tag', {'project_id': project_id})
    processor = enforce_table_writer(
        self.dataset
    ).get_stream_loader().get_processor()
    assert processor.process_message(message) == \
        ProcessedMessage(
            action=ProcessorAction.REPLACE,
            data=[(str(project_id), message)],
        )
def process_message(self, value, metadata) -> Optional[ProcessedMessage]:
    assert isinstance(value, dict)
    partition = metadata.partition
    assert (
        partition == KAFKA_ONLY_PARTITION
    ), "CDC can only work with single partition topics for consistency"
    offset = metadata.offset
    event = value["event"]
    if event == "begin":
        messages = self._process_begin(offset)
    elif event == "commit":
        messages = self._process_commit(offset)
    elif event == "change":
        table_name = value["table"]
        if table_name != self.pg_table:
            return None

        operation = value["kind"]
        if operation == "insert":
            messages = self._process_insert(
                offset, value["columnnames"], value["columnvalues"]
            )
        elif operation == "update":
            messages = self._process_update(
                offset,
                value["oldkeys"],
                value["columnnames"],
                value["columnvalues"],
            )
        elif operation == "delete":
            messages = self._process_delete(offset, value["oldkeys"])
        else:
            raise ValueError(
                "Invalid value for operation in replication log: %s"
                % value["kind"]
            )
    else:
        raise ValueError(
            "Invalid value for event in replication log: %s" % value["event"]
        )

    if not messages:
        return None

    return ProcessedMessage(action=ProcessorAction.INSERT, data=messages)
def process_message(self, value, metadata) -> Optional[ProcessedMessage]:
    assert isinstance(value, dict)
    partition = metadata.partition
    assert partition == KAFKA_ONLY_PARTITION, \
        'CDC can only work with single partition topics for consistency'
    offset = metadata.offset
    event = value['event']
    if event == 'begin':
        messages = self._process_begin(offset)
    elif event == 'commit':
        messages = self._process_commit(offset)
    elif event == 'change':
        table_name = value['table']
        if table_name != self.pg_table:
            return None

        operation = value['kind']
        if operation == 'insert':
            messages = self._process_insert(
                offset, value['columnnames'], value['columnvalues'])
        elif operation == 'update':
            messages = self._process_update(
                offset, value['oldkeys'], value['columnnames'], value['columnvalues'])
        elif operation == 'delete':
            messages = self._process_delete(offset, value['oldkeys'])
        else:
            raise ValueError(
                "Invalid value for operation in replication log: %s" % value['kind'])
    else:
        raise ValueError(
            "Invalid value for event in replication log: %s" % value['event'])

    if not messages:
        return None

    return ProcessedMessage(
        action=ProcessorAction.INSERT,
        data=messages,
    )
def process_message(self, message, metadata=None) -> Optional[ProcessedMessage]: """\ Process a raw message into a tuple of (action_type, processed_message): * action_type: one of the sentinel values INSERT or REPLACE * processed_message: dict representing the processed column -> value(s) Returns `None` if the event is too old to be written. """ action_type = None if isinstance(message, dict): # deprecated unwrapped event message == insert action_type = ProcessorAction.INSERT try: processed = self.process_insert(message, metadata) except EventTooOld: return None elif isinstance(message, (list, tuple)) and len(message) >= 2: version = message[0] if version in (0, 1, 2): # version 0: (0, 'insert', data) # version 1: (1, type, data, [state]) # NOTE: types 'delete_groups', 'merge' and 'unmerge' are ignored # version 2: (2, type, data, [state]) type_, event = message[1:3] if type_ == 'insert': action_type = ProcessorAction.INSERT try: processed = self.process_insert(event, metadata) except EventTooOld: return None else: if version == 0: raise InvalidMessageType( "Invalid message type: {}".format(type_)) elif version == 1: if type_ in ('delete_groups', 'merge', 'unmerge'): # these didn't contain the necessary data to handle replacements return None else: raise InvalidMessageType( "Invalid message type: {}".format(type_)) elif version == 2: # we temporarily sent these invalid message types from Sentry if type_ in ('delete_groups', 'merge'): return None if type_ in ('start_delete_groups', 'start_merge', 'start_unmerge', 'start_delete_tag', 'end_delete_groups', 'end_merge', 'end_unmerge', 'end_delete_tag'): # pass raw events along to republish action_type = ProcessorAction.REPLACE processed = (str(event['project_id']), message) else: raise InvalidMessageType( "Invalid message type: {}".format(type_)) if action_type is None: raise InvalidMessageVersion("Unknown message format: " + str(message)) if processed is None: return None return ProcessedMessage( action=action_type, data=[processed], )
def test_simple():
    request_body = {
        "selected_columns": ["event_id"],
        "orderby": "event_id",
        "sample": 0.1,
        "limit": 100,
        "offset": 50,
        "project": 1,
    }

    query = Query(
        request_body,
        get_storage("events").get_schemas().get_read_schema().get_data_source(),
    )

    request = Request(
        uuid.UUID("a" * 32).hex, query, HTTPRequestSettings(), {}, "search"
    )

    time = TestingClock()
    timer = Timer("test", clock=time)
    time.sleep(0.01)

    message = SnubaQueryMetadata(
        request=request,
        dataset=get_dataset("events"),
        timer=timer,
        query_list=[
            ClickhouseQueryMetadata(
                sql="select event_id from sentry_dist sample 0.1 prewhere project_id in (1) limit 50, 100",
                stats={"sample": 10},
                status="success",
                trace_id="b" * 32,
            )
        ],
    ).to_dict()

    processor = (
        enforce_table_writer(get_dataset("querylog"))
        .get_stream_loader()
        .get_processor()
    )

    assert processor.process_message(message) == ProcessedMessage(
        ProcessorAction.INSERT,
        [
            {
                "request_id": str(uuid.UUID("a" * 32)),
                "request_body": '{"limit": 100, "offset": 50, "orderby": "event_id", "project": 1, "sample": 0.1, "selected_columns": ["event_id"]}',
                "referrer": "search",
                "dataset": get_dataset("events"),
                "projects": [1],
                "organization": None,
                "timestamp": timer.for_json()["timestamp"],
                "duration_ms": 10,
                "status": "success",
                "clickhouse_queries.sql": [
                    "select event_id from sentry_dist sample 0.1 prewhere project_id in (1) limit 50, 100"
                ],
                "clickhouse_queries.status": ["success"],
                "clickhouse_queries.trace_id": [str(uuid.UUID("b" * 32))],
                "clickhouse_queries.duration_ms": [0],
                "clickhouse_queries.stats": ['{"sample": 10}'],
                "clickhouse_queries.final": [0],
                "clickhouse_queries.cache_hit": [0],
                "clickhouse_queries.sample": [10.0],
                "clickhouse_queries.max_threads": [0],
                "clickhouse_queries.num_days": [0],
                "clickhouse_queries.clickhouse_table": [""],
                "clickhouse_queries.query_id": [""],
                "clickhouse_queries.is_duplicate": [0],
                "clickhouse_queries.consistent": [0],
            }
        ],
    )
class TestSnapshotWorker: test_data = [ ( INSERT_MSG % { "xid": 90 }, None, ), ( INSERT_MSG % { "xid": 100 }, None, ), ( INSERT_MSG % { "xid": 110 }, None, ), ( INSERT_MSG % { "xid": 120 }, ProcessedMessage( action=ProcessorAction.INSERT, data=[PROCESSED], ), ), ( INSERT_MSG % { "xid": 210 }, ProcessedMessage( action=ProcessorAction.INSERT, data=[PROCESSED], ), ), ] @pytest.mark.parametrize("value, expected", test_data) def test_send_message( self, value: str, expected: Optional[ProcessedMessage], ) -> None: storage = get_storage("groupedmessages") snapshot_id = uuid1() transact_data = TransactionData(xmin=100, xmax=200, xip_list=[120, 130]) worker = SnapshotAwareWorker( storage=storage, producer=FakeConfluentKafkaProducer(), snapshot_id=str(snapshot_id), transaction_data=transact_data, replacements_topic=None, metrics=DummyMetricsBackend(strict=True), ) message: Message[KafkaPayload] = Message( Partition(Topic("topic"), 0), 1, KafkaPayload( None, value.encode("utf-8"), [("table", "sentry_groupedmessage".encode())], ), datetime.now(), ) ret = worker.process_message(message) assert ret == expected
def process_message(self, message, metadata=None) -> Optional[ProcessedMessage]: action_type = ProcessorAction.INSERT processed = {"deleted": 0} if not (isinstance(message, (list, tuple)) and len(message) >= 2): return None version = message[0] if version not in (0, 1, 2): return None type_, event = message[1:3] if type_ != "insert": return None data = event["data"] event_type = data.get("type") if event_type != "transaction": return None extract_base(processed, event) processed["retention_days"] = enforce_retention( event, datetime.fromtimestamp(data["timestamp"]), ) if not data.get("contexts", {}).get("trace"): return None transaction_ctx = data["contexts"]["trace"] trace_id = transaction_ctx["trace_id"] try: processed["event_id"] = str(uuid.UUID(processed["event_id"])) processed["trace_id"] = str(uuid.UUID(trace_id)) processed["span_id"] = int(transaction_ctx["span_id"], 16) processed["transaction_op"] = _unicodify( transaction_ctx.get("op", "")) processed["transaction_name"] = _unicodify(data["transaction"]) processed[ "start_ts"], processed["start_ms"] = self.__extract_timestamp( data["start_timestamp"], ) status = transaction_ctx.get("status", None) if status: int_status = SPAN_STATUS_NAME_TO_CODE.get( status, UNKNOWN_SPAN_STATUS) else: int_status = UNKNOWN_SPAN_STATUS processed["transaction_status"] = int_status if data["timestamp"] - data["start_timestamp"] < 0: # Seems we have some negative durations in the DB metrics.increment("negative_duration") except Exception: # all these fields are required but we saw some events go through here # in the past. For now bail. return processed["finish_ts"], processed[ "finish_ms"] = self.__extract_timestamp(data["timestamp"], ) duration_secs = (processed["finish_ts"] - processed["start_ts"]).total_seconds() processed["duration"] = max(int(duration_secs * 1000), 0) processed["platform"] = _unicodify(event["platform"]) tags = _as_dict_safe(data.get("tags", None)) processed["tags.key"], processed["tags.value"] = extract_extra_tags( tags) processed["_tags_flattened"] = flatten_nested_field( processed["tags.key"], processed["tags.value"]) promoted_tags = { col: tags[col] for col in self.PROMOTED_TAGS if col in tags } processed["release"] = promoted_tags.get( "sentry:release", event.get("release"), ) processed["environment"] = promoted_tags.get("environment") contexts = _as_dict_safe(data.get("contexts", None)) user_dict = data.get("user", data.get("sentry.interfaces.User", None)) or {} geo = user_dict.get("geo", None) or {} if "geo" not in contexts and isinstance(geo, dict): contexts["geo"] = geo processed["contexts.key"], processed[ "contexts.value"] = extract_extra_contexts(contexts) processed["_contexts_flattened"] = flatten_nested_field( processed["contexts.key"], processed["contexts.value"]) processed["dist"] = _unicodify( promoted_tags.get("sentry:dist", data.get("dist")), ) user_data = {} extract_user(user_data, user_dict) processed["user"] = promoted_tags.get("sentry:user", "") processed["user_name"] = user_data["username"] processed["user_id"] = user_data["user_id"] processed["user_email"] = user_data["email"] ip_address = _ensure_valid_ip(user_data["ip_address"]) if ip_address: if ip_address.version == 4: processed["ip_address_v4"] = str(ip_address) elif ip_address.version == 6: processed["ip_address_v6"] = str(ip_address) if metadata is not None: processed["partition"] = metadata.partition processed["offset"] = metadata.offset sdk = data.get("sdk", None) or {} processed["sdk_name"] = _unicodify(sdk.get("name", "")) 
processed["sdk_version"] = _unicodify(sdk.get("version", "")) if processed["sdk_name"] == "": metrics.increment("missing_sdk_name") if processed["sdk_version"] == "": metrics.increment("missing_sdk_version") return ProcessedMessage( action=action_type, data=[processed], )
def process_message(self, message, metadata=None) -> Optional[ProcessedMessage]:
    action_type = ProcessorAction.INSERT
    processed = {'deleted': 0}
    if not (isinstance(message, (list, tuple)) and len(message) >= 2):
        return None
    version = message[0]
    if version not in (0, 1, 2):
        return None
    type_, event = message[1:3]
    if type_ != 'insert':
        return None

    data = event["data"]
    event_type = data.get("type")
    if event_type != "transaction":
        return None
    extract_base(processed, event)
    processed["retention_days"] = enforce_retention(
        event,
        datetime.fromtimestamp(data['timestamp']),
    )

    transaction_ctx = data["contexts"]["trace"]
    trace_id = transaction_ctx["trace_id"]
    processed["event_id"] = str(uuid.UUID(processed["event_id"]))
    processed["trace_id"] = str(uuid.UUID(trace_id))
    processed["span_id"] = int(transaction_ctx["span_id"], 16)
    processed["transaction_op"] = _unicodify(transaction_ctx.get("op", ""))
    processed["transaction_name"] = _unicodify(data["transaction"])
    processed["start_ts"], processed["start_ms"] = self.__extract_timestamp(
        data["start_timestamp"],
    )
    processed["finish_ts"], processed["finish_ms"] = self.__extract_timestamp(
        data["timestamp"],
    )

    processed['platform'] = _unicodify(event['platform'])

    tags = _as_dict_safe(data.get('tags', None))
    extract_extra_tags(processed, tags)

    promoted_tags = {col: tags[col] for col in self.PROMOTED_TAGS if col in tags}
    processed["release"] = promoted_tags.get(
        "sentry:release",
        event.get("release"),
    )
    processed["environment"] = promoted_tags.get("environment")

    contexts = _as_dict_safe(data.get('contexts', None))
    extract_extra_contexts(processed, contexts)

    processed["dist"] = _unicodify(
        promoted_tags.get("sentry:dist", data.get("dist")),
    )

    user_data = {}
    extract_user(user_data, data.get("user", {}))
    processed["user"] = promoted_tags.get("sentry:user", "")
    processed["user_name"] = user_data["username"]
    processed["user_id"] = user_data["user_id"]
    processed["user_email"] = user_data["email"]

    ip_address = _ensure_valid_ip(user_data["ip_address"])
    if ip_address:
        if ip_address.version == 4:
            processed["ip_address_v4"] = str(ip_address)
        elif ip_address.version == 6:
            processed["ip_address_v6"] = str(ip_address)

    if metadata is not None:
        processed['partition'] = metadata.partition
        processed['offset'] = metadata.offset

    return ProcessedMessage(
        action=action_type,
        data=[processed],
    )
class TestSnapshotWorker: test_data = [ ( INSERT_MSG % { "xid": 90 }, None, ), ( INSERT_MSG % { "xid": 100 }, None, ), ( INSERT_MSG % { "xid": 110 }, None, ), (INSERT_MSG % { "xid": 120 }, ProcessedMessage( action=ProcessorAction.INSERT, data=[PROCESSED], )), (INSERT_MSG % { "xid": 210 }, ProcessedMessage( action=ProcessorAction.INSERT, data=[PROCESSED], )) ] @pytest.mark.parametrize("message, expected", test_data) def test_send_message( self, message: str, expected: Optional[ProcessedMessage], ) -> None: dataset = get_dataset("groupedmessage") snapshot_id = uuid1() transact_data = TransactionData(xmin=100, xmax=200, xip_list=[120, 130]) worker = SnapshotAwareWorker( dataset=dataset, producer=FakeConfluentKafkaProducer(), snapshot_id=str(snapshot_id), transaction_data=transact_data, replacements_topic=None, metrics=DummyMetricsBackend(strict=True), ) ret = worker.process_message( KafkaMessage( TopicPartition('topic', 0), 1, message.encode('utf-8'), )) assert ret == expected
def process_message(self, message, metadata=None) -> Optional[ProcessedMessage]:
    action_type = ProcessorAction.INSERT
    processed = {'deleted': 0}
    if not (isinstance(message, (list, tuple)) and len(message) >= 2):
        return None
    version = message[0]
    if version not in (0, 1, 2):
        return None
    type_, event = message[1:3]
    if type_ != 'insert':
        return None

    data = event["data"]
    event_type = data.get("type")
    if event_type != "transaction":
        return None
    extract_base(processed, event)
    processed["retention_days"] = enforce_retention(
        event,
        datetime.fromtimestamp(data['timestamp']),
    )
    if not data.get('contexts', {}).get('trace'):
        return None

    transaction_ctx = data["contexts"]["trace"]
    trace_id = transaction_ctx["trace_id"]
    try:
        processed["event_id"] = str(uuid.UUID(processed["event_id"]))
        processed["trace_id"] = str(uuid.UUID(trace_id))
        processed["span_id"] = int(transaction_ctx["span_id"], 16)
        processed["transaction_op"] = _unicodify(transaction_ctx.get("op", ""))
        processed["transaction_name"] = _unicodify(data["transaction"])
        processed["start_ts"], processed["start_ms"] = self.__extract_timestamp(
            data["start_timestamp"],
        )
        if data["timestamp"] - data["start_timestamp"] < 0:
            # Seems we have some negative durations in the DB
            metrics.increment('negative_duration')
    except Exception:
        # all these fields are required but we saw some events go through here
        # in the past. For now bail.
        return None
    processed["finish_ts"], processed["finish_ms"] = self.__extract_timestamp(
        data["timestamp"],
    )

    duration_secs = (processed["finish_ts"] - processed["start_ts"]).total_seconds()
    processed['duration'] = max(int(duration_secs * 1000), 0)

    processed['platform'] = _unicodify(event['platform'])

    tags = _as_dict_safe(data.get('tags', None))
    extract_extra_tags(processed, tags)

    promoted_tags = {col: tags[col] for col in self.PROMOTED_TAGS if col in tags}
    processed["release"] = promoted_tags.get(
        "sentry:release",
        event.get("release"),
    )
    processed["environment"] = promoted_tags.get("environment")

    contexts = _as_dict_safe(data.get('contexts', None))
    extract_extra_contexts(processed, contexts)

    processed["dist"] = _unicodify(
        promoted_tags.get("sentry:dist", data.get("dist")),
    )

    user_data = {}
    extract_user(user_data, data.get("user", {}))
    processed["user"] = promoted_tags.get("sentry:user", "")
    processed["user_name"] = user_data["username"]
    processed["user_id"] = user_data["user_id"]
    processed["user_email"] = user_data["email"]

    ip_address = _ensure_valid_ip(user_data["ip_address"])
    if ip_address:
        if ip_address.version == 4:
            processed["ip_address_v4"] = str(ip_address)
        elif ip_address.version == 6:
            processed["ip_address_v6"] = str(ip_address)

    if metadata is not None:
        processed['partition'] = metadata.partition
        processed['offset'] = metadata.offset

    return ProcessedMessage(
        action=action_type,
        data=[processed],
    )