def test_process_messages(mock_indexer, mock_task) -> None:
    """process_messages should translate every inner payload through the
    indexer while preserving each message's partition, offset and timestamp."""
    message_payloads = [counter_payload, distribution_payload, set_payload]
    message_batch = [
        Message(
            Partition(Topic("topic"), 0),
            i + 1,
            KafkaPayload(None, json.dumps(payload).encode("utf-8"), []),
            datetime.now(),
        )
        for i, payload in enumerate(message_payloads)
    ]
    # the outer message uses the last message's partition, offset, and timestamp
    last = message_batch[-1]
    outer_message = Message(last.partition, last.offset, message_batch, last.timestamp)

    new_batch = process_messages(outer_message=outer_message)

    # Expected output: same batch shape, but each value replaced by the
    # indexer-translated payload (tags/metric name resolved to integer ids).
    expected_new_batch = [
        Message(
            m.partition,
            m.offset,
            KafkaPayload(
                None,
                json.dumps(__translated_payload(message_payloads[i])).encode("utf-8"),
                [],
            ),
            m.timestamp,
        )
        for i, m in enumerate(message_batch)
    ]

    assert new_batch == expected_new_batch
def test_table_name_filter() -> None:
    """CdcTableNameMessageFilter drops any message whose "table" header is
    absent or names a table other than the configured one."""
    table_name = "table_name"
    message_filter = CdcTableNameMessageFilter(table_name)

    def build_message(headers):
        return Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, b"", headers),
            datetime.now(),
        )

    # Messages that match the table should not be dropped.
    assert not message_filter.should_drop(
        build_message([("table", table_name.encode("utf8"))])
    )

    # Messages without a table should be dropped.
    assert message_filter.should_drop(build_message([]))

    # Messages from a different table should be dropped.
    assert message_filter.should_drop(build_message([("table", b"other_table")]))
def test_multistorage_strategy(
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
) -> None:
    """End-to-end check of the multistorage consumer: one filtered message and
    one insert per CDC storage should yield one row per storage plus a single
    commit covering all three offsets."""
    from snuba.datasets.storages import groupassignees, groupedmessages

    from tests.datasets.cdc.test_groupassignee import TestGroupassignee
    from tests.datasets.cdc.test_groupedmessage import TestGroupedMessage

    commit = Mock()

    storages = [groupassignees.storage, groupedmessages.storage]

    strategy = MultistorageConsumerProcessingStrategyFactory(
        storages,
        10,
        10,
        processes,
        input_block_size,
        output_block_size,
        TestingMetricsBackend(),
    ).create(commit)

    payloads = [
        # Unknown table header: should be filtered out, producing no rows.
        KafkaPayload(None, b"{}", [("table", b"ignored")]),
        KafkaPayload(
            None,
            json.dumps(TestGroupassignee.INSERT_MSG).encode("utf8"),
            [("table", groupassignees.storage.get_postgres_table().encode("utf8"))],
        ),
        KafkaPayload(
            None,
            json.dumps(TestGroupedMessage.INSERT_MSG).encode("utf8"),
            [("table", groupedmessages.storage.get_postgres_table().encode("utf8"))],
        ),
    ]

    messages = [
        Message(
            Partition(Topic("topic"), 0), offset, payload, datetime.now(), offset + 1
        )
        for offset, payload in enumerate(payloads)
    ]

    # Submitting all messages should add exactly one row to each storage.
    with assert_changes(
        lambda: get_row_count(groupassignees.storage), 0, 1
    ), assert_changes(lambda: get_row_count(groupedmessages.storage), 0, 1):
        for message in messages:
            strategy.submit(message)

        # Closing flushes the batch and commits the next offset to read (3).
        with assert_changes(
            lambda: commit.call_args_list, [], [call({Partition(Topic("topic"), 0): 3})]
        ):
            strategy.close()
            strategy.join()
def _batch_message_set_up(next_step: Mock, max_batch_time: float = 100.0, max_batch_size: int = 2):
    """Build a BatchMessages step plus two sample messages for tests.

    Note: max_batch_time is expressed in seconds.
    """
    step = BatchMessages(
        next_step=next_step,
        max_batch_time=max_batch_time,
        max_batch_size=max_batch_size,
    )

    partition = Partition(Topic("topic"), 0)
    first = Message(partition, 1, KafkaPayload(None, b"some value", []), datetime.now())
    second = Message(partition, 2, KafkaPayload(None, b"another value", []), datetime.now())

    return (step, first, second)
    def test_delete_tag_promoted_insert(self) -> None:
        """END_DELETE_TAG on a promoted tag ("browser.name") should remove the
        tag from tag-filtered queries while keeping the event row itself."""
        self.event["project_id"] = self.project_id
        self.event["group_id"] = 1
        self.event["data"]["tags"].append(["browser.name", "foo"])
        self.event["data"]["tags"].append(["notbrowser", "foo"])
        write_unprocessed_events(self.storage, [self.event])

        project_id = self.project_id

        def _issue_count(total: bool = False) -> Sequence[Mapping[str, Any]]:
            # Per-group row counts; when total=False the query additionally
            # requires the browser.name=foo tag to be present in the hash map.
            clickhouse = self.storage.get_cluster().get_query_connection(
                ClickhouseClientSettings.QUERY
            )

            total_cond = (
                "AND has(_tags_hash_map, cityHash64('browser.name=foo'))"
                if not total
                else ""
            )

            data = clickhouse.execute(
                f"""
                SELECT group_id, count()
                FROM errors_local
                FINAL
                WHERE deleted = 0
                AND project_id = {project_id}
                {total_cond}
                GROUP BY group_id
                """
            ).results

            return [{"group_id": row[0], "count": row[1]} for row in data]

        assert _issue_count() == [{"count": 1, "group_id": 1}]
        assert _issue_count(total=True) == [{"count": 1, "group_id": 1}]

        timestamp = datetime.now(tz=pytz.utc)

        message: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            42,
            KafkaPayload(
                None,
                json.dumps(
                    (
                        2,
                        ReplacementType.END_DELETE_TAG,
                        {
                            "project_id": project_id,
                            "tag": "browser.name",
                            "datetime": timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                        },
                    )
                ).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

        processed = self.replacer.process_message(message)
        assert processed is not None
        self.replacer.flush_batch([processed])

        # Tag-filtered count is now empty, but the row still exists overall.
        assert _issue_count() == []
        assert _issue_count(total=True) == [{"count": 1, "group_id": 1}]
def _wrap(self, msg: Tuple[Any, ...]) -> Message[KafkaPayload]: return Message( Partition(Topic("replacements"), 0), 0, KafkaPayload(None, json.dumps(msg).encode("utf-8"), []), datetime.now(), )
def test_offset_already_processed(self) -> None: """ Don't process an offset that already exists in Redis. """ set_config("skip_seen_offsets", True) self.event["project_id"] = self.project_id self.event["group_id"] = 1 self.event["primary_hash"] = "a" * 32 write_unprocessed_events(self.storage, [self.event]) key = f"replacement:{CONSUMER_GROUP}:errors:1" redis_client.set(key, 42) old_offset: Message[KafkaPayload] = Message( Partition(Topic("replacements"), 1), 41, KafkaPayload( None, json.dumps(( 2, ReplacementType.END_UNMERGE, {}, )).encode("utf-8"), [], ), datetime.now(), ) same_offset: Message[KafkaPayload] = Message( Partition(Topic("replacements"), 1), 42, KafkaPayload( None, json.dumps(( 2, ReplacementType.END_UNMERGE, {}, )).encode("utf-8"), [], ), datetime.now(), ) assert self.replacer.process_message(old_offset) is None assert self.replacer.process_message(same_offset) is None
def test_metrics_batch_builder(): max_batch_time = 3.0 # seconds max_batch_size = 2 # 1. Ready when max_batch_size is reached batch_builder_size = MetricsBatchBuilder(max_batch_size=max_batch_size, max_batch_time=max_batch_time) assert not batch_builder_size.ready() message1 = Message(Partition(Topic("topic"), 0), 1, KafkaPayload(None, b"some value", []), datetime.now()) batch_builder_size.append(message1) assert not batch_builder_size.ready() message2 = Message(Partition(Topic("topic"), 0), 2, KafkaPayload(None, b"another value", []), datetime.now()) batch_builder_size.append(message2) assert batch_builder_size.ready() # 2. Ready when max_batch_time is reached batch_builder_time = MetricsBatchBuilder(max_batch_size=max_batch_size, max_batch_time=max_batch_time) assert not batch_builder_time.ready() message1 = Message(Partition(Topic("topic"), 0), 1, KafkaPayload(None, b"some value", []), datetime.now()) batch_builder_time.append(message1) assert not batch_builder_time.ready() time.sleep(3) assert batch_builder_time.ready() # 3. Adding the same message twice to the same batch batch_builder_time = MetricsBatchBuilder(max_batch_size=max_batch_size, max_batch_time=max_batch_time) message1 = Message(Partition(Topic("topic"), 0), 1, KafkaPayload(None, b"some value", []), datetime.now()) batch_builder_time.append(message1) with pytest.raises(DuplicateMessage): batch_builder_time.append(message1)
def eventstream(*, dataset: Dataset) -> RespTuple:
    """Synchronously apply an eventstream record to the dataset's storage.

    Parses a version-2 eventstream record from the HTTP request body. Inserts
    are run through the full consumer strategy (pre-filter, processor, batch
    writer); every other record type is handled by the replacer.

    Raises RuntimeError for any protocol version other than 2.
    """
    record = json.loads(http_request.data)

    version = record[0]
    if version != 2:
        raise RuntimeError("Unsupported protocol version: %s" % record)

    message: Message[KafkaPayload] = Message(
        Partition(Topic("topic"), 0),
        0,
        KafkaPayload(None, http_request.data, []),
        datetime.now(),
    )

    type_ = record[1]

    storage = dataset.get_default_entity().get_writable_storage()
    assert storage is not None

    if type_ == "insert":
        from arroyo.processing.strategies.streaming import (
            KafkaConsumerStrategyFactory,
        )

        from snuba.consumers.consumer import build_batch_writer, process_message

        table_writer = storage.get_table_writer()
        stream_loader = table_writer.get_stream_loader()
        strategy = KafkaConsumerStrategyFactory(
            stream_loader.get_pre_filter(),
            functools.partial(
                # Fixed typo ("consumer_grouup"): keep the group name
                # consistent with the replacer branch below.
                process_message, stream_loader.get_processor(), "consumer_group"
            ),
            build_batch_writer(table_writer, metrics=metrics),
            max_batch_size=1,
            max_batch_time=1.0,
            processes=None,
            input_block_size=None,
            output_block_size=None,
        ).create(lambda offsets: None)
        strategy.submit(message)
        strategy.close()
        strategy.join()
    else:
        from snuba.replacer import ReplacerWorker

        worker = ReplacerWorker(storage, "consumer_group", metrics=metrics)
        processed = worker.process_message(message)
        if processed is not None:
            batch = [processed]
            worker.flush_batch(batch)

    return ("ok", 200, {"Content-Type": "text/plain"})
def __format_payload( self, message: Message[Tuple[StorageKey, Union[None, BytesInsertBatch, ReplacementBatch]]], ) -> List[KafkaPayload]: kafka_payloads: List[KafkaPayload] = [] storage_key, payload = message.payload if isinstance(payload, BytesInsertBatch): for row in payload.rows: kafka_payloads.append( KafkaPayload(storage_key.value.encode("utf-8"), row, [])) return kafka_payloads
def test_commit_log_consumer() -> None:
    # XXX: This would be better as an integration test (or at least a test
    # against an abstract Producer interface) instead of against a test against
    # a mock.
    commit_log_producer = FakeConfluentKafkaProducer()

    configuration = get_default_kafka_configuration()

    consumer: KafkaConsumer = KafkaConsumerWithCommitLog(
        {
            **configuration,
            "auto.offset.reset": "earliest",
            "enable.auto.commit": "false",
            "enable.auto.offset.store": "false",
            "enable.partition.eof": "true",
            "group.id": "test",
            "session.timeout.ms": 10000,
        },
        producer=commit_log_producer,
        commit_log_topic=Topic("commit-log"),
    )

    producer = KafkaProducer(configuration)

    topic = Topic("topic")

    with closing(consumer) as consumer:
        with closing(producer) as producer:
            producer.produce(topic, next(get_payloads())).result(5.0)

            consumer.subscribe([topic])

            message = consumer.poll(10.0)  # XXX: getting the subscription is slow
            assert isinstance(message, Message)

            now = datetime.now()

            position = Position(message.next_offset, now)

            # Staging then committing should publish exactly one entry to the
            # commit log topic, decodable back into an equivalent Commit.
            consumer.stage_positions({message.partition: position})

            assert consumer.commit_positions() == {Partition(topic, 0): position}

            assert len(commit_log_producer.messages) == 1
            commit_message = commit_log_producer.messages[0]
            assert commit_message.topic() == "commit-log"

            assert commit_codec.decode(
                KafkaPayload(
                    commit_message.key(),
                    commit_message.value(),
                    commit_message.headers(),
                )
            ) == Commit("test", Partition(topic, 0), message.next_offset, now)
    def test_multiple_partitions(self) -> None:
        """
        Different partitions should have independent offset checks.
        """
        set_config("skip_seen_offsets", True)
        self.event["project_id"] = self.project_id
        self.event["group_id"] = 1
        self.event["primary_hash"] = "a" * 32
        write_unprocessed_events(self.storage, [self.event])

        payload = KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    ReplacementType.END_UNMERGE,
                    {
                        "project_id": self.project_id,
                        "previous_group_id": 1,
                        "new_group_id": 2,
                        "hashes": ["a" * 32],
                        "datetime": datetime.utcnow().strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )
            ).encode("utf-8"),
            [],
        )

        # Same offset and payload, but on two different partitions.
        offset = 42
        timestamp = datetime.now()

        partition_one: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            offset,
            payload,
            timestamp,
        )
        partition_two: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 2),
            offset,
            payload,
            timestamp,
        )

        processed = self.replacer.process_message(partition_one)
        self.replacer.flush_batch([processed])

        # different partition should be unaffected even if it's the same offset
        assert self.replacer.process_message(partition_two) is not None
def test_kafka_filter_header_with_bypass() -> None:
    # With a consecutive-drop limit of 5, matching messages are dropped for a
    # while, but the filter eventually lets one through (should_drop False) so
    # the consumer can make progress — presumably so offsets still get
    # committed; exact reset semantics live in KafkaHeaderFilterWithBypass.
    header_filter = KafkaHeaderFilterWithBypass("should_drop", "1", 5)
    message = Message(
        Partition(Topic("random"), 1),
        1,
        KafkaPayload(b"key", b"value", [("should_drop", b"1")]),
        datetime.now(),
    )

    for _ in range(3):
        assert header_filter.should_drop(message) is True

    assert header_filter.should_drop(message) is True
    assert header_filter.should_drop(message) is True
    assert header_filter.should_drop(message) is True
    # The bypass finally kicks in and the message is allowed through.
    assert header_filter.should_drop(message) is False
def test_skip_kafka_message(self) -> None: state.set_config("kafka_messages_to_skip", "[snuba-test-lol:1:2,snuba-test-yeet:0:1]") assert skip_kafka_message( Message( Partition(Topic("snuba-test-lol"), 1), 2, KafkaPayload(None, b"", []), datetime.now(), )) assert skip_kafka_message( Message( Partition(Topic("snuba-test-yeet"), 0), 1, KafkaPayload(None, b"", []), datetime.now(), )) assert not skip_kafka_message( Message( Partition(Topic("snuba-test-lol"), 2), 1, KafkaPayload(None, b"", []), datetime.now(), ))
    def test_unmerge_hierarchical_insert(self) -> None:
        """END_UNMERGE_HIERARCHICAL should move the event matching the given
        hierarchical hash from group 1 to group 2."""
        self.event["project_id"] = self.project_id
        self.event["group_id"] = 1
        self.event["primary_hash"] = "b" * 32
        self.event["data"]["hierarchical_hashes"] = ["a" * 32]
        write_unprocessed_events(self.storage, [self.event])

        assert self._issue_count(self.project_id) == [{"count": 1, "group_id": 1}]

        timestamp = datetime.now(tz=pytz.utc)

        project_id = self.project_id

        message: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            42,
            KafkaPayload(
                None,
                json.dumps(
                    (
                        2,
                        ReplacementType.END_UNMERGE_HIERARCHICAL,
                        {
                            "project_id": project_id,
                            "previous_group_id": 1,
                            "new_group_id": 2,
                            "hierarchical_hash": "a" * 32,
                            "primary_hash": "b" * 32,
                            "datetime": timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                        },
                    )
                ).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

        processed = self.replacer.process_message(message)
        assert processed is not None
        self.replacer.flush_batch([processed])

        # The event now belongs to the new group.
        assert self._issue_count(self.project_id) == [{"count": 1, "group_id": 2}]
def encode(self, value: ScheduledSubscriptionTask) -> KafkaPayload: entity, subscription, tick_upper_offset = value.task return KafkaPayload( str(subscription.identifier).encode("utf-8"), cast( str, rapidjson.dumps( { "timestamp": value.timestamp.isoformat(), "entity": entity.value, "task": {"data": subscription.data.to_dict()}, "tick_upper_offset": tick_upper_offset, } ), ).encode("utf-8"), [], )
def encode(self, value: SubscriptionTaskResult) -> KafkaPayload: subscription_id = str(value.task.task.identifier) request, result = value.result return KafkaPayload( subscription_id.encode("utf-8"), json.dumps({ "version": 2, "payload": { "subscription_id": subscription_id, "request": { **request.body }, "result": result, "timestamp": value.task.timestamp.isoformat(), }, }).encode("utf-8"), [], )
def test_unmerge_insert(self) -> None: self.event["project_id"] = self.project_id self.event["group_id"] = 1 self.event["primary_hash"] = "a" * 32 write_unprocessed_events(self.storage, [self.event]) assert self._issue_count(self.project_id) == [{ "count": 1, "group_id": 1 }] timestamp = datetime.now(tz=pytz.utc) project_id = self.project_id message: Message[KafkaPayload] = Message( Partition(Topic("replacements"), 1), 42, KafkaPayload( None, json.dumps(( 2, "end_unmerge", { "project_id": project_id, "previous_group_id": 1, "new_group_id": 2, "hashes": ["a" * 32], "datetime": timestamp.strftime(PAYLOAD_DATETIME_FORMAT), }, )).encode("utf-8"), [], ), datetime.now(), ) processed = self.replacer.process_message(message) self.replacer.flush_batch([processed]) assert self._issue_count(self.project_id) == [{ "count": 1, "group_id": 2 }]
def test_delete_groups_insert(self) -> None: self.event["project_id"] = self.project_id self.event["group_id"] = 1 write_unprocessed_events(self.storage, [self.event]) assert self._issue_count(self.project_id) == [{ "count": 1, "group_id": 1 }] timestamp = datetime.utcnow() project_id = self.project_id message: Message[KafkaPayload] = Message( Partition(Topic("replacements"), 1), 42, KafkaPayload( None, json.dumps(( 2, ReplacementType.END_DELETE_GROUPS, { "project_id": project_id, "group_ids": [1], "datetime": timestamp.strftime(PAYLOAD_DATETIME_FORMAT), }, )).encode("utf-8"), [], ), datetime.now(), ) processed = self.replacer.process_message(message) self.replacer.flush_batch([processed]) assert self._issue_count(self.project_id) == [] # Count is still zero after Redis flushed and parts merged self._clear_redis_and_force_merge() assert self._issue_count(self.project_id) == []
    def test_reset_consumer_group_offset_check(self) -> None:
        """Listing a consumer group in RESET_CHECK_CONFIG should clear its
        seen-offset state so a previously processed offset runs again."""
        set_config("skip_seen_offsets", True)
        self.event["project_id"] = self.project_id
        self.event["group_id"] = 1
        self.event["primary_hash"] = "a" * 32
        write_unprocessed_events(self.storage, [self.event])

        message: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            42,
            KafkaPayload(
                None,
                json.dumps(
                    (
                        2,
                        ReplacementType.END_UNMERGE,
                        {
                            "project_id": self.project_id,
                            "previous_group_id": 1,
                            "new_group_id": 2,
                            "hashes": ["a" * 32],
                            "datetime": datetime.utcnow().strftime(PAYLOAD_DATETIME_FORMAT),
                        },
                    )
                ).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

        # First pass records offset 42 as seen for this consumer group.
        self.replacer.flush_batch([self.replacer.process_message(message)])

        set_config(replacer.RESET_CHECK_CONFIG, f"[{CONSUMER_GROUP}]")

        # Offset to check against should be reset so this message shouldn't be skipped
        assert self.replacer.process_message(message) is not None
    def test_process_offset_twice(self) -> None:
        """Processing the same message twice should be a no-op the second time
        once its offset has been recorded in Redis."""
        set_config("skip_seen_offsets", True)
        self.event["project_id"] = self.project_id
        self.event["group_id"] = 1
        self.event["primary_hash"] = "a" * 32
        write_unprocessed_events(self.storage, [self.event])

        message: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            42,
            KafkaPayload(
                None,
                json.dumps(
                    (
                        2,
                        ReplacementType.END_UNMERGE,
                        {
                            "project_id": self.project_id,
                            "previous_group_id": 1,
                            "new_group_id": 2,
                            "hashes": ["a" * 32],
                            "datetime": datetime.utcnow().strftime(PAYLOAD_DATETIME_FORMAT),
                        },
                    )
                ).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

        processed = self.replacer.process_message(message)
        self.replacer.flush_batch([processed])

        # should be None since the offset should be in Redis, indicating it should be skipped
        assert self.replacer.process_message(message) is None
def test_invalid_commit_log_message(caplog: Any) -> None:
    """An undecodable commit log entry (key is None) should be logged as an
    error and skipped, not crash the tick consumer."""
    clock = TestingClock()
    broker: Broker[KafkaPayload] = Broker(MemoryMessageStorage(), clock)

    topic = Topic("messages")
    followed_consumer_group = "events"
    partition = Partition(topic, 0)

    broker.create_topic(topic, partitions=1)

    producer = broker.get_producer()

    inner_consumer = broker.get_consumer("group")

    consumer = CommitLogTickConsumer(inner_consumer, followed_consumer_group)

    def _assignment_callback(offsets: Mapping[Partition, int]) -> None:
        # On assignment, neither the inner consumer nor the wrapper should
        # have advanced past offset 0.
        assert inner_consumer.tell() == {partition: 0}
        assert consumer.tell() == {partition: 0}

    assignment_callback = mock.Mock(side_effect=_assignment_callback)

    consumer.subscribe([topic], on_assign=assignment_callback)

    # produce invalid payload to commit log topic (key should not be None)
    producer.produce(
        partition,
        KafkaPayload(None, b"some-value", []),
    ).result()

    clock.sleep(1)

    with caplog.at_level(logging.ERROR):
        # The bad entry yields no tick; it is reported via logging instead.
        assert consumer.poll() is None

    assert followed_consumer_group in caplog.text
def test_produce_step() -> None:
    """ProduceStep should produce one Kafka message per inner message and, on
    poll, commit the offset of each completed produce future."""
    topic = Topic("snuba-metrics")
    partition = Partition(topic, 0)

    clock = Clock()
    broker_storage: MemoryMessageStorage[KafkaPayload] = MemoryMessageStorage()
    broker: Broker[KafkaPayload] = Broker(broker_storage, clock)
    broker.create_topic(topic, partitions=1)
    producer = broker.get_producer()
    commit = Mock()

    produce_step = ProduceStep(commit_function=commit, producer=producer)

    message_payloads = [counter_payload, distribution_payload, set_payload]
    message_batch = [
        Message(
            Partition(Topic("topic"), 0),
            i + 1,
            KafkaPayload(
                None,
                json.dumps(__translated_payload(message_payloads[i])).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )
        for i, payload in enumerate(message_payloads)
    ]
    # the outer message uses the last message's partition, offset, and timestamp
    last = message_batch[-1]
    outer_message = Message(last.partition, last.offset, message_batch, last.timestamp)

    # 1. Submit the message (that would have been generated from process_messages)
    produce_step.submit(outer_message=outer_message)

    # 2. Check that submit created the same number of futures as
    #    messages in the outer_message (3 in this test). Also check
    #    that the produced message payloads are as expected.
    assert len(produce_step._ProduceStep__futures) == 3

    first_message = broker_storage.consume(partition, 0)
    assert first_message is not None
    second_message = broker_storage.consume(partition, 1)
    assert second_message is not None
    third_message = broker_storage.consume(partition, 2)
    assert third_message is not None
    # Exactly three messages should have been produced — no fourth.
    assert broker_storage.consume(partition, 3) is None

    produced_messages = [
        json.loads(msg.payload.value.decode("utf-8"), use_rapid_json=True)
        for msg in [first_message, second_message, third_message]
    ]
    expected_produced_messages = []
    for payload in message_payloads:
        translated = __translated_payload(payload)
        # JSON round-trips dict keys as strings, so stringify tag keys here.
        tags: Mapping[str, int] = {str(k): v for k, v in translated["tags"].items()}
        translated.update(**{"tags": tags})
        expected_produced_messages.append(translated)

    assert produced_messages == expected_produced_messages

    # 3. Call poll method, and check that doing so checked that
    #    futures were ready and successful and therefore messages
    #    were committed.
    produce_step.poll()
    expected_commit_calls = [
        call({message.partition: Position(message.offset, message.timestamp)})
        for message in message_batch
    ]
    assert commit.call_args_list == expected_commit_calls

    produce_step.close()
    produce_step.join()
import pytest from arroyo import Message, Partition, Topic from arroyo.backends.kafka import KafkaPayload from snuba.datasets.message_filters import ( KafkaHeaderFilter, KafkaHeaderFilterWithBypass, ) test_data = [ pytest.param( KafkaHeaderFilter("should_drop", "1"), Message( Partition(Topic("random"), 1), 1, KafkaPayload(b"key", b"value", [("should_drop", b"1")]), datetime.now(), ), True, id="matching-headers", ), pytest.param( KafkaHeaderFilter("should_drop", "0"), Message( Partition(Topic("random"), 1), 1, KafkaPayload(b"key", b"value", [("should_drop", b"1")]), datetime.now(), ), False, id="mismatched-headers",
def test_streaming_consumer_strategy() -> None:
    """Exercise the streaming consumer strategy: skipped messages, inserts and
    replacements are routed correctly, and processor errors propagate."""
    messages = (
        Message(
            Partition(Topic("events"), 0),
            i,
            KafkaPayload(None, b"{}", []),
            datetime.now(),
        )
        for i in itertools.count()
    )

    replacements_producer = FakeConfluentKafkaProducer()

    processor = Mock()
    # One of each outcome: skip (None), insert, replacement.
    processor.process_message.side_effect = [
        None,
        InsertBatch([{}], None),
        ReplacementBatch("key", [{}]),
    ]

    writer = Mock()

    metrics = TestingMetricsBackend()

    def write_step() -> ProcessedMessageBatchWriter:
        return ProcessedMessageBatchWriter(
            insert_batch_writer=InsertBatchWriter(
                writer, MetricsWrapper(metrics, "insertions")
            ),
            replacement_batch_writer=ReplacementBatchWriter(
                replacements_producer, Topic("replacements")
            ),
        )

    factory = KafkaConsumerStrategyFactory(
        None,
        functools.partial(process_message, processor),
        write_step,
        max_batch_size=10,
        max_batch_time=60,
        processes=None,
        input_block_size=None,
        output_block_size=None,
    )

    commit_function = Mock()
    strategy = factory.create(commit_function)

    for i in range(3):
        strategy.poll()
        strategy.submit(next(messages))

    assert metrics.calls == []

    # A processor returning an unexpected type ({}), should surface as a
    # TypeError from the strategy.
    processor.process_message.side_effect = [{}]

    with pytest.raises(TypeError):
        strategy.poll()
        strategy.submit(next(messages))

    def get_number_of_insertion_metrics() -> int:
        count = 0
        for c in metrics.calls:
            if isinstance(c, Timing) and c.name == "insertions.latency_ms":
                count += 1
        return count

    expected_write_count = 1

    # Closing flushes: one insert write, one insertion metric, and one message
    # on the replacements producer.
    with assert_changes(
        get_number_of_insertion_metrics, 0, expected_write_count
    ), assert_changes(
        lambda: writer.write.call_count, 0, expected_write_count
    ), assert_changes(
        lambda: len(replacements_producer.messages), 0, 1
    ):
        strategy.close()
        strategy.join()
process_message_multistorage, process_message_multistorage_identical_storages, ) from snuba.datasets.storages import StorageKey from tests.fixtures import get_raw_event, get_raw_transaction test_data = [ pytest.param( Message( Partition(Topic("errors"), 1), 1, MultistorageKafkaPayload( [StorageKey.ERRORS, StorageKey.ERRORS_V2], KafkaPayload( None, json.dumps((2, "insert", get_raw_event())).encode("utf-8"), [], ), ), datetime.now(), ), True, id="both errors storage", ), pytest.param( Message( Partition(Topic("errors"), 1), 1, MultistorageKafkaPayload( [StorageKey.TRANSACTIONS, StorageKey.TRANSACTIONS_V2], KafkaPayload(
def get_payloads() -> Iterator[KafkaPayload]:
    """Yield an endless stream of payloads whose values count up from zero."""
    i = 0
    while True:
        yield KafkaPayload(None, f"{i}".encode("utf8"), [])
        i += 1
    def test_reprocessing_flow_insert(self) -> None:
        """Walk through the reprocessing flow: tombstone one event, reinsert it
        under a new group, then exclude the old group from queries."""
        # We have a group that contains two events, 1 and 2.
        self.event["project_id"] = self.project_id
        self.event["group_id"] = 1
        self.event["event_id"] = event_id = "00e24a150d7f4ee4b142b61b4d893b6d"
        write_unprocessed_events(self.storage, [self.event])
        self.event["event_id"] = event_id2 = "00e24a150d7f4ee4b142b61b4d893b6e"
        write_unprocessed_events(self.storage, [self.event])

        assert self._issue_count(self.project_id) == [{"count": 2, "group_id": 1}]

        project_id = self.project_id

        message: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            41,
            KafkaPayload(
                None,
                json.dumps(
                    (
                        2,
                        ReplacementType.TOMBSTONE_EVENTS,
                        {"project_id": project_id, "event_ids": [event_id]},
                    )
                ).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

        # The user chooses to reprocess a subset of the group and throw away
        # the other events. Event 1 gets manually tombstoned by Sentry while
        # Event 2 prevails.
        processed = self.replacer.process_message(message)
        self.replacer.flush_batch([processed])

        # At this point the count doesn't make any sense but we don't care.
        assert self._issue_count(self.project_id) == [{"count": 2, "group_id": 1}]

        # The reprocessed event is inserted with a guaranteed-new group ID but
        # the *same* event ID (this is why we need to skip tombstoning this
        # event ID)
        self.event["group_id"] = 2
        write_unprocessed_events(self.storage, [self.event])

        message: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            42,
            KafkaPayload(
                None,
                json.dumps(
                    (
                        2,
                        ReplacementType.EXCLUDE_GROUPS,
                        {"project_id": project_id, "group_ids": [1]},
                    )
                ).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

        # Group 1 is excluded from queries. At this point we have almost a
        # regular group deletion, except only a subset of events have been
        # tombstoned (the ones that will *not* be reprocessed).
        processed = self.replacer.process_message(message)
        self.replacer.flush_batch([processed])

        # Group 2 should contain the one event that the user chose to
        # reprocess, and Group 1 should be gone. (Note: In the product Group 2
        # looks identical to Group 1, including short ID).
        assert self._issue_count(self.project_id) == [{"count": 1, "group_id": 2}]
        assert self._get_group_id(project_id, event_id2) == 2
        assert not self._get_group_id(project_id, event_id)
def process_messages(
    outer_message: Message[MessageBatch],
) -> MessageBatch:
    """
    We have an outer_message Message() whose payload is a batch of Message() objects.

        Message(
            partition=...,
            offset=...
            timestamp=...
            payload=[Message(...), Message(...), etc]
        )

    The inner messages payloads are KafkaPayload's that have:
        * key
        * headers
        * value

    The value of the message is what we need to parse and then translate
    using the indexer.
    """
    indexer = get_indexer()
    metrics = get_metrics()

    org_strings = defaultdict(set)
    strings = set()

    # Phase 1: parse every inner payload once (keyed by offset so the
    # reconstruction loop below can find it again) and collect every string
    # (metric name, tag keys, tag values) that needs an indexer id.
    with metrics.timer("process_messages.parse_outer_message"):
        parsed_payloads_by_offset = {
            msg.offset: json.loads(msg.payload.value.decode("utf-8"), use_rapid_json=True)
            for msg in outer_message.payload
        }

    for message in parsed_payloads_by_offset.values():
        metric_name = message["name"]
        org_id = message["org_id"]
        tags = message.get("tags", {})

        parsed_strings = {
            metric_name,
            *tags.keys(),
            *tags.values(),
        }
        org_strings[org_id].update(parsed_strings)
        strings.update(parsed_strings)

    metrics.incr("process_messages.total_strings_indexer_lookup", amount=len(strings))

    # Phase 2: resolve all collected strings to integer ids in one bulk call.
    with metrics.timer("metrics_consumer.bulk_record"):
        mapping = indexer.bulk_record(org_strings)

    new_messages: List[Message[KafkaPayload]] = []

    # Phase 3: rebuild each message with tags/metric name replaced by their
    # integer ids; messages whose strings failed to resolve are dropped
    # (logged) rather than failing the whole batch.
    with metrics.timer("process_messages.reconstruct_messages"):
        for message in outer_message.payload:
            parsed_payload_value = parsed_payloads_by_offset[message.offset]
            # Copy so the parsed cache is not mutated while rewriting fields.
            new_payload_value = deepcopy(parsed_payload_value)

            metric_name = parsed_payload_value["name"]
            tags = parsed_payload_value.get("tags", {})

            try:
                new_tags: Mapping[int, int] = {
                    mapping[k]: mapping[v] for k, v in tags.items()
                }
            except KeyError:
                logger.error("process_messages.key_error", extra={"tags": tags}, exc_info=True)
                continue

            new_payload_value["tags"] = new_tags
            new_payload_value["metric_id"] = mapping[metric_name]
            new_payload_value["retention_days"] = 90
            del new_payload_value["name"]

            new_payload = KafkaPayload(
                key=message.payload.key,
                value=json.dumps(new_payload_value).encode(),
                headers=message.payload.headers,
            )
            new_message = Message(
                partition=message.partition,
                offset=message.offset,
                payload=new_payload,
                timestamp=message.timestamp,
            )
            new_messages.append(new_message)

    metrics.incr("metrics_consumer.process_message.messages_seen", amount=len(new_messages))

    return new_messages
def invalid_message() -> Message[KafkaPayload]:
    """Build a throwaway message with an empty payload on an unnamed topic."""
    invalid_payload = KafkaPayload(None, b"", [])
    partition = Partition(Topic(""), 0)
    return Message(partition, 0, invalid_payload, datetime.now())