def test_commit_log_consumer() -> None:
    # XXX: This would be better as an integration test (or at least a test
    # against an abstract Producer interface) instead of a test against a mock.
    commit_log_producer = FakeConfluentKafkaProducer()

    configuration = get_default_kafka_configuration()
    consumer: KafkaConsumer = KafkaConsumerWithCommitLog(
        {
            **configuration,
            "auto.offset.reset": "earliest",
            "enable.auto.commit": "false",
            "enable.auto.offset.store": "false",
            "enable.partition.eof": "true",
            "group.id": "test",
            "session.timeout.ms": 10000,
        },
        producer=commit_log_producer,
        commit_log_topic=Topic("commit-log"),
    )

    producer = KafkaProducer(configuration)

    topic = Topic("topic")

    with closing(consumer) as consumer:
        with closing(producer) as producer:
            producer.produce(topic, next(get_payloads())).result(5.0)

            consumer.subscribe([topic])

            message = consumer.poll(10.0)  # XXX: getting the subscription is slow
            assert isinstance(message, Message)

            consumer.stage_offsets({message.partition: message.next_offset})

            assert consumer.commit_offsets() == {
                Partition(topic, 0): message.next_offset
            }

            assert len(commit_log_producer.messages) == 1
            commit_message = commit_log_producer.messages[0]
            assert commit_message.topic() == "commit-log"

            assert commit_codec.decode(
                KafkaPayload(
                    commit_message.key(),
                    commit_message.value(),
                    commit_message.headers(),
                )
            ) == Commit("test", Partition(topic, 0), message.next_offset)
def eventstream(*, dataset: Dataset) -> RespTuple:
    record = json.loads(http_request.data)

    version = record[0]
    if version != 2:
        raise RuntimeError("Unsupported protocol version: %s" % record)

    message: Message[KafkaPayload] = Message(
        Partition(Topic("topic"), 0),
        0,
        KafkaPayload(None, http_request.data, []),
        datetime.now(),
    )

    type_ = record[1]

    storage = dataset.get_default_entity().get_writable_storage()
    assert storage is not None

    if type_ == "insert":
        from arroyo.processing.strategies.streaming import (
            KafkaConsumerStrategyFactory,
        )

        from snuba.consumers.consumer import build_batch_writer, process_message

        table_writer = storage.get_table_writer()
        stream_loader = table_writer.get_stream_loader()
        strategy = KafkaConsumerStrategyFactory(
            stream_loader.get_pre_filter(),
            functools.partial(process_message, stream_loader.get_processor()),
            build_batch_writer(table_writer, metrics=metrics),
            max_batch_size=1,
            max_batch_time=1.0,
            processes=None,
            input_block_size=None,
            output_block_size=None,
        ).create(lambda offsets: None)
        strategy.submit(message)
        strategy.close()
        strategy.join()
    else:
        from snuba.replacer import ReplacerWorker

        worker = ReplacerWorker(storage, metrics=metrics)
        processed = worker.process_message(message)
        if processed is not None:
            batch = [processed]
            worker.flush_batch(batch)

    return ("ok", 200, {"Content-Type": "text/plain"})
def test_multiple_partitions(self) -> None:
    """
    Different partitions should have independent offset checks.
    """
    set_config("skip_seen_offsets", True)
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    self.event["primary_hash"] = "a" * 32
    write_unprocessed_events(self.storage, [self.event])

    payload = KafkaPayload(
        None,
        json.dumps(
            (
                2,
                ReplacementType.END_UNMERGE,
                {
                    "project_id": self.project_id,
                    "previous_group_id": 1,
                    "new_group_id": 2,
                    "hashes": ["a" * 32],
                    "datetime": datetime.utcnow().strftime(PAYLOAD_DATETIME_FORMAT),
                },
            )
        ).encode("utf-8"),
        [],
    )
    offset = 42
    timestamp = datetime.now()

    partition_one: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        offset,
        payload,
        timestamp,
    )
    partition_two: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 2),
        offset,
        payload,
        timestamp,
    )

    processed = self.replacer.process_message(partition_one)
    self.replacer.flush_batch([processed])

    # different partition should be unaffected even if it's the same offset
    assert self.replacer.process_message(partition_two) is not None
def test_kafka_filter_header_with_bypass() -> None:
    header_filter = KafkaHeaderFilterWithBypass("should_drop", "1", 5)
    message = Message(
        Partition(Topic("random"), 1),
        1,
        KafkaPayload(b"key", b"value", [("should_drop", b"1")]),
        datetime.now(),
    )

    for _ in range(3):
        assert header_filter.should_drop(message) is True
        assert header_filter.should_drop(message) is True
        assert header_filter.should_drop(message) is True
        assert header_filter.should_drop(message) is True
        assert header_filter.should_drop(message) is False
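# NOTE: The sketch below is illustrative only. It re-implements the behavior
# implied by the assertions above (drop messages carrying a matching header,
# but let every Nth consecutive match through so the consumer keeps advancing
# its offsets). It is not the actual KafkaHeaderFilterWithBypass from
# snuba.datasets.message_filters; the class and attribute names are made up.
from arroyo import Message
from arroyo.backends.kafka import KafkaPayload


class HeaderFilterWithBypassSketch:
    """Drops messages whose headers contain (key, value), but passes every
    Nth consecutive match so progress can still be committed."""

    def __init__(self, header_key: str, header_value: str, passthrough_every: int) -> None:
        self.__key = header_key
        self.__value = header_value.encode("utf-8")
        self.__limit = passthrough_every
        self.__consecutive_matches = 0

    def should_drop(self, message: Message[KafkaPayload]) -> bool:
        for key, value in message.payload.headers:
            if key == self.__key and value == self.__value:
                self.__consecutive_matches += 1
                if self.__consecutive_matches == self.__limit:
                    # Bypass: let this one through and reset the counter.
                    self.__consecutive_matches = 0
                    return False
                return True
        # Header absent or value differs: keep the message.
        self.__consecutive_matches = 0
        return False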
def test_skip_kafka_message(self) -> None:
    state.set_config(
        "kafka_messages_to_skip", "[snuba-test-lol:1:2,snuba-test-yeet:0:1]"
    )
    assert skip_kafka_message(
        Message(
            Partition(Topic("snuba-test-lol"), 1),
            2,
            KafkaPayload(None, b"", []),
            datetime.now(),
        )
    )
    assert skip_kafka_message(
        Message(
            Partition(Topic("snuba-test-yeet"), 0),
            1,
            KafkaPayload(None, b"", []),
            datetime.now(),
        )
    )
    assert not skip_kafka_message(
        Message(
            Partition(Topic("snuba-test-lol"), 2),
            1,
            KafkaPayload(None, b"", []),
            datetime.now(),
        )
    )
def test_unmerge_hierarchical_insert(self) -> None:
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    self.event["primary_hash"] = "b" * 32
    self.event["data"]["hierarchical_hashes"] = ["a" * 32]
    write_unprocessed_events(self.storage, [self.event])

    assert self._issue_count(self.project_id) == [{"count": 1, "group_id": 1}]

    timestamp = datetime.now(tz=pytz.utc)

    project_id = self.project_id

    message: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    ReplacementType.END_UNMERGE_HIERARCHICAL,
                    {
                        "project_id": project_id,
                        "previous_group_id": 1,
                        "new_group_id": 2,
                        "hierarchical_hash": "a" * 32,
                        "primary_hash": "b" * 32,
                        "datetime": timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    processed = self.replacer.process_message(message)
    assert processed is not None
    self.replacer.flush_batch([processed])

    assert self._issue_count(self.project_id) == [{"count": 1, "group_id": 2}]
def encode(self, value: ScheduledSubscriptionTask) -> KafkaPayload:
    entity, subscription, tick_upper_offset = value.task
    return KafkaPayload(
        str(subscription.identifier).encode("utf-8"),
        cast(
            str,
            rapidjson.dumps(
                {
                    "timestamp": value.timestamp.isoformat(),
                    "entity": entity.value,
                    "task": {"data": subscription.data.to_dict()},
                    "tick_upper_offset": tick_upper_offset,
                }
            ),
        ).encode("utf-8"),
        [],
    )
def encode(self, value: SubscriptionTaskResult) -> KafkaPayload:
    subscription_id = str(value.task.task.identifier)
    request, result = value.result
    return KafkaPayload(
        subscription_id.encode("utf-8"),
        json.dumps(
            {
                "version": 2,
                "payload": {
                    "subscription_id": subscription_id,
                    "request": {**request.body},
                    "result": result,
                    "timestamp": value.task.timestamp.isoformat(),
                },
            }
        ).encode("utf-8"),
        [],
    )
def test_unmerge_insert(self) -> None:
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    self.event["primary_hash"] = "a" * 32
    write_unprocessed_events(self.storage, [self.event])

    assert self._issue_count(self.project_id) == [{"count": 1, "group_id": 1}]

    timestamp = datetime.now(tz=pytz.utc)

    project_id = self.project_id

    message: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    "end_unmerge",
                    {
                        "project_id": project_id,
                        "previous_group_id": 1,
                        "new_group_id": 2,
                        "hashes": ["a" * 32],
                        "datetime": timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    processed = self.replacer.process_message(message)
    self.replacer.flush_batch([processed])

    assert self._issue_count(self.project_id) == [{"count": 1, "group_id": 2}]
def test_delete_groups_insert(self) -> None:
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    write_unprocessed_events(self.storage, [self.event])

    assert self._issue_count(self.project_id) == [{"count": 1, "group_id": 1}]

    timestamp = datetime.utcnow()

    project_id = self.project_id

    message: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    ReplacementType.END_DELETE_GROUPS,
                    {
                        "project_id": project_id,
                        "group_ids": [1],
                        "datetime": timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    processed = self.replacer.process_message(message)
    self.replacer.flush_batch([processed])

    assert self._issue_count(self.project_id) == []

    # Count is still zero after Redis flushed and parts merged
    self._clear_redis_and_force_merge()
    assert self._issue_count(self.project_id) == []
def test_reset_consumer_group_offset_check(self) -> None:
    set_config("skip_seen_offsets", True)
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    self.event["primary_hash"] = "a" * 32
    write_unprocessed_events(self.storage, [self.event])

    message: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    ReplacementType.END_UNMERGE,
                    {
                        "project_id": self.project_id,
                        "previous_group_id": 1,
                        "new_group_id": 2,
                        "hashes": ["a" * 32],
                        "datetime": datetime.utcnow().strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    self.replacer.flush_batch([self.replacer.process_message(message)])

    set_config(replacer.RESET_CHECK_CONFIG, f"[{CONSUMER_GROUP}]")

    # Offset to check against should be reset so this message shouldn't be skipped
    assert self.replacer.process_message(message) is not None
def test_process_offset_twice(self) -> None:
    set_config("skip_seen_offsets", True)
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    self.event["primary_hash"] = "a" * 32
    write_unprocessed_events(self.storage, [self.event])

    message: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    ReplacementType.END_UNMERGE,
                    {
                        "project_id": self.project_id,
                        "previous_group_id": 1,
                        "new_group_id": 2,
                        "hashes": ["a" * 32],
                        "datetime": datetime.utcnow().strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    processed = self.replacer.process_message(message)
    self.replacer.flush_batch([processed])

    # should be None since the offset should be in Redis, indicating it should be skipped
    assert self.replacer.process_message(message) is None
def test_invalid_commit_log_message(caplog: Any) -> None:
    clock = TestingClock()
    broker: Broker[KafkaPayload] = Broker(MemoryMessageStorage(), clock)

    topic = Topic("messages")
    followed_consumer_group = "events"
    partition = Partition(topic, 0)

    broker.create_topic(topic, partitions=1)

    producer = broker.get_producer()

    inner_consumer = broker.get_consumer("group")

    consumer = CommitLogTickConsumer(inner_consumer, followed_consumer_group)

    def _assignment_callback(offsets: Mapping[Partition, int]) -> None:
        assert inner_consumer.tell() == {partition: 0}
        assert consumer.tell() == {partition: 0}

    assignment_callback = mock.Mock(side_effect=_assignment_callback)

    consumer.subscribe([topic], on_assign=assignment_callback)

    # produce invalid payload to commit log topic (key should not be None)
    producer.produce(
        partition,
        KafkaPayload(None, b"some-value", []),
    ).result()

    clock.sleep(1)

    with caplog.at_level(logging.ERROR):
        assert consumer.poll() is None

    assert followed_consumer_group in caplog.text
def process_messages(
    outer_message: Message[MessageBatch],
) -> MessageBatch:
    """
    We have an outer_message Message() whose payload is a batch of Message() objects.

        Message(
            partition=...,
            offset=...,
            timestamp=...,
            payload=[Message(...), Message(...), etc]
        )

    The inner messages' payloads are KafkaPayloads that have:
        * key
        * headers
        * value

    The value of the message is what we need to parse and then translate
    using the indexer.
    """
    indexer = get_indexer()
    metrics = get_metrics()

    org_strings = defaultdict(set)
    strings = set()
    with metrics.timer("process_messages.parse_outer_message"):
        parsed_payloads_by_offset = {
            msg.offset: json.loads(msg.payload.value.decode("utf-8"), use_rapid_json=True)
            for msg in outer_message.payload
        }
        for message in parsed_payloads_by_offset.values():
            metric_name = message["name"]
            org_id = message["org_id"]
            tags = message.get("tags", {})

            parsed_strings = {
                metric_name,
                *tags.keys(),
                *tags.values(),
            }
            org_strings[org_id].update(parsed_strings)
            strings.update(parsed_strings)

    metrics.incr("process_messages.total_strings_indexer_lookup", amount=len(strings))

    with metrics.timer("metrics_consumer.bulk_record"):
        mapping = indexer.bulk_record(org_strings)

    new_messages: List[Message[KafkaPayload]] = []

    with metrics.timer("process_messages.reconstruct_messages"):
        for message in outer_message.payload:
            parsed_payload_value = parsed_payloads_by_offset[message.offset]
            new_payload_value = deepcopy(parsed_payload_value)

            metric_name = parsed_payload_value["name"]
            tags = parsed_payload_value.get("tags", {})

            try:
                new_tags: Mapping[int, int] = {
                    mapping[k]: mapping[v] for k, v in tags.items()
                }
            except KeyError:
                logger.error("process_messages.key_error", extra={"tags": tags}, exc_info=True)
                continue

            new_payload_value["tags"] = new_tags
            new_payload_value["metric_id"] = mapping[metric_name]
            new_payload_value["retention_days"] = 90
            del new_payload_value["name"]

            new_payload = KafkaPayload(
                key=message.payload.key,
                value=json.dumps(new_payload_value).encode(),
                headers=message.payload.headers,
            )
            new_message = Message(
                partition=message.partition,
                offset=message.offset,
                payload=new_payload,
                timestamp=message.timestamp,
            )
            new_messages.append(new_message)

    metrics.incr("metrics_consumer.process_message.messages_seen", amount=len(new_messages))

    return new_messages
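# For illustration: a minimal, hedged sketch of the nested structure the
# docstring above describes, built with the arroyo primitives used throughout
# this file. The topic name, offsets, and payload contents below are invented
# for the example; only the shape (an outer Message whose payload is a list of
# inner Messages carrying KafkaPayloads) mirrors what process_messages expects.
# Following the tests elsewhere in this section, the outer message reuses the
# last inner message's partition, offset, and timestamp.
from datetime import datetime

from arroyo import Message, Partition, Topic
from arroyo.backends.kafka import KafkaPayload

example_partition = Partition(Topic("ingest-metrics"), 0)  # hypothetical topic
inner_messages = [
    Message(
        example_partition,
        offset,
        KafkaPayload(None, value, []),
        datetime.now(),
    )
    for offset, value in enumerate(
        [
            b'{"org_id": 1, "name": "session", "tags": {"env": "prod"}}',
            b'{"org_id": 1, "name": "session", "tags": {"env": "dev"}}',
        ]
    )
]

last = inner_messages[-1]
outer_message = Message(last.partition, last.offset, inner_messages, last.timestamp)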
def test_produce_step() -> None:
    topic = Topic("snuba-metrics")
    partition = Partition(topic, 0)

    clock = Clock()
    broker_storage: MemoryMessageStorage[KafkaPayload] = MemoryMessageStorage()
    broker: Broker[KafkaPayload] = Broker(broker_storage, clock)
    broker.create_topic(topic, partitions=1)
    producer = broker.get_producer()

    commit = Mock()

    produce_step = ProduceStep(commit_function=commit, producer=producer)

    message_payloads = [counter_payload, distribution_payload, set_payload]
    message_batch = [
        Message(
            Partition(Topic("topic"), 0),
            i + 1,
            KafkaPayload(
                None,
                json.dumps(__translated_payload(message_payloads[i])).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )
        for i, payload in enumerate(message_payloads)
    ]
    # The outer message uses the last message's partition, offset, and timestamp.
    last = message_batch[-1]
    outer_message = Message(last.partition, last.offset, message_batch, last.timestamp)

    # 1. Submit the message (that would have been generated from process_messages).
    produce_step.submit(outer_message=outer_message)

    # 2. Check that submit created the same number of futures as there are
    #    messages in the outer_message (3 in this test). Also check that the
    #    produced message payloads are as expected.
    assert len(produce_step._ProduceStep__futures) == 3

    first_message = broker_storage.consume(partition, 0)
    assert first_message is not None
    second_message = broker_storage.consume(partition, 1)
    assert second_message is not None
    third_message = broker_storage.consume(partition, 2)
    assert third_message is not None

    assert broker_storage.consume(partition, 3) is None

    produced_messages = [
        json.loads(msg.payload.value.decode("utf-8"), use_rapid_json=True)
        for msg in [first_message, second_message, third_message]
    ]
    expected_produced_messages = []
    for payload in message_payloads:
        translated = __translated_payload(payload)
        tags: Mapping[str, int] = {str(k): v for k, v in translated["tags"].items()}
        translated.update(**{"tags": tags})
        expected_produced_messages.append(translated)

    assert produced_messages == expected_produced_messages

    # 3. Call the poll method, and check that doing so verified the futures
    #    were ready and successful, and that the messages were therefore
    #    committed.
    produce_step.poll()
    expected_commit_calls = [
        call({message.partition: Position(message.offset, message.timestamp)})
        for message in message_batch
    ]
    assert commit.call_args_list == expected_commit_calls

    produce_step.close()
    produce_step.join()
from datetime import datetime

import pytest
from arroyo import Message, Partition, Topic
from arroyo.backends.kafka import KafkaPayload

from snuba.datasets.message_filters import (
    KafkaHeaderFilter,
    KafkaHeaderFilterWithBypass,
)

test_data = [
    pytest.param(
        KafkaHeaderFilter("should_drop", "1"),
        Message(
            Partition(Topic("random"), 1),
            1,
            KafkaPayload(b"key", b"value", [("should_drop", b"1")]),
            datetime.now(),
        ),
        True,
        id="matching-headers",
    ),
    pytest.param(
        KafkaHeaderFilter("should_drop", "0"),
        Message(
            Partition(Topic("random"), 1),
            1,
            KafkaPayload(b"key", b"value", [("should_drop", b"1")]),
            datetime.now(),
        ),
        False,
        id="mismatched-headers",
def test_reprocessing_flow_insert(self) -> None:
    # We have a group that contains two events, 1 and 2.
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    self.event["event_id"] = event_id = "00e24a150d7f4ee4b142b61b4d893b6d"
    write_unprocessed_events(self.storage, [self.event])
    self.event["event_id"] = event_id2 = "00e24a150d7f4ee4b142b61b4d893b6e"
    write_unprocessed_events(self.storage, [self.event])

    assert self._issue_count(self.project_id) == [{"count": 2, "group_id": 1}]

    project_id = self.project_id

    message: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        41,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    ReplacementType.TOMBSTONE_EVENTS,
                    {"project_id": project_id, "event_ids": [event_id]},
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    # The user chooses to reprocess a subset of the group and throw away
    # the other events. Event 1 gets manually tombstoned by Sentry while
    # Event 2 prevails.
    processed = self.replacer.process_message(message)
    self.replacer.flush_batch([processed])

    # At this point the count doesn't make any sense but we don't care.
    assert self._issue_count(self.project_id) == [{"count": 2, "group_id": 1}]

    # The reprocessed event is inserted with a guaranteed-new group ID but
    # the *same* event ID (this is why we need to skip tombstoning this
    # event ID).
    self.event["group_id"] = 2
    write_unprocessed_events(self.storage, [self.event])

    message = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    ReplacementType.EXCLUDE_GROUPS,
                    {"project_id": project_id, "group_ids": [1]},
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    # Group 1 is excluded from queries. At this point we have almost a
    # regular group deletion, except only a subset of events have been
    # tombstoned (the ones that will *not* be reprocessed).
    processed = self.replacer.process_message(message)
    self.replacer.flush_batch([processed])

    # Group 2 should contain the one event that the user chose to
    # reprocess, and Group 1 should be gone. (Note: In the product, Group 2
    # looks identical to Group 1, including its short ID.)
    assert self._issue_count(self.project_id) == [{"count": 1, "group_id": 2}]
    assert self._get_group_id(project_id, event_id2) == 2
    assert not self._get_group_id(project_id, event_id)
def test_streaming_consumer_strategy() -> None:
    messages = (
        Message(
            Partition(Topic("events"), 0),
            i,
            KafkaPayload(None, b"{}", []),
            datetime.now(),
        )
        for i in itertools.count()
    )

    replacements_producer = FakeConfluentKafkaProducer()

    processor = Mock()
    processor.process_message.side_effect = [
        None,
        InsertBatch([{}], None),
        ReplacementBatch("key", [{}]),
    ]

    writer = Mock()

    metrics = TestingMetricsBackend()

    def write_step() -> ProcessedMessageBatchWriter:
        return ProcessedMessageBatchWriter(
            insert_batch_writer=InsertBatchWriter(
                writer, MetricsWrapper(metrics, "insertions")
            ),
            replacement_batch_writer=ReplacementBatchWriter(
                replacements_producer, Topic("replacements")
            ),
        )

    factory = KafkaConsumerStrategyFactory(
        None,
        functools.partial(process_message, processor),
        write_step,
        max_batch_size=10,
        max_batch_time=60,
        processes=None,
        input_block_size=None,
        output_block_size=None,
    )

    commit_function = Mock()
    strategy = factory.create(commit_function)

    for i in range(3):
        strategy.poll()
        strategy.submit(next(messages))

    assert metrics.calls == []

    processor.process_message.side_effect = [{}]

    with pytest.raises(TypeError):
        strategy.poll()
        strategy.submit(next(messages))

    def get_number_of_insertion_metrics() -> int:
        count = 0
        for c in metrics.calls:
            if isinstance(c, Timing) and c.name == "insertions.latency_ms":
                count += 1
        return count

    expected_write_count = 1

    with assert_changes(
        get_number_of_insertion_metrics, 0, expected_write_count
    ), assert_changes(
        lambda: writer.write.call_count, 0, expected_write_count
    ), assert_changes(
        lambda: len(replacements_producer.messages), 0, 1
    ):
        strategy.close()
        strategy.join()
    process_message_multistorage,
    process_message_multistorage_identical_storages,
)
from snuba.datasets.storages import StorageKey
from tests.fixtures import get_raw_event, get_raw_transaction

test_data = [
    pytest.param(
        Message(
            Partition(Topic("errors"), 1),
            1,
            MultistorageKafkaPayload(
                [StorageKey.ERRORS, StorageKey.ERRORS_V2],
                KafkaPayload(
                    None,
                    json.dumps((2, "insert", get_raw_event())).encode("utf-8"),
                    [],
                ),
            ),
            datetime.now(),
        ),
        True,
        id="both errors storage",
    ),
    pytest.param(
        Message(
            Partition(Topic("errors"), 1),
            1,
            MultistorageKafkaPayload(
                [StorageKey.TRANSACTIONS, StorageKey.TRANSACTIONS_V2],
                KafkaPayload(
def invalid_message() -> Message[KafkaPayload]:
    invalid_payload = KafkaPayload(None, b"", [])
    return Message(Partition(Topic(""), 0), 0, invalid_payload, datetime.now())
def get_payloads() -> Iterator[KafkaPayload]:
    for i in itertools.count():
        yield KafkaPayload(None, f"{i}".encode("utf8"), [])