def revocation_callback(streams: Sequence[TopicPartition]):
    revocation_callback.called = True
    assert streams == [TopicPartition(topic, 0)]
    assert consumer.tell() == {TopicPartition(topic, 0): 1}

    # Not sure why you'd want to do this, but it shouldn't error.
    consumer.seek({TopicPartition(topic, 0): 0})
def test_batch_time(self, mock_time: Any) -> None:
    # Batches should flush once max_batch_time has elapsed, even though
    # max_batch_size is never reached.
    consumer = FakeKafkaConsumer()
    worker = FakeWorker()
    batching_consumer = BatchingKafkaConsumer(
        consumer,
        'topic',
        worker=worker,
        max_batch_size=100,
        max_batch_time=2000,
        metrics=DummyMetricsBackend(strict=True),
    )

    mock_time.return_value = time.mktime(datetime(2018, 1, 1, 0, 0, 0).timetuple())
    consumer.items = [KafkaMessage(TopicPartition('topic', 0), i, f'{i}'.encode('utf-8')) for i in [1, 2, 3]]
    for x in range(len(consumer.items)):
        batching_consumer._run_once()

    mock_time.return_value = time.mktime(datetime(2018, 1, 1, 0, 0, 1).timetuple())
    consumer.items = [KafkaMessage(TopicPartition('topic', 0), i, f'{i}'.encode('utf-8')) for i in [4, 5, 6]]
    for x in range(len(consumer.items)):
        batching_consumer._run_once()

    mock_time.return_value = time.mktime(datetime(2018, 1, 1, 0, 0, 5).timetuple())
    consumer.items = [KafkaMessage(TopicPartition('topic', 0), i, f'{i}'.encode('utf-8')) for i in [7, 8, 9]]
    for x in range(len(consumer.items)):
        batching_consumer._run_once()

    batching_consumer._shutdown()

    assert worker.processed == [b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9']
    assert worker.flushed == [[b'1', b'2', b'3', b'4', b'5', b'6']]
    assert consumer.commit_calls == 1
    assert consumer.close_calls == 1
def assignment_callback(streams: Sequence[TopicPartition]):
    assignment_callback.called = True
    assert streams == [TopicPartition(topic, 0)]
    assert consumer.tell() == {TopicPartition(topic, 0): 0}
    consumer.seek({TopicPartition(topic, 0): 1})

    with pytest.raises(ConsumerError):
        consumer.seek({TopicPartition(topic, 1): 0})
def eventstream(dataset_name):
    dataset = get_dataset(dataset_name)
    ensure_table_exists(dataset)
    record = json.loads(http_request.data)

    version = record[0]
    if version != 2:
        raise RuntimeError("Unsupported protocol version: %s" % record)

    message = KafkaMessage(
        TopicPartition('topic', 0),
        0,
        http_request.data,
    )

    type_ = record[1]
    metrics = DummyMetricsBackend()
    if type_ == 'insert':
        from snuba.consumer import ConsumerWorker
        worker = ConsumerWorker(dataset, producer=None, replacements_topic=None, metrics=metrics)
    else:
        from snuba.replacer import ReplacerWorker
        worker = ReplacerWorker(clickhouse_rw, dataset, metrics=metrics)

    processed = worker.process_message(message)
    if processed is not None:
        batch = [processed]
        worker.flush_batch(batch)

    return ('ok', 200, {'Content-Type': 'text/plain'})
def test_send_message(
    self,
    message: str,
    expected: Optional[ProcessedMessage],
) -> None:
    dataset = get_dataset("groupedmessage")
    snapshot_id = uuid1()
    transact_data = TransactionData(xmin=100, xmax=200, xip_list=[120, 130])

    worker = SnapshotAwareWorker(
        dataset=dataset,
        producer=FakeConfluentKafkaProducer(),
        snapshot_id=str(snapshot_id),
        transaction_data=transact_data,
        replacements_topic=None,
        metrics=DummyMetricsBackend(strict=True),
    )

    ret = worker.process_message(
        KafkaMessage(
            TopicPartition('topic', 0),
            1,
            message.encode('utf-8'),
        )
    )

    assert ret == expected
def test_unmerge_insert(self):
    self.event['project_id'] = self.project_id
    self.event['group_id'] = 1
    self.event['primary_hash'] = 'a' * 32
    self.write_raw_events(self.event)

    assert self._issue_count(self.project_id) == [{'count': 1, 'issue': 1}]

    timestamp = datetime.now(tz=pytz.utc)
    project_id = self.project_id

    message = KafkaMessage(
        TopicPartition('replacements', 1),
        42,
        json.dumps((2, 'end_unmerge', {
            'project_id': project_id,
            'previous_group_id': 1,
            'new_group_id': 2,
            'hashes': ['a' * 32],
            'datetime': timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
        })).encode('utf-8'),
    )

    processed = self.replacer.process_message(message)
    self.replacer.flush_batch([processed])

    assert self._issue_count(self.project_id) == [{'count': 1, 'issue': 2}]
def test_auto_offset_reset_latest(topic: str) -> None:
    producer = ConfluentProducer(configuration)
    value = uuid.uuid1().hex.encode("utf-8")
    producer.produce(topic, value=value)
    assert producer.flush(5.0) == 0

    consumer = KafkaConsumer(
        {
            **configuration,
            "auto.offset.reset": "latest",
            "enable.auto.commit": "false",
            "enable.auto.offset.store": "true",
            "enable.partition.eof": "true",
            "group.id": "test-latest",
        }
    )

    consumer.subscribe([topic])

    try:
        consumer.poll(10.0)  # XXX: getting the subscription is slow
    except EndOfStream as error:
        assert error.stream == TopicPartition(topic, 0)
        assert error.offset == 1
    else:
        raise AssertionError('expected EndOfStream error')

    consumer.close()
def get_messages(events_file):
    """Create a fake Kafka message for each JSON event in the file."""
    messages = []
    with open(events_file) as f:
        for raw_event in f.readlines():
            messages.append(
                KafkaMessage(TopicPartition('events', 1), 0, raw_event.encode('utf-8')),
            )
    return messages
def test_delete_tag_promoted_insert(self):
    self.event['project_id'] = self.project_id
    self.event['group_id'] = 1
    self.event['data']['tags'].append(['browser.name', 'foo'])
    self.event['data']['tags'].append(['notbrowser', 'foo'])
    self.write_raw_events(self.event)

    project_id = self.project_id

    def _issue_count(total=False):
        return json.loads(
            self.app.post('/query', data=json.dumps({
                'project': [project_id],
                'aggregations': [['count()', '', 'count']],
                'conditions': [['tags[browser.name]', '=', 'foo']] if not total else [],
                'groupby': ['issue'],
            })).data
        )['data']

    assert _issue_count() == [{'count': 1, 'issue': 1}]
    assert _issue_count(total=True) == [{'count': 1, 'issue': 1}]

    timestamp = datetime.now(tz=pytz.utc)

    message = KafkaMessage(
        TopicPartition('replacements', 1),
        42,
        json.dumps((2, 'end_delete_tag', {
            'project_id': project_id,
            'tag': 'browser.name',
            'datetime': timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
        })).encode('utf-8'),
    )

    processed = self.replacer.process_message(message)
    self.replacer.flush_batch([processed])

    assert _issue_count() == []
    assert _issue_count(total=True) == [{'count': 1, 'issue': 1}]
def test_batch_size(self) -> None:
    # Batches should flush as soon as max_batch_size messages have been
    # processed, leaving the remainder unflushed at shutdown.
    consumer = FakeKafkaConsumer()
    worker = FakeWorker()
    batching_consumer = BatchingKafkaConsumer(
        consumer,
        'topic',
        worker=worker,
        max_batch_size=2,
        max_batch_time=100,
        metrics=DummyMetricsBackend(strict=True),
    )

    consumer.items = [KafkaMessage(TopicPartition('topic', 0), i, f'{i}'.encode('utf-8')) for i in [1, 2, 3]]
    for x in range(len(consumer.items)):
        batching_consumer._run_once()
    batching_consumer._shutdown()

    assert worker.processed == [b'1', b'2', b'3']
    assert worker.flushed == [[b'1', b'2']]
    assert consumer.commit_calls == 1
    assert consumer.close_calls == 1
def test_commit_log_consumer(topic: str) -> None:
    # XXX: This would be better as an integration test (or at least a test
    # against an abstract Producer interface) instead of a test against a mock.
    commit_log_producer = FakeConfluentKafkaProducer()

    consumer = KafkaConsumerWithCommitLog(
        {
            **configuration,
            "auto.offset.reset": "earliest",
            "enable.auto.commit": "false",
            "enable.auto.offset.store": "true",
            "enable.partition.eof": "true",
            "group.id": "test",
            "session.timeout.ms": 10000,
        },
        commit_log_producer,
        'commit-log',
    )

    consumer.subscribe([topic])

    producer = ConfluentProducer(configuration)
    producer.produce(topic)
    assert producer.flush(5.0) == 0

    message = consumer.poll(10.0)  # XXX: getting the subscription is slow
    assert isinstance(message, Message)

    assert consumer.commit() == {TopicPartition(topic, 0): message.offset + 1}

    assert len(commit_log_producer.messages) == 1
    commit_message = commit_log_producer.messages[0]
    assert commit_message.topic() == 'commit-log'
    assert commit_message.key() == '{}:{}:{}'.format(topic, 0, 'test').encode('utf-8')
    # Committed offsets are the last processed message offset + 1.
    assert commit_message.value() == '{}'.format(message.offset + 1).encode('utf-8')
def test_skip_too_old(self):
    replacement_topic = enforce_table_writer(
        self.dataset).get_stream_loader().get_replacement_topic_spec()
    test_worker = ConsumerWorker(
        self.dataset,
        FakeConfluentKafkaProducer(),
        replacement_topic.topic_name,
        self.metrics,
    )

    event = self.event
    old_timestamp = datetime.utcnow() - timedelta(days=300)
    old_timestamp_str = old_timestamp.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
    event['datetime'] = old_timestamp_str
    event['data']['datetime'] = old_timestamp_str
    event['data']['received'] = int(calendar.timegm(old_timestamp.timetuple()))

    message = KafkaMessage(
        TopicPartition('events', 1),
        42,
        json.dumps((0, 'insert', event)).encode('utf-8'),
    )

    assert test_worker.process_message(message) is None
def test_offsets(self):
    event = self.event

    message = KafkaMessage(
        TopicPartition('events', 456),
        123,
        json.dumps((0, 'insert', event)).encode('utf-8'),  # event doesn't really matter
    )

    replacement_topic = enforce_table_writer(
        self.dataset).get_stream_loader().get_replacement_topic_spec()
    test_worker = ConsumerWorker(
        self.dataset,
        FakeConfluentKafkaProducer(),
        replacement_topic.topic_name,
        self.metrics,
    )
    batch = [test_worker.process_message(message)]
    test_worker.flush_batch(batch)

    assert self.clickhouse.execute(
        "SELECT project_id, event_id, offset, partition FROM %s" % self.table
    ) == [(self.event['project_id'], self.event['event_id'], 123, 456)]
def test_consumer(topic: str) -> None:
    consumer = KafkaConsumer({
        **configuration,
        "auto.offset.reset": "latest",
        "enable.auto.commit": "false",
        "enable.auto.offset.store": "true",
        "enable.partition.eof": "true",
        "group.id": "test",
        "session.timeout.ms": 10000,
    })

    # TODO: It'd be much nicer if ``subscribe`` returned a future that we could
    # use to wait for assignment, but we'd need to be very careful to avoid
    # edge cases here. It's probably not worth the complexity for now.

    # XXX: There has got to be a better way to do this...
    assignment_callback = mock.MagicMock()
    revocation_callback = mock.MagicMock()

    consumer.subscribe([topic], on_assign=assignment_callback, on_revoke=revocation_callback)

    try:
        consumer.poll(10.0)  # XXX: getting the subscription is slow
    except EndOfStream as error:
        assert error.stream == TopicPartition(topic, 0)
        assert error.offset == 0
    else:
        raise AssertionError('expected EndOfStream error')

    assert assignment_callback.call_args_list == [
        mock.call([TopicPartition(topic, 0)])
    ]

    producer = ConfluentProducer(configuration)
    value = uuid.uuid1().hex.encode("utf-8")
    producer.produce(topic, value=value)
    assert producer.flush(5.0) == 0

    message = consumer.poll(1.0)
    assert isinstance(message, Message)
    assert message.stream == TopicPartition(topic, 0)
    assert message.offset == 0
    assert message.value == value

    try:
        assert consumer.poll(1.0) is None
    except EndOfStream as error:
        assert error.stream == TopicPartition(topic, 0)
        assert error.offset == 1
    else:
        raise AssertionError('expected EndOfStream error')

    assert consumer.commit() == {TopicPartition(topic, 0): message.offset + 1}

    consumer.unsubscribe()

    assert consumer.poll(1.0) is None

    assert revocation_callback.call_args_list == [
        mock.call([TopicPartition(topic, 0)])
    ]

    consumer.close()

    with pytest.raises(RuntimeError):
        consumer.subscribe([topic])

    with pytest.raises(RuntimeError):
        consumer.unsubscribe()

    with pytest.raises(RuntimeError):
        consumer.poll()

    with pytest.raises(RuntimeError):
        consumer.commit()

    consumer.close()
def _wrap(self, msg: str) -> KafkaMessage:
    return KafkaMessage(
        TopicPartition('replacements', 0),
        0,
        json.dumps(msg).encode('utf-8'),
    )
def test_consumer(topic: str) -> None:
    def build_consumer() -> KafkaConsumer:
        return KafkaConsumer(
            {
                **configuration,
                "auto.offset.reset": "earliest",
                "enable.auto.commit": "false",
                "enable.auto.offset.store": "true",
                "enable.partition.eof": "true",
                "group.id": "test",
                "session.timeout.ms": 10000,
            }
        )

    producer = ConfluentProducer(configuration)
    value = uuid.uuid1().hex.encode("utf-8")
    for i in range(2):
        producer.produce(topic, value=value)
    assert producer.flush(5.0) == 0

    consumer = build_consumer()

    def assignment_callback(streams: Sequence[TopicPartition]):
        assignment_callback.called = True
        assert streams == [TopicPartition(topic, 0)]
        assert consumer.tell() == {TopicPartition(topic, 0): 0}
        consumer.seek({TopicPartition(topic, 0): 1})

        with pytest.raises(ConsumerError):
            consumer.seek({TopicPartition(topic, 1): 0})

    def revocation_callback(streams: Sequence[TopicPartition]):
        revocation_callback.called = True
        assert streams == [TopicPartition(topic, 0)]
        assert consumer.tell() == {TopicPartition(topic, 0): 1}

        # Not sure why you'd want to do this, but it shouldn't error.
        consumer.seek({TopicPartition(topic, 0): 0})

    # TODO: It'd be much nicer if ``subscribe`` returned a future that we could
    # use to wait for assignment, but we'd need to be very careful to avoid
    # edge cases here. It's probably not worth the complexity for now.
    consumer.subscribe([topic], on_assign=assignment_callback, on_revoke=revocation_callback)

    message = consumer.poll(10.0)  # XXX: getting the subscription is slow
    assert isinstance(message, Message)
    assert message.stream == TopicPartition(topic, 0)
    assert message.offset == 1
    assert message.value == value

    assert consumer.tell() == {TopicPartition(topic, 0): 2}
    assert getattr(assignment_callback, 'called', False)

    consumer.seek({TopicPartition(topic, 0): 0})
    assert consumer.tell() == {TopicPartition(topic, 0): 0}

    with pytest.raises(ConsumerError):
        consumer.seek({TopicPartition(topic, 1): 0})

    message = consumer.poll(1.0)
    assert isinstance(message, Message)
    assert message.stream == TopicPartition(topic, 0)
    assert message.offset == 0
    assert message.value == value

    assert consumer.commit() == {TopicPartition(topic, 0): message.get_next_offset()}

    consumer.unsubscribe()

    assert consumer.poll(1.0) is None

    assert consumer.tell() == {}

    with pytest.raises(ConsumerError):
        consumer.seek({TopicPartition(topic, 0): 0})

    consumer.close()

    with pytest.raises(RuntimeError):
        consumer.subscribe([topic])

    with pytest.raises(RuntimeError):
        consumer.unsubscribe()

    with pytest.raises(RuntimeError):
        consumer.poll()

    with pytest.raises(RuntimeError):
        consumer.tell()

    with pytest.raises(RuntimeError):
        consumer.seek({TopicPartition(topic, 0): 0})

    with pytest.raises(RuntimeError):
        consumer.commit()

    consumer.close()

    consumer = build_consumer()

    consumer.subscribe([topic])

    message = consumer.poll(10.0)  # XXX: getting the subscription is slow
    assert isinstance(message, Message)
    assert message.stream == TopicPartition(topic, 0)
    assert message.offset == 1
    assert message.value == value

    try:
        assert consumer.poll(1.0) is None
    except EndOfStream as error:
        assert error.stream == TopicPartition(topic, 0)
        assert error.offset == 2
    else:
        raise AssertionError('expected EndOfStream error')

    consumer.close()