def test_batch_time(self, mock_time: Any) -> None:
    consumer = FakeKafkaConsumer()
    worker = FakeWorker()
    batching_consumer = BatchingKafkaConsumer(
        consumer,
        'topic',
        worker=worker,
        max_batch_size=100,
        max_batch_time=2000,
        metrics=DummyMetricsBackend(strict=True),
    )

    # t = 0s: the first three messages are batched but not yet flushed.
    mock_time.return_value = time.mktime(datetime(2018, 1, 1, 0, 0, 0).timetuple())
    consumer.items = [
        KafkaMessage(TopicPartition('topic', 0), i, f'{i}'.encode('utf-8'))
        for i in [1, 2, 3]
    ]
    for x in range(len(consumer.items)):
        batching_consumer._run_once()

    # t = 1s: still within the 2 second max_batch_time, so the batch keeps growing.
    mock_time.return_value = time.mktime(datetime(2018, 1, 1, 0, 0, 1).timetuple())
    consumer.items = [
        KafkaMessage(TopicPartition('topic', 0), i, f'{i}'.encode('utf-8'))
        for i in [4, 5, 6]
    ]
    for x in range(len(consumer.items)):
        batching_consumer._run_once()

    # t = 5s: max_batch_time has elapsed, so the first six messages are flushed;
    # the final three remain unflushed at shutdown.
    mock_time.return_value = time.mktime(datetime(2018, 1, 1, 0, 0, 5).timetuple())
    consumer.items = [
        KafkaMessage(TopicPartition('topic', 0), i, f'{i}'.encode('utf-8'))
        for i in [7, 8, 9]
    ]
    for x in range(len(consumer.items)):
        batching_consumer._run_once()

    batching_consumer._shutdown()

    assert worker.processed == [b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9']
    assert worker.flushed == [[b'1', b'2', b'3', b'4', b'5', b'6']]
    assert consumer.commit_calls == 1
    assert consumer.close_calls == 1
def eventstream(dataset_name):
    dataset = get_dataset(dataset_name)
    ensure_table_exists(dataset)
    record = json.loads(http_request.data)

    # The payload is a (version, type, data) tuple; only event stream protocol
    # version 2 is accepted here.
    version = record[0]
    if version != 2:
        raise RuntimeError("Unsupported protocol version: %s" % record)

    message = KafkaMessage(
        TopicPartition('topic', 0),
        0,
        http_request.data,
    )

    # Inserts are handled by the consumer worker; any other type is treated as a
    # replacement.
    type_ = record[1]
    metrics = DummyMetricsBackend()
    if type_ == 'insert':
        from snuba.consumer import ConsumerWorker
        worker = ConsumerWorker(dataset, producer=None, replacements_topic=None, metrics=metrics)
    else:
        from snuba.replacer import ReplacerWorker
        worker = ReplacerWorker(clickhouse_rw, dataset, metrics=metrics)

    processed = worker.process_message(message)
    if processed is not None:
        batch = [processed]
        worker.flush_batch(batch)

    return ('ok', 200, {'Content-Type': 'text/plain'})
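# A minimal sketch of the payload shapes this endpoint accepts, inferred from the
# version/type checks above (values are illustrative; the replacement types shown
# are the ones exercised by the replacer tests below):
#
#   insert:      [2, "insert", {"project_id": 1, "event_id": "...", ...}]
#   replacement: [2, "end_unmerge", {"project_id": 1, ...}]
#                [2, "end_delete_tag", {"project_id": 1, "tag": "browser.name", ...}]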
def test_unmerge_insert(self):
    self.event['project_id'] = self.project_id
    self.event['group_id'] = 1
    self.event['primary_hash'] = 'a' * 32
    self.write_raw_events(self.event)

    assert self._issue_count(self.project_id) == [{'count': 1, 'issue': 1}]

    timestamp = datetime.now(tz=pytz.utc)
    project_id = self.project_id

    message = KafkaMessage(
        TopicPartition('replacements', 1),
        42,
        json.dumps((2, 'end_unmerge', {
            'project_id': project_id,
            'previous_group_id': 1,
            'new_group_id': 2,
            'hashes': ['a' * 32],
            'datetime': timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
        })).encode('utf-8'),
    )

    processed = self.replacer.process_message(message)
    self.replacer.flush_batch([processed])

    assert self._issue_count(self.project_id) == [{'count': 1, 'issue': 2}]
def test_send_message(
    self,
    message: str,
    expected: Optional[ProcessedMessage],
) -> None:
    dataset = get_dataset("groupedmessage")
    snapshot_id = uuid1()
    transact_data = TransactionData(xmin=100, xmax=200, xip_list=[120, 130])

    worker = SnapshotAwareWorker(
        dataset=dataset,
        producer=FakeConfluentKafkaProducer(),
        snapshot_id=str(snapshot_id),
        transaction_data=transact_data,
        replacements_topic=None,
        metrics=DummyMetricsBackend(strict=True),
    )

    ret = worker.process_message(
        KafkaMessage(
            TopicPartition('topic', 0),
            1,
            message.encode('utf-8'),
        )
    )

    assert ret == expected
def get_messages(events_file):
    """Create a fake Kafka message for each JSON event in the file."""
    messages = []
    with open(events_file) as f:
        for raw_event in f.readlines():
            messages.append(
                KafkaMessage(TopicPartition('events', 1), 0, raw_event.encode('utf-8')),
            )
    return messages
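# Usage sketch (the fixture path and worker wiring are hypothetical, mirroring how
# the consumer tests in this file feed messages to a worker):
#
#   messages = get_messages('tests/fixtures/events.json')
#   for message in messages:
#       processed = worker.process_message(message)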
def test_delete_tag_promoted_insert(self):
    self.event['project_id'] = self.project_id
    self.event['group_id'] = 1
    self.event['data']['tags'].append(['browser.name', 'foo'])
    self.event['data']['tags'].append(['notbrowser', 'foo'])
    self.write_raw_events(self.event)

    project_id = self.project_id

    def _issue_count(total=False):
        return json.loads(
            self.app.post('/query', data=json.dumps({
                'project': [project_id],
                'aggregations': [['count()', '', 'count']],
                'conditions': [['tags[browser.name]', '=', 'foo']] if not total else [],
                'groupby': ['issue'],
            })).data
        )['data']

    assert _issue_count() == [{'count': 1, 'issue': 1}]
    assert _issue_count(total=True) == [{'count': 1, 'issue': 1}]

    timestamp = datetime.now(tz=pytz.utc)

    message = KafkaMessage(
        TopicPartition('replacements', 1),
        42,
        json.dumps((2, 'end_delete_tag', {
            'project_id': project_id,
            'tag': 'browser.name',
            'datetime': timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
        })).encode('utf-8'),
    )

    processed = self.replacer.process_message(message)
    self.replacer.flush_batch([processed])

    assert _issue_count() == []
    assert _issue_count(total=True) == [{'count': 1, 'issue': 1}]
def test_batch_size(self) -> None:
    consumer = FakeKafkaConsumer()
    worker = FakeWorker()
    batching_consumer = BatchingKafkaConsumer(
        consumer,
        'topic',
        worker=worker,
        max_batch_size=2,
        max_batch_time=100,
        metrics=DummyMetricsBackend(strict=True),
    )

    consumer.items = [
        KafkaMessage(TopicPartition('topic', 0), i, f'{i}'.encode('utf-8'))
        for i in [1, 2, 3]
    ]
    for x in range(len(consumer.items)):
        batching_consumer._run_once()
    batching_consumer._shutdown()

    assert worker.processed == [b'1', b'2', b'3']
    assert worker.flushed == [[b'1', b'2']]
    assert consumer.commit_calls == 1
    assert consumer.close_calls == 1
def test_skip_too_old(self):
    replacement_topic = enforce_table_writer(
        self.dataset).get_stream_loader().get_replacement_topic_spec()
    test_worker = ConsumerWorker(
        self.dataset,
        FakeConfluentKafkaProducer(),
        replacement_topic.topic_name,
        self.metrics,
    )

    # An event dated 300 days in the past is older than the consumer will accept,
    # so process_message drops it and returns None.
    event = self.event
    old_timestamp = datetime.utcnow() - timedelta(days=300)
    old_timestamp_str = old_timestamp.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
    event['datetime'] = old_timestamp_str
    event['data']['datetime'] = old_timestamp_str
    event['data']['received'] = int(calendar.timegm(old_timestamp.timetuple()))

    message = KafkaMessage(
        TopicPartition('events', 1),
        42,
        json.dumps((0, 'insert', event)).encode('utf-8'),
    )

    assert test_worker.process_message(message) is None
def test_offsets(self):
    event = self.event

    message = KafkaMessage(
        TopicPartition('events', 456),
        123,
        json.dumps((0, 'insert', event)).encode('utf-8')  # event doesn't really matter
    )

    replacement_topic = enforce_table_writer(
        self.dataset).get_stream_loader().get_replacement_topic_spec()
    test_worker = ConsumerWorker(
        self.dataset,
        FakeConfluentKafkaProducer(),
        replacement_topic.topic_name,
        self.metrics,
    )
    batch = [test_worker.process_message(message)]
    test_worker.flush_batch(batch)

    assert self.clickhouse.execute(
        "SELECT project_id, event_id, offset, partition FROM %s" % self.table
    ) == [(self.event['project_id'], self.event['event_id'], 123, 456)]
def _wrap(self, msg: str) -> KafkaMessage:
    return KafkaMessage(
        TopicPartition('replacements', 0),
        0,
        json.dumps(msg).encode('utf-8'),
    )