def test_produce_replacement_messages(self):
    producer = FakeConfluentKafkaProducer()
    replacement_topic = enforce_table_writer(
        self.dataset).get_stream_loader().get_replacement_topic_spec()
    test_worker = ConsumerWorker(self.dataset, producer,
                                 replacement_topic.topic_name, self.metrics)

    test_worker.flush_batch([
        ProcessedMessage(
            action=ProcessorAction.REPLACE,
            data=[('1', {'project_id': 1})],
        ),
        ProcessedMessage(
            action=ProcessorAction.REPLACE,
            data=[('2', {'project_id': 2})],
        ),
    ])

    assert [(m._topic, m._key, m._value) for m in producer.messages] == [
        ('event-replacements', b'1', b'{"project_id": 1}'),
        ('event-replacements', b'2', b'{"project_id": 2}'),
    ]
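# FakeConfluentKafkaProducer is the test double used throughout these tests
# but is not defined in this section. A minimal sketch, assuming only the
# behavior the assertions rely on (produced messages recorded with _topic,
# _key, and _value attributes); a hypothetical reconstruction, not the
# project's actual implementation:
class FakeConfluentKafkaMessage(object):
    def __init__(self, topic, key, value):
        self._topic = topic
        self._key = key
        self._value = value


class FakeConfluentKafkaProducer(object):
    def __init__(self):
        self.messages = []

    def produce(self, topic, value, key=None, on_delivery=None):
        # Record the message instead of talking to a broker, so tests can
        # assert on (m._topic, m._key, m._value) tuples.
        self.messages.append(FakeConfluentKafkaMessage(topic, key, value))

    def poll(self, timeout=None):
        return 0

    def flush(self):
        return 0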
def test_offsets(self):
    event = self.event

    message: Message[KafkaPayload] = Message(
        Partition(Topic("events"), 456),
        123,
        KafkaPayload(
            None, json.dumps((0, "insert", event)).encode("utf-8")
        ),  # event doesn't really matter
        datetime.now(),
    )

    test_worker = ConsumerWorker(
        self.dataset,
        producer=FakeConfluentKafkaProducer(),
        replacements_topic=Topic(
            enforce_table_writer(self.dataset)
            .get_stream_loader()
            .get_replacement_topic_spec()
            .topic_name
        ),
        metrics=self.metrics,
    )

    batch = [test_worker.process_message(message)]
    test_worker.flush_batch(batch)

    assert self.clickhouse.execute(
        "SELECT project_id, event_id, offset, partition FROM %s" % self.table
    ) == [(self.event["project_id"], self.event["event_id"], 123, 456)]
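# The streams primitives used in the newer tests (Topic, Partition, Message,
# KafkaPayload) come from snuba's streams utilities. A minimal sketch of the
# shapes these tests depend on, assuming dataclass-style constructors; note
# that later versions also pass Kafka headers to KafkaPayload (see the
# three-argument call in the eventstream handler below):
from dataclasses import dataclass
from datetime import datetime
from typing import Generic, Optional, Sequence, Tuple, TypeVar

TPayload = TypeVar("TPayload")


@dataclass(frozen=True)
class Topic:
    name: str


@dataclass(frozen=True)
class Partition:
    topic: Topic
    index: int


@dataclass(frozen=True)
class KafkaPayload:
    key: Optional[bytes]
    value: bytes
    headers: Sequence[Tuple[str, bytes]] = ()


@dataclass(frozen=True)
class Message(Generic[TPayload]):
    partition: Partition
    offset: int
    payload: TPayload
    timestamp: datetime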
def test_skip_too_old(self):
    test_worker = ConsumerWorker(
        self.dataset,
        producer=FakeConfluentKafkaProducer(),
        replacements_topic=Topic(
            enforce_table_writer(self.dataset)
            .get_stream_loader()
            .get_replacement_topic_spec()
            .topic_name
        ),
        metrics=self.metrics,
    )

    event = self.event
    old_timestamp = datetime.utcnow() - timedelta(days=300)
    old_timestamp_str = old_timestamp.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
    event["datetime"] = old_timestamp_str
    event["data"]["datetime"] = old_timestamp_str
    event["data"]["received"] = int(calendar.timegm(old_timestamp.timetuple()))

    message: Message[KafkaPayload] = Message(
        Partition(Topic("events"), 1),
        42,
        KafkaPayload(None, json.dumps((0, "insert", event)).encode("utf-8")),
        datetime.now(),
    )

    assert test_worker.process_message(message) is None
def test_produce_replacement_messages(self):
    producer = FakeConfluentKafkaProducer()
    test_worker = ConsumerWorker(
        self.dataset,
        producer=producer,
        replacements_topic=Topic(
            enforce_table_writer(self.dataset)
            .get_stream_loader()
            .get_replacement_topic_spec()
            .topic_name
        ),
        metrics=self.metrics,
    )

    test_worker.flush_batch(
        [
            ProcessedMessage(
                action=ProcessorAction.REPLACE,
                data=[("1", {"project_id": 1})],
            ),
            ProcessedMessage(
                action=ProcessorAction.REPLACE,
                data=[("2", {"project_id": 2})],
            ),
        ]
    )

    assert [(m._topic, m._key, m._value) for m in producer.messages] == [
        ("event-replacements", b"1", b'{"project_id": 1}'),
        ("event-replacements", b"2", b'{"project_id": 2}'),
    ]
def test_produce_replacement_messages(self):
    topic = 'topic'
    producer = FakeKafkaProducer()
    test_worker = ConsumerWorker(self.clickhouse, self.table, producer, topic)

    test_worker.flush_batch([
        (processor.REPLACE, ('1', {'project_id': 1})),
        (processor.REPLACE, ('2', {'project_id': 2})),
    ])

    assert [(m._topic, m._key, m._value) for m in producer.messages] == [
        ('topic', b'1', b'{"project_id": 1}'),
        ('topic', b'2', b'{"project_id": 2}'),
    ]
def test_skip_too_old(self):
    test_worker = ConsumerWorker(self.clickhouse, self.table, FakeKafkaProducer(), 'topic')

    event = self.event
    old_timestamp = datetime.utcnow() - timedelta(days=300)
    old_timestamp_str = old_timestamp.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
    event['datetime'] = old_timestamp_str
    event['data']['datetime'] = old_timestamp_str
    event['data']['received'] = int(calendar.timegm(old_timestamp.timetuple()))

    # Minimal stub: this code path only reads value(), since the worker
    # discards the too-old event before needing offset or partition.
    class FakeMessage(object):
        def value(self):
            return json.dumps((0, 'insert', event))

    assert test_worker.process_message(FakeMessage()) is None
def test_produce_replacement_messages(self):
    producer = FakeKafkaProducer()
    test_worker = ConsumerWorker(
        self.dataset, producer,
        self.dataset.get_default_replacement_topic())

    test_worker.flush_batch([
        (self.dataset.get_processor().REPLACE, ('1', {'project_id': 1})),
        (self.dataset.get_processor().REPLACE, ('2', {'project_id': 2})),
    ])

    assert [(m._topic, m._key, m._value) for m in producer.messages] == [
        ('event-replacements', b'1', b'{"project_id": 1}'),
        ('event-replacements', b'2', b'{"project_id": 2}'),
    ]
def run(events_file, clickhouse, table_name, repeat=1,
        profile_process=False, profile_write=False):
    from snuba.clickhouse import get_table_definition, get_test_engine
    from snuba.consumer import ConsumerWorker

    clickhouse.execute(
        get_table_definition(
            name=table_name,
            engine=get_test_engine(),
        )
    )

    consumer = ConsumerWorker(
        clickhouse=clickhouse,
        dist_table_name=table_name,
        producer=None,
        replacements_topic=None,
    )

    messages = get_messages(events_file)
    messages = chain(*([messages] * repeat))

    processed = []

    def process():
        with settings_override({'DISCARD_OLD_EVENTS': False}):
            for message in messages:
                result = consumer.process_message(message)
                if result is not None:
                    processed.append(result)

    def write():
        consumer.flush_batch(processed)

    time_start = time.time()
    if profile_process:
        cProfile.runctx('process()', globals(), locals(), sort='cumulative')
    else:
        process()
    time_write = time.time()
    if profile_write:
        cProfile.runctx('write()', globals(), locals(), sort='cumulative')
    else:
        write()
    time_finish = time.time()

    format_time = lambda t: ("%.2f" % t).rjust(10, ' ')

    time_to_process = (time_write - time_start) * 1000
    time_to_write = (time_finish - time_write) * 1000
    time_total = (time_finish - time_start) * 1000
    num_events = len(processed)

    logger.info("Number of events: %s" % six.text_type(num_events).rjust(10, ' '))
    logger.info("Total:            %sms" % format_time(time_total))
    logger.info("Total process:    %sms" % format_time(time_to_process))
    logger.info("Total write:      %sms" % format_time(time_to_write))
    logger.info("Process event:    %sms/ea" % format_time(time_to_process / num_events))
    logger.info("Write event:      %sms/ea" % format_time(time_to_write / num_events))
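# get_messages and settings_override are used by the benchmark above but are
# defined elsewhere. A plausible sketch of get_messages, assuming the events
# file holds one raw Kafka payload per line; the FakeMessage shape below is
# hypothetical, chosen to satisfy the duck-typed accessors the older worker
# versions read:
def get_messages(events_file):
    class FakeMessage(object):
        def __init__(self, value):
            self._value = value

        def value(self):
            return self._value

        def partition(self):
            return 0

        def offset(self):
            return 0

    with open(events_file) as f:
        return [FakeMessage(line.rstrip("\n")) for line in f]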
def build_base_consumer(self) -> BatchingConsumer:
    """
    Builds the consumer with a ConsumerWorker.
    """
    return self.__build_consumer(
        ConsumerWorker(
            self.dataset,
            producer=self.producer,
            replacements_topic=self.replacements_topic,
            metrics=self.metrics,
        )
    )
def consumer(raw_events_topic, replacements_topic, commit_log_topic, consumer_group,
             bootstrap_server, clickhouse_server, distributed_table_name,
             max_batch_size, max_batch_time_ms, auto_offset_reset,
             queued_max_messages_kbytes, queued_min_messages, log_level,
             dogstatsd_host, dogstatsd_port):
    import sentry_sdk
    from snuba import util
    from snuba.clickhouse import ClickhousePool
    from batching_kafka_consumer import BatchingKafkaConsumer
    from snuba.consumer import ConsumerWorker

    sentry_sdk.init(dsn=settings.SENTRY_DSN)
    logging.basicConfig(level=getattr(logging, log_level.upper()),
                        format='%(asctime)s %(message)s')

    metrics = util.create_metrics(
        dogstatsd_host, dogstatsd_port, 'snuba.consumer',
        tags=["group:%s" % consumer_group]
    )

    clickhouse = ClickhousePool(
        host=clickhouse_server.split(':')[0],
        port=int(clickhouse_server.split(':')[1]),
        client_settings={
            'load_balancing': 'in_order',
            'insert_distributed_sync': True,
        },
        metrics=metrics,
    )

    producer = Producer({
        'bootstrap.servers': ','.join(bootstrap_server),
        'partitioner': 'consistent',
        'message.max.bytes': 50000000,  # 50MB, default is 1MB
    })

    consumer = BatchingKafkaConsumer(
        raw_events_topic,
        worker=ConsumerWorker(
            clickhouse, distributed_table_name,
            producer=producer, replacements_topic=replacements_topic,
            metrics=metrics,
        ),
        max_batch_size=max_batch_size,
        max_batch_time=max_batch_time_ms,
        metrics=metrics,
        bootstrap_servers=bootstrap_server,
        group_id=consumer_group,
        producer=producer,
        commit_log_topic=commit_log_topic,
        auto_offset_reset=auto_offset_reset,
    )

    def handler(signum, frame):
        consumer.signal_shutdown()

    signal.signal(signal.SIGINT, handler)

    consumer.run()
def test_offsets(self):
    event = self.event

    # Duck-typed stand-in for a confluent_kafka message, exposing only the
    # value/offset/partition accessors the worker reads.
    class FakeMessage(object):
        def value(self):
            # event doesn't really matter
            return json.dumps((0, 'insert', event))

        def offset(self):
            return 123

        def partition(self):
            return 456

    test_worker = ConsumerWorker(self.clickhouse, self.table, FakeKafkaProducer(), 'topic')
    batch = [test_worker.process_message(FakeMessage())]
    test_worker.flush_batch(batch)

    assert self.clickhouse.execute(
        "SELECT project_id, event_id, offset, partition FROM %s" % self.table
    ) == [(self.event['project_id'], self.event['event_id'], 123, 456)]
def eventstream(dataset_name):
    dataset = get_dataset(dataset_name)
    ensure_table_exists(dataset)
    record = json.loads(http_request.data)

    version = record[0]
    if version != 2:
        raise RuntimeError("Unsupported protocol version: %s" % record)

    message = KafkaMessage(
        TopicPartition('topic', 0),
        0,
        http_request.data,
    )

    type_ = record[1]
    metrics = DummyMetricsBackend()
    if type_ == 'insert':
        from snuba.consumer import ConsumerWorker
        worker = ConsumerWorker(dataset, producer=None, replacements_topic=None, metrics=metrics)
    else:
        from snuba.replacer import ReplacerWorker
        worker = ReplacerWorker(clickhouse_rw, dataset, metrics=metrics)

    processed = worker.process_message(message)
    if processed is not None:
        batch = [processed]
        worker.flush_batch(batch)

    return ('ok', 200, {'Content-Type': 'text/plain'})
def eventstream(*, dataset: Dataset):
    record = json.loads(http_request.data)

    version = record[0]
    if version != 2:
        raise RuntimeError("Unsupported protocol version: %s" % record)

    message: Message[KafkaPayload] = Message(
        Partition(Topic("topic"), 0),
        0,
        KafkaPayload(None, http_request.data, []),
        datetime.now(),
    )

    type_ = record[1]

    storage = dataset.get_writable_storage()
    assert storage is not None

    if type_ == "insert":
        from snuba.consumer import ConsumerWorker

        # `metrics` is assumed to come from the enclosing module scope in
        # this version; note the worker now takes a storage, not a dataset.
        worker = ConsumerWorker(storage, metrics=metrics)
    else:
        from snuba.replacer import ReplacerWorker

        worker = ReplacerWorker(storage, metrics=metrics)

    processed = worker.process_message(message)
    if processed is not None:
        batch = [processed]
        worker.flush_batch(batch)

    return ("ok", 200, {"Content-Type": "text/plain"})
def __build_batching_strategy_factory(
    self,
) -> BatchProcessingStrategyFactory[KafkaPayload]:
    return BatchProcessingStrategyFactory(
        worker=ConsumerWorker(
            storage=self.storage,
            producer=self.producer,
            replacements_topic=self.replacements_topic,
            metrics=self.metrics,
        ),
        max_batch_size=self.max_batch_size,
        max_batch_time=self.max_batch_time_ms,
        metrics=self.metrics,
    )
def test_offsets(self):
    event = self.event

    message = KafkaMessage(
        TopicPartition('events', 456),
        123,
        json.dumps((0, 'insert', event)).encode('utf-8'),  # event doesn't really matter
    )

    replacement_topic = enforce_table_writer(
        self.dataset).get_stream_loader().get_replacement_topic_spec()
    test_worker = ConsumerWorker(self.dataset, FakeConfluentKafkaProducer(),
                                 replacement_topic.topic_name, self.metrics)
    batch = [test_worker.process_message(message)]
    test_worker.flush_batch(batch)

    assert self.clickhouse.execute(
        "SELECT project_id, event_id, offset, partition FROM %s" % self.table
    ) == [(self.event['project_id'], self.event['event_id'], 123, 456)]
def test_skip_too_old(self):
    replacement_topic = enforce_table_writer(
        self.dataset).get_stream_loader().get_replacement_topic_spec()
    test_worker = ConsumerWorker(self.dataset, FakeConfluentKafkaProducer(),
                                 replacement_topic.topic_name, self.metrics)

    event = self.event
    old_timestamp = datetime.utcnow() - timedelta(days=300)
    old_timestamp_str = old_timestamp.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
    event['datetime'] = old_timestamp_str
    event['data']['datetime'] = old_timestamp_str
    event['data']['received'] = int(calendar.timegm(old_timestamp.timetuple()))

    message = KafkaMessage(
        TopicPartition('events', 1),
        42,
        json.dumps((0, 'insert', event)).encode('utf-8'),
    )

    assert test_worker.process_message(message) is None
def build_base_consumer(self) -> BatchingConsumer[KafkaPayload]:
    """
    Builds the consumer with a ConsumerWorker.
    """
    return self.__build_consumer(
        ConsumerWorker(
            self.dataset,
            producer=self.producer,
            replacements_topic=self.replacements_topic,
            metrics=self.metrics,
            rapidjson_deserialize=self.__rapidjson_deserialize,
            rapidjson_serialize=self.__rapidjson_serialize,
        )
    )
def test_skip_too_old(self):
    test_worker = ConsumerWorker(
        self.dataset, FakeKafkaProducer(),
        self.dataset.get_default_replacement_topic())

    event = self.event
    old_timestamp = datetime.utcnow() - timedelta(days=300)
    old_timestamp_str = old_timestamp.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
    event['datetime'] = old_timestamp_str
    event['data']['datetime'] = old_timestamp_str
    event['data']['received'] = int(calendar.timegm(old_timestamp.timetuple()))

    class FakeMessage(object):
        def value(self):
            return json.dumps((0, 'insert', event))

        def partition(self):
            return 1

        def offset(self):
            return 42

    assert test_worker.process_message(FakeMessage()) is None
def test_produce_replacement_messages(self):
    producer = FakeConfluentKafkaProducer()
    test_worker = ConsumerWorker(
        self.dataset.get_writable_storage(),
        producer=producer,
        replacements_topic=Topic(
            enforce_table_writer(self.dataset)
            .get_stream_loader()
            .get_replacement_topic_spec()
            .topic_name
        ),
        metrics=self.metrics,
    )

    test_worker.flush_batch([
        ReplacementBatch("1", [{"project_id": 1}]),
        ReplacementBatch("2", [{"project_id": 2}]),
    ])

    assert [(m._topic, m._key, m._value) for m in producer.messages] == [
        ("event-replacements", b"1", b'{"project_id":1}'),
        ("event-replacements", b"2", b'{"project_id":2}'),
    ]
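# ReplacementBatch replaced the older (action, (key, payload)) tuples and
# ProcessedMessage objects as the value type flushed to the replacements
# topic. A minimal sketch consistent with its use above (an assumed shape,
# not the canonical definition from snuba.consumer):
from typing import Any, NamedTuple, Sequence


class ReplacementBatch(NamedTuple):
    key: str
    values: Sequence[Any]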
def eventstream():
    record = json.loads(request.data)

    version = record[0]
    if version != 2:
        raise RuntimeError("Unsupported protocol version: %s" % record)

    class Message(object):
        def __init__(self, value):
            self._value = value

        def value(self):
            return self._value

        def partition(self):
            return None

        def offset(self):
            return None

    message = Message(request.data)

    type_ = record[1]
    if type_ == 'insert':
        from snuba.consumer import ConsumerWorker
        worker = ConsumerWorker(clickhouse_rw, settings.CLICKHOUSE_TABLE,
                                producer=None, replacements_topic=None)
    else:
        from snuba.replacer import ReplacerWorker
        worker = ReplacerWorker(clickhouse_rw, settings.CLICKHOUSE_TABLE)

    processed = worker.process_message(message)
    if processed is not None:
        batch = [processed]
        worker.flush_batch(batch)

    return ('ok', 200, {'Content-Type': 'text/plain'})
def run(events_file, dataset, repeat=1, profile_process=False, profile_write=False): """ Measures the write performance of a dataset """ from snuba.consumer import ConsumerWorker for storage in dataset.get_all_storages(): for statement in storage.get_schemas().get_create_statements(): clickhouse_rw.execute(statement.statement) writable_storage = dataset.get_writable_storage() consumer = ConsumerWorker(writable_storage, metrics=DummyMetricsBackend()) messages = get_messages(events_file) messages = chain(*([messages] * repeat)) processed = [] def process(): with settings_override({"DISCARD_OLD_EVENTS": False}): for message in messages: result = consumer.process_message(message) if result is not None: processed.append(result) def write(): consumer.flush_batch(processed) time_start = time.time() if profile_process: filename = tempfile.NamedTemporaryFile( prefix=os.path.basename(events_file) + ".process.", suffix=".pstats", delete=False, ).name cProfile.runctx("process()", globals(), locals(), filename=filename) logger.info("Profile Data: %s", filename) else: process() time_write = time.time() if profile_write: filename = tempfile.NamedTemporaryFile( prefix=os.path.basename(events_file) + ".write.", suffix=".pstats", delete=False, ).name cProfile.runctx("write()", globals(), locals(), filename=filename) logger.info("Profile Data: %s", filename) else: write() time_finish = time.time() time_to_process = (time_write - time_start) * 1000 time_to_write = (time_finish - time_write) * 1000 time_total = (time_finish - time_start) * 1000 num_events = len(processed) logger.info("Number of events: %s" % str(num_events).rjust(10, " ")) logger.info("Total: %sms" % format_time(time_total)) logger.info("Total process: %sms" % format_time(time_to_process)) logger.info("Total write: %sms" % format_time(time_to_write)) logger.info("Process event: %sms/ea" % format_time(time_to_process / num_events)) logger.info("Write event: %sms/ea" % format_time(time_to_write / num_events))
def run(events_file, dataset, repeat=1, profile_process=False, profile_write=False): """ Measures the write performance of a dataset """ from snuba.consumer import ConsumerWorker from snuba.clickhouse.native import ClickhousePool for statement in dataset.get_dataset_schemas().get_create_statements(): ClickhousePool().execute(statement) consumer = ConsumerWorker( dataset=dataset, producer=None, replacements_topic=None, ) messages = get_messages(events_file) messages = chain(*([messages] * repeat)) processed = [] def process(): with settings_override({'DISCARD_OLD_EVENTS': False}): for message in messages: result = consumer.process_message(message) if result is not None: processed.append(result) def write(): consumer.flush_batch(processed) time_start = time.time() if profile_process: filename = tempfile.NamedTemporaryFile( prefix=os.path.basename(events_file) + '.process.', suffix='.pstats', delete=False, ).name cProfile.runctx('process()', globals(), locals(), filename=filename) logger.info('Profile Data: %s', filename) else: process() time_write = time.time() if profile_write: filename = tempfile.NamedTemporaryFile( prefix=os.path.basename(events_file) + '.write.', suffix='.pstats', delete=False, ).name cProfile.runctx('write()', globals(), locals(), filename=filename) logger.info('Profile Data: %s', filename) else: write() time_finish = time.time() format_time = lambda t: ("%.2f" % t).rjust(10, ' ') time_to_process = (time_write - time_start) * 1000 time_to_write = (time_finish - time_write) * 1000 time_total = (time_finish - time_start) * 1000 num_events = len(processed) logger.info("Number of events: %s" % str(num_events).rjust(10, ' ')) logger.info("Total: %sms" % format_time(time_total)) logger.info("Total process: %sms" % format_time(time_to_process)) logger.info("Total write: %sms" % format_time(time_to_write)) logger.info("Process event: %sms/ea" % format_time(time_to_process / num_events)) logger.info("Write event: %sms/ea" % format_time(time_to_write / num_events))