def __build_consumer(self, worker: ConsumerWorker) -> BatchingConsumer:
    configuration = build_kafka_consumer_configuration(
        bootstrap_servers=self.bootstrap_servers,
        group_id=self.group_id,
        auto_offset_reset=self.auto_offset_reset,
        queued_max_messages_kbytes=self.queued_max_messages_kbytes,
        queued_min_messages=self.queued_min_messages,
    )

    # Only write to the commit log when a commit log topic is configured.
    if self.commit_log_topic is None:
        consumer = KafkaConsumer(configuration)
    else:
        consumer = KafkaConsumerWithCommitLog(
            configuration, self.producer, self.commit_log_topic,
        )

    # Batch messages for the worker; Kafka transport errors are treated as
    # recoverable rather than crashing the consumer.
    return BatchingConsumer(
        consumer,
        self.raw_topic,
        worker=worker,
        max_batch_size=self.max_batch_size,
        max_batch_time=self.max_batch_time_ms,
        metrics=self.metrics,
        recoverable_errors=[TransportError],
    )

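# Illustrative sketch (an assumption, not the project's actual helper): the
# configuration passed to KafkaConsumer above is presumably a plain
# librdkafka-style settings dict. A helper along these lines would map the
# keyword arguments used here onto the corresponding librdkafka keys; the
# exact keys and defaults used by the real build_kafka_consumer_configuration
# may differ.
def _sketch_build_kafka_consumer_configuration(
    bootstrap_servers, group_id, auto_offset_reset,
    queued_max_messages_kbytes=None, queued_min_messages=None,
):
    configuration = {
        "bootstrap.servers": ",".join(bootstrap_servers),
        "group.id": group_id,
        "auto.offset.reset": auto_offset_reset,
        # Assumption: automatic commits are disabled because the batching
        # consumer commits offsets itself once a batch has been flushed.
        "enable.auto.commit": False,
    }
    if queued_max_messages_kbytes is not None:
        configuration["queued.max.messages.kbytes"] = queued_max_messages_kbytes
    if queued_min_messages is not None:
        configuration["queued.min.messages"] = queued_min_messages
    return configuration
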
def subscriptions(
    *,
    dataset_name: str,
    topic: Optional[str],
    partitions: Optional[int],
    commit_log_topic: Optional[str],
    commit_log_groups: Sequence[str],
    consumer_group: str,
    auto_offset_reset: str,
    bootstrap_servers: Sequence[str],
    max_batch_size: int,
    max_batch_time_ms: int,
    schedule_ttl: int,
    result_topic: Optional[str],
    log_level: Optional[str],
) -> None:
    """Evaluates subscribed queries for a dataset."""

    assert result_topic is not None

    setup_logging(log_level)
    setup_sentry()

    dataset = get_dataset(dataset_name)

    if not bootstrap_servers:
        bootstrap_servers = settings.DEFAULT_DATASET_BROKERS.get(
            dataset_name, settings.DEFAULT_BROKERS
        )

    loader = enforce_table_writer(dataset).get_stream_loader()

    # The main-topic consumer is synchronized against the commit log of the
    # listed consumer groups and wrapped in a TickConsumer that drives the
    # schedulers below.
    consumer = TickConsumer(
        SynchronizedConsumer(
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    bootstrap_servers,
                    consumer_group,
                    auto_offset_reset=auto_offset_reset,
                ),
                PassthroughCodec(),
            ),
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    bootstrap_servers,
                    f"subscriptions-commit-log-{uuid.uuid1().hex}",
                    auto_offset_reset="earliest",
                ),
                CommitCodec(),
            ),
            (
                Topic(commit_log_topic)
                if commit_log_topic is not None
                else Topic(loader.get_commit_log_topic_spec().topic_name)
            ),
            set(commit_log_groups),
        )
    )

    producer = KafkaProducer(
        {
            "bootstrap.servers": ",".join(bootstrap_servers),
            "partitioner": "consistent",
            "message.max.bytes": 50000000,  # 50MB, default is 1MB
        },
        SubscriptionResultCodec(),
    )

    with closing(consumer), closing(producer):
        batching_consumer = BatchingConsumer(
            consumer,
            (
                Topic(topic)
                if topic is not None
                else Topic(loader.get_default_topic_spec().topic_name)
            ),
            SubscriptionWorker(
                SubscriptionExecutor(
                    dataset,
                    ThreadPoolExecutor(
                        max_workers=settings.SUBSCRIPTIONS_MAX_CONCURRENT_QUERIES
                    ),
                ),
                # One Redis-backed scheduler per partition.
                {
                    index: SubscriptionScheduler(
                        RedisSubscriptionDataStore(
                            redis_client, dataset, PartitionId(index)
                        ),
                        PartitionId(index),
                        cache_ttl=timedelta(seconds=schedule_ttl),
                    )
                    for index in range(
                        partitions
                        if partitions is not None
                        else loader.get_default_topic_spec().partitions_number
                    )
                },
                producer,
                Topic(result_topic),
            ),
            max_batch_size,
            max_batch_time_ms,
            create_metrics(
                "snuba.subscriptions",
                tags={"group": consumer_group, "dataset": dataset_name},
            ),
        )

        # Shut down gracefully on SIGINT/SIGTERM.
        def handler(signum, frame) -> None:
            batching_consumer.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        batching_consumer.run()

def replacer(
    *,
    replacements_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    dataset_name: Optional[str],
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    log_level: Optional[str] = None,
) -> None:
    from snuba.clickhouse.native import ClickhousePool
    from snuba.replacer import ReplacerWorker
    from snuba.utils.codecs import PassthroughCodec
    from snuba.utils.streams.batching import BatchingConsumer
    from snuba.utils.streams.kafka import (
        KafkaConsumer,
        KafkaPayload,
        TransportError,
        build_kafka_consumer_configuration,
    )
    from snuba.utils.streams.types import Topic

    setup_logging(log_level)
    setup_sentry()

    storage = get_writable_storage(storage_name)
    metrics_tags = {"group": consumer_group, "storage": storage_name}

    # If dataset_name is provided, use the writable storage from that dataset.
    # This can be removed once we are passing storage_name instead of
    # dataset_name everywhere.
    if dataset_name:
        dataset = get_dataset(dataset_name)
        storage = dataset.get_writable_storage()
        metrics_tags = {"group": consumer_group, "dataset": dataset_name}

    stream_loader = storage.get_table_writer().get_stream_loader()
    default_replacement_topic_spec = stream_loader.get_replacement_topic_spec()
    assert (
        default_replacement_topic_spec is not None
    ), f"Storage {type(storage)} does not have a replacement topic."
    replacements_topic = replacements_topic or default_replacement_topic_spec.topic_name

    metrics = MetricsWrapper(
        environment.metrics,
        "replacer",
        tags=metrics_tags,
    )

    client_settings = {
        # Replacing existing rows requires reconstructing the entire tuple for
        # each event (via a SELECT), which is a Hard Thing (TM) for columnstores
        # to do. With the default settings it's common for ClickHouse to go over
        # the default max_memory_usage of 10GB per query. Lowering the
        # max_block_size reduces memory usage, and increasing the
        # max_memory_usage gives the query more breathing room.
        "max_block_size": settings.REPLACER_MAX_BLOCK_SIZE,
        "max_memory_usage": settings.REPLACER_MAX_MEMORY_USAGE,
        # Don't use up production cache for the count() queries.
        "use_uncompressed_cache": 0,
    }

    clickhouse = ClickhousePool(
        settings.CLICKHOUSE_HOST,
        settings.CLICKHOUSE_PORT,
        client_settings=client_settings,
    )

    codec: PassthroughCodec[KafkaPayload] = PassthroughCodec()
    replacer = BatchingConsumer(
        KafkaConsumer(
            build_kafka_consumer_configuration(
                bootstrap_servers=bootstrap_server,
                group_id=consumer_group,
                auto_offset_reset=auto_offset_reset,
                queued_max_messages_kbytes=queued_max_messages_kbytes,
                queued_min_messages=queued_min_messages,
            ),
            codec=codec,
        ),
        Topic(replacements_topic),
        worker=ReplacerWorker(clickhouse, storage, metrics=metrics),
        max_batch_size=max_batch_size,
        max_batch_time=max_batch_time_ms,
        metrics=metrics,
        recoverable_errors=[TransportError],
    )

    def handler(signum, frame) -> None:
        replacer.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    replacer.run()

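# Illustrative equivalent of the pass-through codec used above (assumption:
# the codec interface is a simple encode/decode pair; the real class lives in
# snuba.utils.codecs and may differ). It hands payloads through unchanged, so
# the replacer worker receives raw KafkaPayload values without any decoding.
from typing import Generic, TypeVar

T = TypeVar("T")


class _SketchPassthroughCodec(Generic[T]):
    def encode(self, value: T) -> T:
        # No serialization: the value is already in wire format.
        return value

    def decode(self, value: T) -> T:
        # No deserialization: the consumed payload is passed through as-is.
        return value
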
def replacer(*, replacements_topic, consumer_group, bootstrap_server,
             clickhouse_host, clickhouse_port, dataset, max_batch_size,
             max_batch_time_ms, auto_offset_reset, queued_max_messages_kbytes,
             queued_min_messages, log_level, dogstatsd_host, dogstatsd_port):
    import sentry_sdk

    from snuba import util
    from snuba.clickhouse.native import ClickhousePool
    from snuba.replacer import ReplacerWorker
    from snuba.utils.streams.batching import BatchingConsumer
    from snuba.utils.streams.kafka import (
        KafkaConsumer,
        TransportError,
        build_kafka_consumer_configuration,
    )

    sentry_sdk.init(dsn=settings.SENTRY_DSN)
    dataset = get_dataset(dataset)

    logging.basicConfig(
        level=getattr(logging, log_level.upper()),
        format='%(asctime)s %(message)s',
    )

    stream_loader = enforce_table_writer(dataset).get_stream_loader()
    default_replacement_topic_spec = stream_loader.get_replacement_topic_spec()
    assert (
        default_replacement_topic_spec is not None
    ), f"Dataset {dataset} does not have a replacement topic."
    replacements_topic = replacements_topic or default_replacement_topic_spec.topic_name

    metrics = util.create_metrics(
        dogstatsd_host, dogstatsd_port, 'snuba.replacer',
        tags={"group": consumer_group},
    )

    client_settings = {
        # Replacing existing rows requires reconstructing the entire tuple for
        # each event (via a SELECT), which is a Hard Thing (TM) for columnstores
        # to do. With the default settings it's common for ClickHouse to go over
        # the default max_memory_usage of 10GB per query. Lowering the
        # max_block_size reduces memory usage, and increasing the
        # max_memory_usage gives the query more breathing room.
        'max_block_size': settings.REPLACER_MAX_BLOCK_SIZE,
        'max_memory_usage': settings.REPLACER_MAX_MEMORY_USAGE,
        # Don't use up production cache for the count() queries.
        'use_uncompressed_cache': 0,
    }

    clickhouse = ClickhousePool(
        host=clickhouse_host,
        port=clickhouse_port,
        client_settings=client_settings,
    )

    replacer = BatchingConsumer(
        KafkaConsumer(
            build_kafka_consumer_configuration(
                bootstrap_servers=bootstrap_server,
                group_id=consumer_group,
                auto_offset_reset=auto_offset_reset,
                queued_max_messages_kbytes=queued_max_messages_kbytes,
                queued_min_messages=queued_min_messages,
            ),
        ),
        replacements_topic,
        worker=ReplacerWorker(clickhouse, dataset, metrics=metrics),
        max_batch_size=max_batch_size,
        max_batch_time=max_batch_time_ms,
        metrics=metrics,
        recoverable_errors=[TransportError],
    )

    def handler(signum, frame):
        replacer.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    replacer.run()

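# Hedged sketch: the keyword-only signatures above are the shape a CLI
# framework such as click hands to a command callback, so the commands are
# presumably declared roughly like the toy below. click.command/click.option
# are real click API calls; the option names, defaults, and help text here are
# illustrative assumptions, not the project's actual option declarations.
import click


@click.command()
@click.option("--consumer-group", default="example-replacers", help="Kafka consumer group id.")
@click.option("--bootstrap-server", multiple=True, help="Kafka bootstrap server; repeatable.")
@click.option("--max-batch-size", default=1000, type=int, help="Max messages per batch.")
@click.option("--log-level", default="INFO", help="Logging level name.")
def example_command(*, consumer_group, bootstrap_server, max_batch_size, log_level):
    """Toy command illustrating the wiring; it only echoes its options."""
    click.echo(
        f"group={consumer_group} servers={list(bootstrap_server)} "
        f"batch={max_batch_size} log={log_level}"
    )


if __name__ == "__main__":
    example_command()
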