def __build_consumer(
    self, strategy_factory: ProcessingStrategyFactory[KafkaPayload]
) -> StreamProcessor[KafkaPayload]:
    storage_key = self.storage.get_storage_key()

    configuration = build_kafka_consumer_configuration(
        storage_key,
        bootstrap_servers=self.bootstrap_servers,
        group_id=self.group_id,
        auto_offset_reset=self.auto_offset_reset,
        queued_max_messages_kbytes=self.queued_max_messages_kbytes,
        queued_min_messages=self.queued_min_messages,
    )

    if self.commit_log_topic is None:
        consumer = KafkaConsumer(
            configuration,
            commit_retry_policy=self.__commit_retry_policy,
        )
    else:
        consumer = KafkaConsumerWithCommitLog(
            configuration,
            producer=self.producer,
            commit_log_topic=self.commit_log_topic,
            commit_retry_policy=self.__commit_retry_policy,
        )

    return StreamProcessor(
        consumer,
        self.raw_topic,
        strategy_factory,
        metrics=self.metrics,
        recoverable_errors=[TransportError],
    )
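
# Not part of the original module: a rough sketch of the kind of
# ``ProcessingStrategyFactory`` that ``__build_consumer`` accepts. The factory
# is asked for a new strategy per partition assignment and is handed a
# ``commit`` callable; the strategy then receives each message via ``submit``.
# The import paths and the exact ``Message``/``Partition`` attributes used
# here are assumptions inferred from the surrounding code, not a verified
# part of the streams API.
from typing import Callable, Mapping, Optional

from snuba.utils.streams import Message, Partition
from snuba.utils.streams.backends.kafka import KafkaPayload
from snuba.utils.streams.processing.strategies import (
    ProcessingStrategy,
    ProcessingStrategyFactory,
)


class NoopStrategy(ProcessingStrategy[KafkaPayload]):
    """Commits every message it sees without doing any real work."""

    def __init__(self, commit: Callable[[Mapping[Partition, int]], None]) -> None:
        self.__commit = commit

    def poll(self) -> None:
        pass

    def submit(self, message: Message[KafkaPayload]) -> None:
        # Commit the offset following the message we just "processed".
        self.__commit({message.partition: message.offset + 1})

    def close(self) -> None:
        pass

    def terminate(self) -> None:
        pass

    def join(self, timeout: Optional[float] = None) -> None:
        pass


class NoopStrategyFactory(ProcessingStrategyFactory[KafkaPayload]):
    def create(
        self, commit: Callable[[Mapping[Partition, int]], None]
    ) -> ProcessingStrategy[KafkaPayload]:
        return NoopStrategy(commit)
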
def replacer(
    *,
    replacements_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    log_level: Optional[str] = None,
) -> None:
    from snuba.replacer import ReplacerWorker
    from snuba.utils.streams import Topic
    from snuba.utils.streams.backends.kafka import (
        KafkaConsumer,
        TransportError,
        build_kafka_consumer_configuration,
    )
    from snuba.utils.streams.processing import StreamProcessor
    from snuba.utils.streams.processing.strategies.batching import (
        BatchProcessingStrategyFactory,
    )

    setup_logging(log_level)
    setup_sentry()

    storage_key = StorageKey(storage_name)
    storage = get_writable_storage(storage_key)
    metrics_tags = {"group": consumer_group, "storage": storage_name}

    stream_loader = storage.get_table_writer().get_stream_loader()
    default_replacement_topic_spec = stream_loader.get_replacement_topic_spec()
    assert (
        default_replacement_topic_spec is not None
    ), f"Storage {storage.get_storage_key().value} does not have a replacement topic."
    replacements_topic = (
        replacements_topic or default_replacement_topic_spec.topic_name
    )

    metrics = MetricsWrapper(
        environment.metrics,
        "replacer",
        tags=metrics_tags,
    )

    replacer = StreamProcessor(
        KafkaConsumer(
            build_kafka_consumer_configuration(
                bootstrap_servers=bootstrap_server,
                group_id=consumer_group,
                auto_offset_reset=auto_offset_reset,
                queued_max_messages_kbytes=queued_max_messages_kbytes,
                queued_min_messages=queued_min_messages,
            ),
        ),
        Topic(replacements_topic),
        BatchProcessingStrategyFactory(
            worker=ReplacerWorker(storage, metrics=metrics),
            max_batch_size=max_batch_size,
            max_batch_time=max_batch_time_ms,
            metrics=metrics,
        ),
        metrics=metrics,
        recoverable_errors=[TransportError],
    )

    def handler(signum: int, frame: Any) -> None:
        replacer.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    replacer.run()
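
# Not part of the original command: a minimal sketch of the worker contract
# that ``BatchProcessingStrategyFactory`` drives (``ReplacerWorker`` is a real
# implementation of it). The batching strategy calls ``process_message`` for
# each message and ``flush_batch`` once the batch reaches ``max_batch_size``
# messages or ``max_batch_time`` milliseconds. The ``AbstractBatchWorker``
# import path, its generic parameters, and the ``Message`` shape are
# assumptions here, not verified signatures.
from typing import Optional, Sequence

from snuba.utils.streams import Message
from snuba.utils.streams.backends.kafka import KafkaPayload
from snuba.utils.streams.processing.strategies.batching import AbstractBatchWorker


class CountingWorker(AbstractBatchWorker[KafkaPayload, int]):
    """Collects payload sizes and reports a total per flushed batch."""

    def process_message(self, message: Message[KafkaPayload]) -> Optional[int]:
        # Returning ``None`` would drop the message from the batch.
        return len(message.payload.value)

    def flush_batch(self, batch: Sequence[int]) -> None:
        print(f"flushed {len(batch)} messages, {sum(batch)} bytes total")
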
def subscriptions(
    *,
    dataset_name: str,
    topic: Optional[str],
    partitions: Optional[int],
    commit_log_topic: Optional[str],
    commit_log_groups: Sequence[str],
    consumer_group: str,
    auto_offset_reset: str,
    bootstrap_servers: Sequence[str],
    max_batch_size: int,
    max_batch_time_ms: int,
    max_query_workers: Optional[int],
    schedule_ttl: int,
    result_topic: Optional[str],
    log_level: Optional[str],
    delay_seconds: Optional[int],
) -> None:
    """Evaluates subscribed queries for a dataset."""

    assert result_topic is not None

    setup_logging(log_level)
    setup_sentry()

    dataset = get_dataset(dataset_name)

    if not bootstrap_servers:
        storage = dataset.get_default_entity().get_writable_storage()
        assert storage is not None
        storage_key = storage.get_storage_key().value
        bootstrap_servers = settings.DEFAULT_STORAGE_BROKERS.get(
            storage_key, settings.DEFAULT_BROKERS
        )

    loader = enforce_table_writer(dataset).get_stream_loader()

    metrics = MetricsWrapper(
        environment.metrics,
        "subscriptions",
        tags={"group": consumer_group, "dataset": dataset_name},
    )

    consumer = TickConsumer(
        SynchronizedConsumer(
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    bootstrap_servers,
                    consumer_group,
                    auto_offset_reset=auto_offset_reset,
                ),
            ),
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    bootstrap_servers,
                    f"subscriptions-commit-log-{uuid.uuid1().hex}",
                    auto_offset_reset="earliest",
                ),
            ),
            (
                Topic(commit_log_topic)
                if commit_log_topic is not None
                else Topic(loader.get_commit_log_topic_spec().topic_name)
            ),
            set(commit_log_groups),
        ),
        time_shift=(
            timedelta(seconds=delay_seconds * -1)
            if delay_seconds is not None
            else None
        ),
    )

    producer = ProducerEncodingWrapper(
        KafkaProducer(
            {
                "bootstrap.servers": ",".join(bootstrap_servers),
                "partitioner": "consistent",
                "message.max.bytes": 50000000,  # 50MB, default is 1MB
            }
        ),
        SubscriptionTaskResultEncoder(),
    )

    executor = ThreadPoolExecutor(max_workers=max_query_workers)
    logger.debug("Starting %r with %s workers...", executor, executor._max_workers)
    metrics.gauge("executor.workers", executor._max_workers)

    with closing(consumer), executor, closing(producer):
        batching_consumer = StreamProcessor(
            consumer,
            (
                Topic(topic)
                if topic is not None
                else Topic(loader.get_default_topic_spec().topic_name)
            ),
            BatchProcessingStrategyFactory(
                SubscriptionWorker(
                    dataset,
                    executor,
                    {
                        index: SubscriptionScheduler(
                            RedisSubscriptionDataStore(
                                redis_client, dataset, PartitionId(index)
                            ),
                            PartitionId(index),
                            cache_ttl=timedelta(seconds=schedule_ttl),
                            metrics=metrics,
                        )
                        for index in range(
                            partitions
                            if partitions is not None
                            else loader.get_default_topic_spec().partitions_number
                        )
                    },
                    producer,
                    Topic(result_topic),
                    metrics,
                ),
                max_batch_size,
                max_batch_time_ms,
                metrics,
            ),
            metrics=metrics,
        )

        def handler(signum, frame) -> None:
            batching_consumer.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        batching_consumer.run()
def multistorage_consumer(
    storage_names: Sequence[str],
    consumer_group: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    processes: int,
    input_block_size: int,
    output_block_size: int,
    log_level: Optional[str] = None,
) -> None:
    setup_logging(log_level)
    setup_sentry()

    storages = {
        key: get_writable_storage(key)
        for key in (getattr(StorageKey, name.upper()) for name in storage_names)
    }

    topics = {
        storage.get_table_writer()
        .get_stream_loader()
        .get_default_topic_spec()
        .topic_name
        for storage in storages.values()
    }

    # XXX: The ``StreamProcessor`` only supports a single topic at this time,
    # but is easily modified. The topic routing in the processing strategy is
    # a bit trickier (but also shouldn't be too bad.)
    topic = Topic(topics.pop())
    if topics:
        raise ValueError("only one topic is supported")

    # XXX: The ``CommitLogConsumer`` also only supports a single topic at this
    # time. (It is less easily modified.) This also assumes the commit log
    # topic is on the same Kafka cluster as the input topic.
    commit_log_topics = {
        spec.topic_name
        for spec in (
            storage.get_table_writer().get_stream_loader().get_commit_log_topic_spec()
            for storage in storages.values()
        )
        if spec is not None
    }

    commit_log_topic: Optional[Topic]
    if commit_log_topics:
        commit_log_topic = Topic(commit_log_topics.pop())
    else:
        commit_log_topic = None

    if commit_log_topics:
        raise ValueError("only one commit log topic is supported")

    # XXX: This requires that all storages are associated with the same Kafka
    # cluster so that they can be consumed by the same consumer instance.
    # Unfortunately, we don't have the concept of independently configurable
    # Kafka clusters in settings, only consumer configurations that are
    # associated with storages and/or global default configurations. To avoid
    # implementing yet another method of configuring Kafka clusters, this just
    # piggybacks on the existing configuration method(s), with the assumption
    # that most deployments are going to be using the default configuration.
    storage_keys = [*storages.keys()]

    consumer_configuration = build_kafka_consumer_configuration(
        storage_keys[0],
        consumer_group,
        auto_offset_reset=auto_offset_reset,
        queued_max_messages_kbytes=queued_max_messages_kbytes,
        queued_min_messages=queued_min_messages,
    )

    for storage_key in storage_keys[1:]:
        if (
            build_kafka_consumer_configuration(storage_key, consumer_group)[
                "bootstrap.servers"
            ]
            != consumer_configuration["bootstrap.servers"]
        ):
            raise ValueError(
                "storages cannot be located on different Kafka clusters"
            )

    if commit_log_topic is None:
        consumer = KafkaConsumer(consumer_configuration)
    else:
        # XXX: This relies on the assumptions that a.) the Kafka cluster where
        # the commit log topic is located is the same as the input topic
        # (there is no way to specify otherwise, at writing) and b.) all
        # storages are located on the same Kafka cluster (validated above.)
        producer = ConfluentKafkaProducer(
            build_kafka_producer_configuration(storage_keys[0])
        )
        consumer = KafkaConsumerWithCommitLog(
            consumer_configuration,
            producer=producer,
            commit_log_topic=commit_log_topic,
        )

    metrics = MetricsWrapper(environment.metrics, "consumer")

    processor = StreamProcessor(
        consumer,
        topic,
        MultistorageConsumerProcessingStrategyFactory(
            [*storages.values()],
            max_batch_size,
            max_batch_time_ms / 1000.0,
            processes=processes,
            input_block_size=input_block_size,
            output_block_size=output_block_size,
            metrics=metrics,
        ),
        metrics=metrics,
    )

    def handler(signum: int, frame: Any) -> None:
        processor.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    processor.run()
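
# Not part of the original command: a hedged example of invoking the function
# directly (in the real CLI this is wired up through command-line options).
# The storage names and numeric settings below are illustrative assumptions;
# the names must resolve to writable ``StorageKey`` members (e.g.
# ``StorageKey.ERRORS``) whose topics live on the same Kafka cluster, per the
# validation above.
if __name__ == "__main__":
    multistorage_consumer(
        storage_names=["errors", "transactions"],
        consumer_group="snuba-multistorage-consumers",
        max_batch_size=1000,
        max_batch_time_ms=2000,
        auto_offset_reset="latest",
        queued_max_messages_kbytes=10000,
        queued_min_messages=10000,
        processes=1,
        input_block_size=16_000_000,
        output_block_size=16_000_000,
    )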