def _kafka_producer() -> Producer:
    """
    Return the shared Kafka producer, creating it on the first call.

    The instance is cached in the module-level ``kfk`` global so that every
    later call returns the same producer.
    """
    global kfk
    if kfk is not None:
        return kfk

    producer_overrides = {
        # At time of writing (2022-05-09) lz4 was chosen because it
        # compresses quickly. If more compression is needed at the cost of
        # performance, zstd can be used instead. Recording the query is
        # part of the API request, therefore speed is important.
        # perf-testing: https://indico.fnal.gov/event/16264/contributions/36466/attachments/22610/28037/Zstd__LZ4.pdf
        # By default a topic is configured to use whatever compression
        # method the producer used:
        # https://docs.confluent.io/platform/current/installation/configuration/topic-configs.html#topicconfigs_compression.type
        "compression.type": "lz4",
        # The querylog payloads can get really large so we allow larger
        # messages (double the default). The performance is not business
        # critical and therefore we accept the tradeoff of more bandwidth
        # for more observability/debuggability. For this to be meaningful,
        # the following setting has to be at least as large on the broker:
        #   message.max.bytes=2000000
        "message.max.bytes": 2000000,
    }
    kfk = Producer(
        build_kafka_producer_configuration(
            topic=None,
            override_params=producer_overrides,
        )
    )
    return kfk
def produce_policy_creator() -> DeadLetterQueuePolicy:
    """
    Build the policy that produces all bad messages to the dead-letter topic.

    Both the producer configuration and the destination topic are derived
    from ``Topic.DEAD_LETTER_METRICS``.
    """
    dead_letter = Topic.DEAD_LETTER_METRICS
    dead_letter_producer = KafkaProducer(
        build_kafka_producer_configuration(dead_letter)
    )
    destination = KafkaTopic(dead_letter.value)
    return ProduceInvalidMessagePolicy(dead_letter_producer, destination)
def __build_batch_writer(
    self, storage: WritableTableStorage
) -> ProcessedMessageBatchWriter:
    """
    Assemble the batch writer for ``storage``.

    The returned writer always wraps an ``InsertBatchWriter``. A
    ``ReplacementBatchWriter`` is attached as well when the storage's stream
    loader declares a replacement topic; otherwise ``None`` is passed in its
    place.
    """
    stream_loader = storage.get_table_writer().get_stream_loader()
    replacement_topic_spec = stream_loader.get_replacement_topic_spec()
    default_topic_spec = stream_loader.get_default_topic_spec()

    replacement_batch_writer: Optional[ReplacementBatchWriter] = None
    if replacement_topic_spec is not None:
        # XXX: The producer is flushed when closed on strategy teardown
        # after an assignment is revoked, but never explicitly closed.
        # XXX: This assumes that the Kafka cluster used for the input topic
        # to the storage is the same as the replacement topic: the producer
        # configuration is derived from the *default* topic spec.
        replacement_producer_config = build_kafka_producer_configuration(
            default_topic_spec.topic,
            override_params={
                "partitioner": "consistent",
                "message.max.bytes": 50000000,  # 50MB, default is 1MB
            },
        )
        replacement_batch_writer = ReplacementBatchWriter(
            ConfluentKafkaProducer(replacement_producer_config),
            Topic(replacement_topic_spec.topic_name),
        )

    table_batch_writer = storage.get_table_writer().get_batch_writer(
        self.__metrics,
        {"load_balancing": "in_order", "insert_distributed_sync": 1},
    )
    insertion_metrics = MetricsWrapper(
        self.__metrics,
        "insertions",
        {"storage": storage.get_storage_key().value},
    )
    insert_batch_writer = InsertBatchWriter(table_batch_writer, insertion_metrics)

    return ProcessedMessageBatchWriter(insert_batch_writer, replacement_batch_writer)
def multistorage_consumer(
    storage_names: Sequence[str],
    consumer_group: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
    log_level: Optional[str] = None,
) -> None:
    """
    Build and run a single stream processor that writes to several storages.

    The named storages must agree on their default topic, expose at most one
    distinct commit log topic, and resolve to the same ``bootstrap.servers``;
    a ``ValueError`` is raised otherwise. When ``processes`` is given, missing
    block sizes fall back to a default. SIGINT and SIGTERM handlers are
    installed that signal the processor to shut down, and the function then
    calls ``processor.run()``.
    """
    default_block_size = int(32 * 1e6)
    if processes is not None:
        if input_block_size is None:
            input_block_size = default_block_size
        if output_block_size is None:
            output_block_size = default_block_size

    setup_logging(log_level)
    setup_sentry()

    def stream_loader_for(storage: Any) -> Any:
        # Shorthand for the accessor chain used throughout this function.
        return storage.get_table_writer().get_stream_loader()

    requested_keys = (getattr(StorageKey, name.upper()) for name in storage_names)
    storages = {key: get_writable_storage(key) for key in requested_keys}

    # XXX: The ``StreamProcessor`` only supports a single topic at this time,
    # but is easily modified. The topic routing in the processing strategy is a
    # bit trickier (but also shouldn't be too bad.)
    topic_names = {
        stream_loader_for(storage).get_default_topic_spec().topic_name
        for storage in storages.values()
    }
    topic = Topic(topic_names.pop())
    if topic_names:
        raise ValueError("only one topic is supported")

    # XXX: The ``CommitLogConsumer`` also only supports a single topic at this
    # time. (It is less easily modified.) This also assumes the commit log
    # topic is on the same Kafka cluster as the input topic.
    commit_log_specs = (
        stream_loader_for(storage).get_commit_log_topic_spec()
        for storage in storages.values()
    )
    commit_log_topic_names = {
        spec.topic_name for spec in commit_log_specs if spec is not None
    }
    commit_log_topic: Optional[Topic] = (
        Topic(commit_log_topic_names.pop()) if commit_log_topic_names else None
    )
    # Anything left after the pop means more than one distinct topic.
    if commit_log_topic_names:
        raise ValueError("only one commit log topic is supported")

    # XXX: This requires that all storages are associated with the same Kafka
    # cluster so that they can be consumed by the same consumer instance.
    # Unfortunately, we don't have the concept of independently configurable
    # Kafka clusters in settings, only consumer configurations that are
    # associated with storages and/or global default configurations. To avoid
    # implementing yet another method of configuring Kafka clusters, this just
    # piggybacks on the existing configuration method(s), with the assumption
    # that most deployments are going to be using the default configuration.
    storage_keys = [*storages.keys()]
    first_storage = storages[storage_keys[0]]

    consumer_configuration = build_kafka_consumer_configuration(
        stream_loader_for(first_storage).get_default_topic_spec().topic,
        consumer_group,
        auto_offset_reset=auto_offset_reset,
        queued_max_messages_kbytes=queued_max_messages_kbytes,
        queued_min_messages=queued_min_messages,
    )

    for other_key in storage_keys[1:]:
        other_configuration = build_kafka_consumer_configuration(
            stream_loader_for(storages[other_key]).get_default_topic_spec().topic,
            consumer_group,
        )
        if (
            other_configuration["bootstrap.servers"]
            != consumer_configuration["bootstrap.servers"]
        ):
            raise ValueError("storages cannot be located on different Kafka clusters")

    if commit_log_topic is None:
        consumer = KafkaConsumer(consumer_configuration)
    else:
        # XXX: This relies on the assumption that all storages are located
        # on the same Kafka cluster (validated above), so the first
        # storage's commit log topic spec is used to configure the producer.
        commit_log_topic_spec = stream_loader_for(
            first_storage
        ).get_commit_log_topic_spec()
        assert commit_log_topic_spec is not None
        commit_log_producer = ConfluentKafkaProducer(
            build_kafka_producer_configuration(commit_log_topic_spec.topic)
        )
        consumer = KafkaConsumerWithCommitLog(
            consumer_configuration,
            producer=commit_log_producer,
            commit_log_topic=commit_log_topic,
        )

    metrics = MetricsWrapper(environment.metrics, "consumer")
    configure_metrics(StreamMetricsAdapter(metrics))

    strategy_factory = MultistorageConsumerProcessingStrategyFactory(
        [*storages.values()],
        max_batch_size,
        max_batch_time_ms / 1000.0,
        processes=processes,
        input_block_size=input_block_size,
        output_block_size=output_block_size,
        metrics=metrics,
    )
    processor = StreamProcessor(consumer, topic, strategy_factory)

    def handler(signum: int, frame: Any) -> None:
        processor.signal_shutdown()

    for shutdown_signal in (signal.SIGINT, signal.SIGTERM):
        signal.signal(shutdown_signal, handler)

    processor.run()
def subscriptions(
    *,
    dataset_name: str,
    topic: Optional[str],
    partitions: Optional[int],
    commit_log_topic: Optional[str],
    commit_log_groups: Sequence[str],
    consumer_group: str,
    auto_offset_reset: str,
    bootstrap_servers: Sequence[str],
    max_batch_size: int,
    max_batch_time_ms: int,
    max_query_workers: Optional[int],
    schedule_ttl: int,
    result_topic: Optional[str],
    log_level: Optional[str],
    delay_seconds: Optional[int],
) -> None:
    """Evaluates subscribed queries for a dataset.

    Wires a tick consumer (the main topic synchronized against the commit
    log), a result producer and a thread pool into a ``StreamProcessor`` and
    runs it. ``topic``, ``commit_log_topic``, ``result_topic`` and
    ``partitions`` override the values from the dataset's stream loader when
    they are not ``None``. SIGINT and SIGTERM signal the processor to shut
    down.
    """
    setup_logging(log_level)
    setup_sentry()

    dataset = get_dataset(dataset_name)

    storage = dataset.get_default_entity().get_writable_storage()
    assert (
        storage is not None
    ), f"Dataset {dataset_name} does not have a writable storage by default."

    loader = enforce_table_writer(dataset).get_stream_loader()

    commit_log_topic_spec = loader.get_commit_log_topic_spec()
    assert commit_log_topic_spec is not None

    result_topic_spec = loader.get_subscription_result_topic_spec()
    assert result_topic_spec is not None

    metrics = MetricsWrapper(
        environment.metrics,
        "subscriptions",
        tags={"group": consumer_group, "dataset": dataset_name},
    )

    # Consumer of the dataset's main topic.
    main_consumer = KafkaConsumer(
        build_kafka_consumer_configuration(
            loader.get_default_topic_spec().topic,
            consumer_group,
            auto_offset_reset=auto_offset_reset,
            bootstrap_servers=bootstrap_servers,
        ),
    )
    # Commit log consumer: a fresh, uniquely named group per run that reads
    # from the earliest offset.
    commit_log_consumer = KafkaConsumer(
        build_kafka_consumer_configuration(
            commit_log_topic_spec.topic,
            f"subscriptions-commit-log-{uuid.uuid1().hex}",
            auto_offset_reset="earliest",
            bootstrap_servers=bootstrap_servers,
        ),
    )
    followed_commit_log = Topic(
        commit_log_topic
        if commit_log_topic is not None
        else commit_log_topic_spec.topic_name
    )
    synchronized_consumer = SynchronizedConsumer(
        main_consumer,
        commit_log_consumer,
        followed_commit_log,
        set(commit_log_groups),
    )

    # A configured delay is applied as a negative time shift.
    if delay_seconds is not None:
        time_shift: Optional[timedelta] = timedelta(seconds=delay_seconds * -1)
    else:
        time_shift = None
    consumer = TickConsumer(synchronized_consumer, time_shift=time_shift)

    result_producer_config = build_kafka_producer_configuration(
        loader.get_default_topic_spec().topic,
        bootstrap_servers=bootstrap_servers,
        override_params={
            "partitioner": "consistent",
            "message.max.bytes": 50000000,  # 50MB, default is 1MB
        },
    )
    producer = ProducerEncodingWrapper(
        KafkaProducer(result_producer_config),
        SubscriptionTaskResultEncoder(),
    )

    executor = ThreadPoolExecutor(max_workers=max_query_workers)
    worker_count = getattr(executor, "_max_workers", 0)
    logger.debug("Starting %r with %s workers...", executor, worker_count)
    metrics.gauge("executor.workers", worker_count)

    with closing(consumer), executor, closing(producer):
        from arroyo import configure_metrics

        configure_metrics(StreamMetricsAdapter(metrics))

        input_topic = Topic(
            topic if topic is not None else loader.get_default_topic_spec().topic_name
        )
        partition_count = (
            partitions
            if partitions is not None
            else loader.get_default_topic_spec().partitions_number
        )
        # One scheduler per partition index.
        schedulers = {
            index: SubscriptionScheduler(
                RedisSubscriptionDataStore(redis_client, dataset, PartitionId(index)),
                PartitionId(index),
                cache_ttl=timedelta(seconds=schedule_ttl),
                metrics=metrics,
            )
            for index in range(partition_count)
        }
        output_topic = Topic(
            result_topic if result_topic is not None else result_topic_spec.topic_name
        )
        worker = SubscriptionWorker(
            dataset,
            executor,
            schedulers,
            producer,
            output_topic,
            metrics,
        )
        batching_consumer = StreamProcessor(
            consumer,
            input_topic,
            BatchProcessingStrategyFactory(worker, max_batch_size, max_batch_time_ms),
        )

        def handler(signum: int, frame: Optional[Any]) -> None:
            batching_consumer.signal_shutdown()

        for shutdown_signal in (signal.SIGINT, signal.SIGTERM):
            signal.signal(shutdown_signal, handler)

        batching_consumer.run()
def subscriptions_scheduler_executor(
    *,
    dataset_name: str,
    entity_names: Sequence[str],
    consumer_group: str,
    followed_consumer_group: str,
    max_concurrent_queries: int,
    total_concurrent_queries: int,
    auto_offset_reset: str,
    no_strict_offset_reset: bool,
    schedule_ttl: int,
    delay_seconds: Optional[int],
    stale_threshold_seconds: Optional[int],
    log_level: Optional[str],
    # TODO: Temporarily overrides the scheduling mode.
    # Required for single tenant since some partitions may be empty.
    # To be removed once transactions is no longer semantically partitioned.
    scheduling_mode: Optional[str],
) -> None:
    """
    Combined subscriptions scheduler and executor. Alternative to the separate
    scheduler and executor processes.

    Builds a Kafka producer from the first entity's stream loader, hands it
    to ``build_scheduler_executor_consumer`` and runs the resulting
    processor. SIGINT and SIGTERM signal the processor to shut down; the
    producer is closed and the querylog flushed when the run ends.
    """
    setup_logging(log_level)
    setup_sentry()

    metrics = MetricsWrapper(
        environment.metrics,
        "subscriptions.scheduler_executor",
        tags={"dataset": dataset_name},
    )
    configure_metrics(StreamMetricsAdapter(metrics))

    # Just get the topic configuration from the first entity. Later we check
    # they all have the same result topic anyway before building the consumer.
    first_entity = EntityKey(entity_names[0])
    storage = get_entity(first_entity).get_writable_storage()
    assert storage is not None

    stream_loader = storage.get_table_writer().get_stream_loader()
    # NOTE(review): the producer is described as a *result* producer, yet its
    # configuration comes from the *scheduled* topic spec. Only the spec's
    # ``topic`` is read, and only to build the producer configuration.
    # Confirm whether get_subscription_result_topic_spec() was intended.
    producer_topic_spec = stream_loader.get_subscription_scheduled_topic_spec()
    assert producer_topic_spec is not None

    producer_config = build_kafka_producer_configuration(
        producer_topic_spec.topic,
        override_params={"partitioner": "consistent"},
    )
    producer = KafkaProducer(producer_config)

    strict_offset_reset = not no_strict_offset_reset
    if scheduling_mode is not None:
        watermark_mode = SchedulingWatermarkMode(scheduling_mode)
    else:
        watermark_mode = None

    processor = build_scheduler_executor_consumer(
        dataset_name,
        entity_names,
        consumer_group,
        followed_consumer_group,
        producer,
        auto_offset_reset,
        strict_offset_reset,
        schedule_ttl,
        delay_seconds,
        stale_threshold_seconds,
        max_concurrent_queries,
        total_concurrent_queries,
        metrics,
        watermark_mode,
    )

    def handler(signum: int, frame: Any) -> None:
        processor.signal_shutdown()

    for shutdown_signal in (signal.SIGINT, signal.SIGTERM):
        signal.signal(shutdown_signal, handler)

    with closing(producer), flush_querylog():
        processor.run()
def confirm_load(
    *,
    control_topic: Optional[str],
    bootstrap_server: Sequence[str],
    storage_name: str,
    source: str,
    log_level: Optional[str] = None,
) -> None:
    """
    Confirms the snapshot has been loaded by sending the
    snapshot-loaded message on the control topic.

    The message carries the snapshot descriptor's id and transaction data
    (xmin, xmax, xip_list), JSON-encoded. When ``control_topic`` is falsy the
    storage's default control topic is used. The producer is flushed before
    returning.
    """
    setup_logging(log_level)
    setup_sentry()

    logger = logging.getLogger("snuba.loaded-snapshot")
    logger.info(
        "Sending load completion message for storage %s, from source %s",
        storage_name,
        source,
    )

    storage = get_cdc_storage(StorageKey(storage_name))
    stream_loader = storage.get_table_writer().get_stream_loader()

    if not control_topic:
        control_topic = storage.get_default_control_topic()

    snapshot_source = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT,
        path=source,
    )
    descriptor = snapshot_source.get_descriptor()

    producer_config = build_kafka_producer_configuration(
        stream_loader.get_default_topic_spec().topic,
        bootstrap_servers=bootstrap_server,
        override_params={
            "partitioner": "consistent",
            "message.max.bytes": 50000000,  # 50MB, default is 1MB
        },
    )
    producer = Producer(producer_config)

    transaction_info = TransactionData(
        xmin=descriptor.xmin,
        xmax=descriptor.xmax,
        xip_list=descriptor.xip_list,
    )
    loaded_message = SnapshotLoaded(
        id=descriptor.id,
        transaction_info=transaction_info,
    )
    json_string = json.dumps(loaded_message.to_dict())

    def delivery_callback(error: KafkaError, message: Message) -> None:
        # Re-raise delivery failures; otherwise log what was sent.
        if error is None:
            logger.info("Message sent %r", message.value())
            return
        raise error

    producer.produce(
        control_topic,
        value=json_string,
        on_delivery=delivery_callback,
    )
    producer.flush()
def test_scheduler_consumer() -> None:
    """
    Run a scheduler consumer against the commit log topic and check that two
    messages are produced through the mocked scheduler producer.

    The test overrides ``settings.TOPIC_PARTITION_COUNTS`` and reloads
    ``scheduler_consumer`` so that the override is picked up. The override is
    reset in a ``finally`` block so that a failing assertion (or any other
    exception) cannot leak it into tests that run afterwards.
    """
    settings.TOPIC_PARTITION_COUNTS = {"events": 2}
    try:
        importlib.reload(scheduler_consumer)

        admin_client = AdminClient(get_default_kafka_configuration())
        create_topics(admin_client, [SnubaTopic.COMMIT_LOG])

        metrics_backend = TestingMetricsBackend()
        entity_name = "events"
        entity = get_entity(EntityKey(entity_name))
        storage = entity.get_writable_storage()
        assert storage is not None
        stream_loader = storage.get_table_writer().get_stream_loader()

        commit_log_topic = Topic("snuba-commit-log")

        mock_scheduler_producer = mock.Mock()

        from snuba.redis import redis_client
        from snuba.subscriptions.data import PartitionId, SubscriptionData
        from snuba.subscriptions.entity_subscription import EventsSubscription
        from snuba.subscriptions.store import RedisSubscriptionDataStore

        entity_key = EntityKey(entity_name)
        partition_index = 0

        # Register one subscription on partition 0 for the scheduler to find.
        store = RedisSubscriptionDataStore(
            redis_client, entity_key, PartitionId(partition_index)
        )
        store.create(
            uuid.uuid4(),
            SubscriptionData(
                project_id=1,
                time_window_sec=60,
                resolution_sec=60,
                query="MATCH events SELECT count()",
                entity_subscription=EventsSubscription(data_dict={}),
            ),
        )

        builder = scheduler_consumer.SchedulerBuilder(
            entity_name,
            str(uuid.uuid1().hex),
            "events",
            mock_scheduler_producer,
            "latest",
            False,
            60 * 5,
            None,
            None,
            metrics_backend,
        )
        scheduler = builder.build_consumer()
        time.sleep(2)
        scheduler._run_once()
        scheduler._run_once()
        scheduler._run_once()

        epoch = datetime(1970, 1, 1)

        producer = KafkaProducer(
            build_kafka_producer_configuration(
                stream_loader.get_default_topic_spec().topic,
            )
        )
        try:
            # Commits alternate between the two partitions, one minute apart.
            for (partition, offset, orig_message_ts) in [
                (0, 0, epoch),
                (1, 0, epoch + timedelta(minutes=1)),
                (0, 1, epoch + timedelta(minutes=2)),
                (1, 1, epoch + timedelta(minutes=3)),
            ]:
                fut = producer.produce(
                    commit_log_topic,
                    payload=commit_codec.encode(
                        Commit(
                            "events",
                            Partition(commit_log_topic, partition),
                            offset,
                            orig_message_ts,
                        )
                    ),
                )
                fut.result()
        finally:
            # Close the producer even if a produce call or its future fails.
            producer.close()

        for _ in range(5):
            scheduler._run_once()

        scheduler._shutdown()

        assert mock_scheduler_producer.produce.call_count == 2
    finally:
        # Always undo the global settings override made at the top.
        settings.TOPIC_PARTITION_COUNTS = {}
def __init__(
    self,
    storage_key: StorageKey,
    raw_topic: Optional[str],
    replacements_topic: Optional[str],
    max_batch_size: int,
    max_batch_time_ms: int,
    bootstrap_servers: Sequence[str],
    group_id: str,
    commit_log_topic: Optional[str],
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    metrics: MetricsBackend,
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
    commit_retry_policy: Optional[RetryPolicy] = None,
    profile_path: Optional[str] = None,
) -> None:
    """
    Resolve the storage, Kafka configuration and topics for a consumer.

    Explicit ``raw_topic``, ``replacements_topic`` and ``commit_log_topic``
    arguments win over the storage's stream loader; when they are ``None``
    the loader's topic specs are used (the replacement and commit log topics
    may end up ``None``). When no ``commit_retry_policy`` is supplied, a
    ``BasicRetryPolicy(3, 1, ...)`` is installed that retries on a fixed set
    of ``KafkaException`` error codes.
    """
    self.storage = get_writable_storage(storage_key)
    self.bootstrap_servers = bootstrap_servers

    default_topic = (
        self.storage.get_table_writer()
        .get_stream_loader()
        .get_default_topic_spec()
        .topic
    )

    self.broker_config = get_default_kafka_configuration(
        default_topic, bootstrap_servers=bootstrap_servers
    )
    self.producer_broker_config = build_kafka_producer_configuration(
        default_topic,
        bootstrap_servers=bootstrap_servers,
        override_params={
            "partitioner": "consistent",
            "message.max.bytes": 50000000,  # 50MB, default is 1MB
        },
    )

    stream_loader = self.storage.get_table_writer().get_stream_loader()

    self.raw_topic: Topic = Topic(
        raw_topic
        if raw_topic is not None
        else stream_loader.get_default_topic_spec().topic_name
    )

    self.replacements_topic: Optional[Topic]
    if replacements_topic is not None:
        self.replacements_topic = Topic(replacements_topic)
    else:
        replacement_spec = stream_loader.get_replacement_topic_spec()
        self.replacements_topic = (
            Topic(replacement_spec.topic_name)
            if replacement_spec is not None
            else None
        )

    self.commit_log_topic: Optional[Topic]
    if commit_log_topic is not None:
        self.commit_log_topic = Topic(commit_log_topic)
    else:
        commit_log_spec = stream_loader.get_commit_log_topic_spec()
        self.commit_log_topic = (
            Topic(commit_log_spec.topic_name)
            if commit_log_spec is not None
            else None
        )

    # XXX: This can result in a producer being built in cases where it's
    # not actually required.
    self.producer = Producer(self.producer_broker_config)

    self.metrics = metrics
    self.max_batch_size = max_batch_size
    self.max_batch_time_ms = max_batch_time_ms
    self.group_id = group_id
    self.auto_offset_reset = auto_offset_reset
    self.queued_max_messages_kbytes = queued_max_messages_kbytes
    self.queued_min_messages = queued_min_messages
    self.processes = processes
    self.input_block_size = input_block_size
    self.output_block_size = output_block_size
    self.__profile_path = profile_path

    if commit_retry_policy is None:

        def is_retriable_commit_error(e: Exception) -> bool:
            # Only Kafka exceptions carrying one of these codes are retried.
            if not isinstance(e, KafkaException):
                return False
            return e.args[0].code() in (
                KafkaError.REQUEST_TIMED_OUT,
                KafkaError.NOT_COORDINATOR,
                KafkaError._WAIT_COORD,
            )

        commit_retry_policy = BasicRetryPolicy(3, 1, is_retriable_commit_error)

    self.__commit_retry_policy = commit_retry_policy
def multistorage_consumer(
    storage_names: Sequence[str],
    consumer_group: str,
    commit_log_topic: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    no_strict_offset_reset: bool,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    parallel_collect: bool,
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
    log_level: Optional[str] = None,
    dead_letter_topic: Optional[str] = None,
    cooperative_rebalancing: bool = False,
) -> None:
    """
    Build and run a single stream processor that writes to several storages.

    The named storages must agree on their default topic and resolve to the
    same ``bootstrap.servers``; a ``ValueError`` is raised otherwise. A truthy
    ``commit_log_topic`` is used as given; otherwise the commit log topic is
    taken from the storages, which must expose at most one distinct topic.
    When ``dead_letter_topic`` is set, a dedicated producer is created, passed
    to the strategy factory and closed once the processor stops. SIGINT and
    SIGTERM signal the processor to shut down.
    """
    default_block_size = int(32 * 1e6)
    if processes is not None:
        if input_block_size is None:
            input_block_size = default_block_size
        if output_block_size is None:
            output_block_size = default_block_size

    setup_logging(log_level)
    setup_sentry()

    logger.info("Consumer Starting")

    def stream_loader_for(storage: Any) -> Any:
        # Shorthand for the accessor chain used throughout this function.
        return storage.get_table_writer().get_stream_loader()

    requested_keys = (getattr(StorageKey, name.upper()) for name in storage_names)
    storages = {key: get_writable_storage(key) for key in requested_keys}

    # XXX: The ``StreamProcessor`` only supports a single topic at this time,
    # but is easily modified. The topic routing in the processing strategy is a
    # bit trickier (but also shouldn't be too bad.)
    topic_names = {
        stream_loader_for(storage).get_default_topic_spec().topic_name
        for storage in storages.values()
    }
    topic = Topic(topic_names.pop())
    if topic_names:
        raise ValueError("only one topic is supported")

    commit_log: Optional[Topic]
    if commit_log_topic:
        commit_log = Topic(commit_log_topic)
    else:
        # XXX: The ``CommitLogConsumer`` also only supports a single topic at this
        # time. (It is less easily modified.) This also assumes the commit log
        # topic is on the same Kafka cluster as the input topic.
        commit_log_specs = (
            stream_loader_for(storage).get_commit_log_topic_spec()
            for storage in storages.values()
        )
        commit_log_topic_names = {
            spec.topic_name for spec in commit_log_specs if spec is not None
        }
        commit_log = (
            Topic(commit_log_topic_names.pop()) if commit_log_topic_names else None
        )
        # Anything left after the pop means more than one distinct topic.
        if commit_log_topic_names:
            raise ValueError("only one commit log topic is supported")

    # XXX: This requires that all storages are associated with the same Kafka
    # cluster so that they can be consumed by the same consumer instance.
    # Unfortunately, we don't have the concept of independently configurable
    # Kafka clusters in settings, only consumer configurations that are
    # associated with storages and/or global default configurations. To avoid
    # implementing yet another method of configuring Kafka clusters, this just
    # piggybacks on the existing configuration method(s), with the assumption
    # that most deployments are going to be using the default configuration.
    storage_keys = [*storages.keys()]
    first_storage = storages[storage_keys[0]]

    consumer_configuration = build_kafka_consumer_configuration(
        stream_loader_for(first_storage).get_default_topic_spec().topic,
        consumer_group,
        auto_offset_reset=auto_offset_reset,
        strict_offset_reset=not no_strict_offset_reset,
        queued_max_messages_kbytes=queued_max_messages_kbytes,
        queued_min_messages=queued_min_messages,
    )

    if cooperative_rebalancing is True:
        consumer_configuration["partition.assignment.strategy"] = "cooperative-sticky"

    for other_key in storage_keys[1:]:
        other_configuration = build_kafka_consumer_configuration(
            stream_loader_for(storages[other_key]).get_default_topic_spec().topic,
            consumer_group,
        )
        if (
            other_configuration["bootstrap.servers"]
            != consumer_configuration["bootstrap.servers"]
        ):
            raise ValueError("storages cannot be located on different Kafka clusters")

    metrics = MetricsWrapper(
        environment.metrics,
        "consumer",
        tags={
            "group": consumer_group,
            "storage": "_".join([storage_keys[0].value, "m"]),
        },
    )

    # Collect metrics from librdkafka if we have stats_collection_freq_ms set
    # for the consumer group, or use the default.
    stats_collection_frequency_ms = get_config(
        f"stats_collection_freq_ms_{consumer_group}",
        get_config("stats_collection_freq_ms", 0),
    )

    if stats_collection_frequency_ms and stats_collection_frequency_ms > 0:

        def stats_callback(stats_json: str) -> None:
            stats = rapidjson.loads(stats_json)
            metrics.gauge("librdkafka.total_queue_size", stats.get("replyq", 0))

        consumer_configuration["statistics.interval.ms"] = stats_collection_frequency_ms
        consumer_configuration["stats_cb"] = stats_callback

    if commit_log is None:
        consumer = KafkaConsumer(consumer_configuration)
    else:
        # XXX: This relies on the assumption that all storages are located
        # on the same Kafka cluster (validated above), so the first
        # storage's commit log topic spec is used to configure the producer.
        commit_log_topic_spec = stream_loader_for(
            first_storage
        ).get_commit_log_topic_spec()
        assert commit_log_topic_spec is not None
        commit_log_producer = ConfluentKafkaProducer(
            build_kafka_producer_configuration(commit_log_topic_spec.topic)
        )
        consumer = KafkaConsumerWithCommitLog(
            consumer_configuration,
            producer=commit_log_producer,
            commit_log_topic=commit_log,
        )

    dead_letter_producer: Optional[KafkaProducer] = None
    dead_letter_queue: Optional[Topic] = None
    if dead_letter_topic:
        dead_letter_queue = Topic(dead_letter_topic)
        dead_letter_producer = KafkaProducer(
            build_kafka_producer_configuration(StreamsTopic(dead_letter_topic))
        )

    configure_metrics(StreamMetricsAdapter(metrics))

    strategy_factory = MultistorageConsumerProcessingStrategyFactory(
        [*storages.values()],
        max_batch_size,
        max_batch_time_ms / 1000.0,
        parallel_collect=parallel_collect,
        processes=processes,
        input_block_size=input_block_size,
        output_block_size=output_block_size,
        metrics=metrics,
        producer=dead_letter_producer,
        topic=dead_letter_queue,
    )
    processor = StreamProcessor(consumer, topic, strategy_factory)

    def handler(signum: int, frame: Any) -> None:
        processor.signal_shutdown()

    for shutdown_signal in (signal.SIGINT, signal.SIGTERM):
        signal.signal(shutdown_signal, handler)

    if not dead_letter_producer:
        processor.run()
    else:
        # Close the dead letter producer once the processor stops.
        with closing(dead_letter_producer):
            processor.run()
def test_combined_scheduler_and_executor() -> None:
    """
    Submit a single tick to the combined scheduler/executor strategy and
    check that the commit callback is invoked exactly once.
    """
    state.set_config("subscription_mode_events", "new")
    create_subscription()

    epoch = datetime(1970, 1, 1)

    dataset = get_dataset("events")
    entity_names = ["events"]
    num_partitions = 2
    max_concurrent_queries = 2
    total_concurrent_queries = 2
    metrics = TestingMetricsBackend()

    commit = mock.Mock()
    partitions = mock.Mock()

    commit_log_topic = Topic("snuba-commit-log")
    partition = Partition(commit_log_topic, 0)
    stale_threshold_seconds = None
    result_topic = "events-subscription-results"
    schedule_ttl = 60

    producer = KafkaProducer(
        build_kafka_producer_configuration(SnubaTopic.SUBSCRIPTION_RESULTS_EVENTS)
    )

    with closing(producer):
        factory = CombinedSchedulerExecutorFactory(
            dataset,
            entity_names,
            num_partitions,
            max_concurrent_queries,
            total_concurrent_queries,
            producer,
            metrics,
            stale_threshold_seconds,
            result_topic,
            schedule_ttl,
        )
        strategy = factory.create_with_partitions(commit, partitions)

        tick = Tick(
            0,
            offsets=Interval(1, 3),
            timestamps=Interval(epoch, epoch + timedelta(seconds=60)),
        )
        strategy.submit(Message(partition, 4, tick, epoch))

        # Wait for the query to be executed and the result message produced:
        # poll up to 10 times, half a second apart, stopping at the first commit.
        polls_left = 10
        while polls_left > 0:
            time.sleep(0.5)
            strategy.poll()
            if commit.call_count == 1:
                break
            polls_left -= 1

        assert commit.call_count == 1
        strategy.close()
        strategy.join()
def subscriptions_executor(
    *,
    dataset_name: str,
    entity_names: Sequence[str],
    consumer_group: str,
    max_concurrent_queries: int,
    total_concurrent_queries: int,
    auto_offset_reset: str,
    no_strict_offset_reset: bool,
    log_level: Optional[str],
    stale_threshold_seconds: Optional[int],
    cooperative_rebalancing: bool,
) -> None:
    """
    The subscription's executor consumes scheduled subscriptions from the scheduled
    subscription topic for that entity, executes the queries on ClickHouse and publishes
    results on the results topic.

    The result producer is configured from the first entity's result topic
    spec. ``stale_threshold_seconds`` can be overridden at runtime through
    the ``subscriptions_stale_threshold_sec_<dataset_name>`` config key.
    SIGINT and SIGTERM signal the processor to shut down; the producer is
    closed and the querylog flushed when the run ends.
    """
    setup_logging(log_level)
    setup_sentry()

    metrics = MetricsWrapper(
        environment.metrics,
        "subscriptions.executor",
        tags={"dataset": dataset_name},
    )
    configure_metrics(StreamMetricsAdapter(metrics))

    # Just get the result topic configuration from the first entity. Later we
    # check they all have the same result topic anyway before building the consumer.
    first_entity = EntityKey(entity_names[0])
    storage = get_entity(first_entity).get_writable_storage()
    assert storage is not None

    stream_loader = storage.get_table_writer().get_stream_loader()
    result_topic_spec = stream_loader.get_subscription_result_topic_spec()
    assert result_topic_spec is not None

    producer_config = build_kafka_producer_configuration(
        result_topic_spec.topic,
        override_params={"partitioner": "consistent"},
    )
    producer = KafkaProducer(producer_config)

    # TODO: Consider removing and always passing via CLI.
    # If a value provided via config, it overrides the one provided via CLI.
    # This is so we can quickly change this in an emergency.
    stale_threshold_config_key = f"subscriptions_stale_threshold_sec_{dataset_name}"
    stale_threshold_seconds = state.get_config(
        stale_threshold_config_key, stale_threshold_seconds
    )

    strict_offset_reset = not no_strict_offset_reset
    processor = build_executor_consumer(
        dataset_name,
        entity_names,
        consumer_group,
        producer,
        max_concurrent_queries,
        total_concurrent_queries,
        auto_offset_reset,
        strict_offset_reset,
        metrics,
        stale_threshold_seconds,
        cooperative_rebalancing,
    )

    def handler(signum: int, frame: Any) -> None:
        # TODO: Temporary code for debugging executor shutdown
        root_logger = logging.getLogger()
        root_logger.setLevel(logging.DEBUG)

        processor.signal_shutdown()

    for shutdown_signal in (signal.SIGINT, signal.SIGTERM):
        signal.signal(shutdown_signal, handler)

    with closing(producer), flush_querylog():
        processor.run()
def __init__(
    self,
    storage_key: StorageKey,
    kafka_params: KafkaParameters,
    processing_params: ProcessingParameters,
    max_batch_size: int,
    max_batch_time_ms: int,
    metrics: MetricsBackend,
    parallel_collect: bool,
    stats_callback: Optional[Callable[[str], None]] = None,
    commit_retry_policy: Optional[RetryPolicy] = None,
    profile_path: Optional[str] = None,
    mock_parameters: Optional[MockParameters] = None,
    cooperative_rebalancing: bool = False,
) -> None:
    """
    Resolve the storage, Kafka configuration and topics for a consumer.

    Topic names set on ``kafka_params`` (raw, replacements, commit log) win
    over the storage's stream loader; when they are ``None`` the loader's
    topic specs are used (the replacement and commit log topics may end up
    ``None``). When no ``commit_retry_policy`` is supplied, a
    ``BasicRetryPolicy(3, 1, ...)`` is installed that retries on a fixed set
    of ``KafkaException`` error codes.
    """
    self.storage = get_writable_storage(storage_key)
    self.bootstrap_servers = kafka_params.bootstrap_servers
    self.consumer_group = kafka_params.group_id

    default_topic = (
        self.storage.get_table_writer()
        .get_stream_loader()
        .get_default_topic_spec()
        .topic
    )

    self.broker_config = get_default_kafka_configuration(
        default_topic, bootstrap_servers=kafka_params.bootstrap_servers
    )
    logger.info(f"librdkafka log level: {self.broker_config.get('log_level', 6)}")
    self.producer_broker_config = build_kafka_producer_configuration(
        default_topic,
        bootstrap_servers=kafka_params.bootstrap_servers,
        override_params={
            "partitioner": "consistent",
            "message.max.bytes": 50000000,  # 50MB, default is 1MB
        },
    )

    stream_loader = self.storage.get_table_writer().get_stream_loader()

    self.raw_topic: Topic = Topic(
        kafka_params.raw_topic
        if kafka_params.raw_topic is not None
        else stream_loader.get_default_topic_spec().topic_name
    )

    self.replacements_topic: Optional[Topic]
    if kafka_params.replacements_topic is not None:
        self.replacements_topic = Topic(kafka_params.replacements_topic)
    else:
        replacement_spec = stream_loader.get_replacement_topic_spec()
        self.replacements_topic = (
            Topic(replacement_spec.topic_name)
            if replacement_spec is not None
            else None
        )

    self.commit_log_topic: Optional[Topic]
    if kafka_params.commit_log_topic is not None:
        self.commit_log_topic = Topic(kafka_params.commit_log_topic)
    else:
        commit_log_spec = stream_loader.get_commit_log_topic_spec()
        self.commit_log_topic = (
            Topic(commit_log_spec.topic_name)
            if commit_log_spec is not None
            else None
        )

    self.stats_callback = stats_callback

    # XXX: This can result in a producer being built in cases where it's
    # not actually required.
    self.producer = Producer(self.producer_broker_config)

    self.metrics = metrics
    self.max_batch_size = max_batch_size
    self.max_batch_time_ms = max_batch_time_ms

    # Consumer settings copied from ``kafka_params``.
    self.group_id = kafka_params.group_id
    self.auto_offset_reset = kafka_params.auto_offset_reset
    self.strict_offset_reset = kafka_params.strict_offset_reset
    self.queued_max_messages_kbytes = kafka_params.queued_max_messages_kbytes
    self.queued_min_messages = kafka_params.queued_min_messages

    # Processing settings copied from ``processing_params``.
    self.processes = processing_params.processes
    self.input_block_size = processing_params.input_block_size
    self.output_block_size = processing_params.output_block_size

    self.__profile_path = profile_path
    self.__mock_parameters = mock_parameters
    self.__parallel_collect = parallel_collect
    self.__cooperative_rebalancing = cooperative_rebalancing

    if commit_retry_policy is None:

        def is_retriable_commit_error(e: Exception) -> bool:
            # Only Kafka exceptions carrying one of these codes are retried.
            if not isinstance(e, KafkaException):
                return False
            return e.args[0].code() in (
                KafkaError.REQUEST_TIMED_OUT,
                KafkaError.NOT_COORDINATOR,
                KafkaError._WAIT_COORD,
            )

        commit_retry_policy = BasicRetryPolicy(3, 1, is_retriable_commit_error)

    self.__commit_retry_policy = commit_retry_policy
def test_executor_consumer() -> None:
    """
    End to end integration test

    Creates the scheduled and result subscription topics, subscribes a
    consumer to the result topic, runs an executor consumer, produces one
    scheduled subscription task and asserts that a result message carrying
    the expected subscription id shows up on the result topic.
    """
    state.set_config("subscription_mode_events", "new")
    admin_client = AdminClient(get_default_kafka_configuration())
    create_topics(admin_client, [SnubaTopic.SUBSCRIPTION_SCHEDULED_EVENTS])
    create_topics(admin_client, [SnubaTopic.SUBSCRIPTION_RESULTS_EVENTS])

    dataset_name = "events"
    entity_name = "events"
    entity_key = EntityKey(entity_name)
    entity = get_entity(entity_key)
    storage = entity.get_writable_storage()
    assert storage is not None
    stream_loader = storage.get_table_writer().get_stream_loader()

    scheduled_result_topic_spec = stream_loader.get_subscription_result_topic_spec()
    assert scheduled_result_topic_spec is not None
    result_producer = KafkaProducer(
        build_kafka_producer_configuration(scheduled_result_topic_spec.topic)
    )
    result_consumer = KafkaConsumer(
        build_kafka_consumer_configuration(
            scheduled_result_topic_spec.topic,
            str(uuid.uuid1().hex),
            auto_offset_reset="latest",
            strict_offset_reset=False,
        )
    )

    # Everything below runs under try/finally so that the result consumer
    # and producer are released even when an assertion fails.
    try:
        assigned = False

        def on_partitions_assigned(partitions: Mapping[Partition, int]) -> None:
            nonlocal assigned
            assigned = True

        result_consumer.subscribe(
            [Topic(scheduled_result_topic_spec.topic_name)],
            on_assign=on_partitions_assigned,
        )

        # We need to wait for the consumer to receive partitions otherwise,
        # when we try to consume messages, we will not find anything.
        # Subscription is an async process.
        attempts = 10
        while attempts > 0 and not assigned:
            result_consumer.poll(1.0)
            attempts -= 1

        assert assigned, "Did not receive assignment within 10 attempts"

        consumer_group = str(uuid.uuid1().hex)
        auto_offset_reset = "latest"
        strict_offset_reset = False
        executor = build_executor_consumer(
            dataset_name,
            [entity_name],
            consumer_group,
            result_producer,
            2,
            2,
            auto_offset_reset,
            strict_offset_reset,
            TestingMetricsBackend(),
            None,
        )
        for _ in range(4):
            # Give time to the executor to subscribe
            time.sleep(1)
            executor._run_once()

        # Produce a scheduled task to the scheduled subscriptions topic
        subscription_data = SubscriptionData(
            project_id=1,
            query="MATCH (events) SELECT count()",
            time_window_sec=60,
            resolution_sec=60,
            entity_subscription=EventsSubscription(data_dict={}),
        )

        task = ScheduledSubscriptionTask(
            timestamp=datetime(1970, 1, 1),
            task=SubscriptionWithMetadata(
                entity_key,
                Subscription(
                    SubscriptionIdentifier(
                        PartitionId(1),
                        uuid.UUID("91b46cb6224f11ecb2ddacde48001122"),
                    ),
                    subscription_data,
                ),
                1,
            ),
        )

        encoder = SubscriptionScheduledTaskEncoder()
        encoded_task = encoder.encode(task)

        scheduled_topic_spec = stream_loader.get_subscription_scheduled_topic_spec()
        assert scheduled_topic_spec is not None
        tasks_producer = KafkaProducer(
            build_kafka_producer_configuration(scheduled_topic_spec.topic)
        )
        try:
            scheduled_topic = Topic(scheduled_topic_spec.topic_name)
            # Block until the task has actually been delivered.
            tasks_producer.produce(scheduled_topic, payload=encoded_task).result()
        finally:
            tasks_producer.close()

        executor._run_once()
        executor.signal_shutdown()
        # Call run here so that the executor shuts down itself cleanly.
        executor.run()

        result = result_consumer.poll(5)
        assert result is not None, "Did not receive a result message"
        data = json.loads(result.payload.value)
        assert (
            data["payload"]["subscription_id"] == "1/91b46cb6224f11ecb2ddacde48001122"
        ), "Invalid subscription id"
    finally:
        # The consumer was previously never closed; close it, and make sure
        # the producer is closed even if closing the consumer raises.
        try:
            result_consumer.close()
        finally:
            result_producer.close()
def subscriptions_scheduler(
    *,
    entity_name: str,
    consumer_group: str,
    followed_consumer_group: str,
    auto_offset_reset: str,
    no_strict_offset_reset: bool,
    schedule_ttl: int,
    log_level: Optional[str],
    delay_seconds: Optional[int],
    stale_threshold_seconds: Optional[int],
) -> None:
    """
    The subscriptions scheduler's job is to schedule subscriptions for a single entity.
    It consumes the commit log for that entity which is used as a clock and determines
    which subscriptions to run at each interval. It produces a message for each
    scheduled subscription task to the scheduled subscription topic for that entity, so
    it can be picked up and run by subscription executors.

    The subscriptions scheduler consists of a tick consumer and three processing steps.

    - The tick consumer consumes the commit log and reads the "orig_message_ts" header.
    It constructs a new `Tick` message representing the intervals between each of the
    original messages, which gets passed to the processing strategy. Note: A tick always
    corresponds to a single partition on the original topic (not the commit log topic
    as that is never partitioned).

    - The first processing step is a tick buffer. It buffers ticks where needed and
    determines when to submit them to the rest of the pipeline. The tick buffer behavior
    depends on the watermark mode specified by the entity. In PARTITION mode, ticks are
    never buffered and immediately submitted to the next step. In GLOBAL mode we wait
    (filling the buffer) until the timestamp of a tick has been reached on every
    partition before eventually submitting a tick to the next step. This guarantees that
    a subscription is never scheduled before data on every partition up to that
    timestamp is written to storage.

    - The second processing step provides the strategy for committing offsets. Ticks are
    marked with an `offset_to_commit` if processing that tick allows the committed
    offset to be advanced. Only the earliest commit log offset that has already been seen
    by the strategy will get committed. This guarantees at least once scheduling of
    subscriptions.

    - The third processing step checks the subscription store to determine which
    subscriptions need to be scheduled for each tick. Each scheduled subscription task
    is encoded and produced to the scheduled topic. Offsets are committed if the
    `should_commit` value provided by the previous strategy is true, and only once all
    prior scheduled subscriptions were successfully produced (and replicated).
    """
    setup_logging(log_level)
    setup_sentry()

    metrics = MetricsWrapper(
        environment.metrics,
        "subscriptions.scheduler",
        tags={"entity": entity_name},
    )
    configure_metrics(StreamMetricsAdapter(metrics))

    entity_key = EntityKey(entity_name)
    storage = get_entity(entity_key).get_writable_storage()
    assert (
        storage is not None
    ), f"Entity {entity_name} does not have a writable storage by default."

    # The two thresholds are only compared when both were supplied.
    if stale_threshold_seconds is not None and delay_seconds is not None:
        assert (
            stale_threshold_seconds > delay_seconds
        ), "stale_threshold_seconds must be greater than delay_seconds"

    stream_loader = storage.get_table_writer().get_stream_loader()
    scheduled_topic_spec = stream_loader.get_subscription_scheduled_topic_spec()
    assert scheduled_topic_spec is not None

    producer = KafkaProducer(
        build_kafka_producer_configuration(
            scheduled_topic_spec.topic,
            override_params={"partitioner": "consistent"},
        )
    )

    # Enter the closing() scope as soon as the producer exists, so that it is
    # also closed when building the scheduler or its consumer fails, not only
    # once the processor stops running.
    with closing(producer):
        builder = SchedulerBuilder(
            entity_name,
            consumer_group,
            followed_consumer_group,
            producer,
            auto_offset_reset,
            # The CLI flag is negative; the builder takes the positive form.
            not no_strict_offset_reset,
            schedule_ttl,
            delay_seconds,
            stale_threshold_seconds,
            metrics,
        )

        processor = builder.build_consumer()

        def handler(signum: int, frame: Any) -> None:
            # Ask the processor to shut down on SIGINT / SIGTERM.
            processor.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        processor.run()