Example #1
    def test_batch_size(self, broker: Broker[int]) -> None:
        topic = Topic("topic")
        broker.create_topic(topic, partitions=1)
        producer = broker.get_producer()
        for i in [1, 2, 3]:
            producer.produce(topic, i).result()

        consumer = broker.get_consumer("group")

        worker = FakeWorker()
        batching_consumer = StreamProcessor(
            consumer,
            topic,
            BatchProcessingStrategyFactory(
                worker=worker,
                max_batch_size=2,
                max_batch_time=100,
                metrics=DummyMetricsBackend(strict=True),
            ),
        )

        for _ in range(3):
            batching_consumer._run_once()

        batching_consumer._shutdown()

        # All three messages were processed, but only the first two reached
        # max_batch_size and were flushed; the third is still pending when the
        # processor shuts down.
        assert worker.processed == [1, 2, 3]
        assert worker.flushed == [[1, 2]]
        assert consumer.commit_offsets_calls == 1
        assert consumer.close_calls == 1
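The FakeWorker test double used above is not shown on this page. Assuming the batching worker contract consists of a process_message hook whose return value is collected into the current batch and a flush_batch hook that receives the accumulated batch (both hook names and the message.payload attribute are assumptions, not confirmed by these examples), a minimal recording sketch could look like this:

from typing import Any, MutableSequence, Optional, Sequence


class FakeWorker:
    """Hypothetical test double: records processed payloads and flushed
    batches so tests can assert on worker.processed and worker.flushed."""

    def __init__(self) -> None:
        self.processed: MutableSequence[int] = []
        self.flushed: MutableSequence[Sequence[int]] = []

    def process_message(self, message: Any) -> Optional[int]:
        # Assumed hook: called once per consumed message; the return value
        # is accumulated into the current batch by the strategy.
        self.processed.append(message.payload)
        return message.payload

    def flush_batch(self, batch: Sequence[int]) -> None:
        # Assumed hook: called when max_batch_size or max_batch_time is hit.
        self.flushed.append(batch)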
Example #2
    def test_batch_time(self, mock_time: Any, broker: Broker[int]) -> None:
        topic = Topic("topic")
        broker.create_topic(topic, partitions=1)
        producer = broker.get_producer()
        consumer = broker.get_consumer("group")

        worker = FakeWorker()
        metrics = DummyMetricsBackend(strict=True)
        batching_consumer = StreamProcessor(
            consumer,
            topic,
            BatchProcessingStrategyFactory(
                worker=worker,
                max_batch_size=100,
                max_batch_time=2000,
                metrics=metrics,
            ),
            metrics=metrics,
        )

        # Freeze the mocked clock at a known start time before producing the
        # first group of messages.
        mock_time.return_value = time.mktime(
            datetime(2018, 1, 1, 0, 0, 0).timetuple())

        for i in [1, 2, 3]:
            producer.produce(topic, i).result()

        for _ in range(3):
            batching_consumer._run_once()

        # Advance the clock by one second: still within the 2000 ms
        # max_batch_time, so nothing is flushed yet.
        mock_time.return_value = time.mktime(
            datetime(2018, 1, 1, 0, 0, 1).timetuple())

        for i in [4, 5, 6]:
            producer.produce(topic, i).result()

        for _ in range(3):
            batching_consumer._run_once()

        # Advance the clock well past max_batch_time: the next _run_once
        # flushes messages 1 through 6 as a single batch.
        mock_time.return_value = time.mktime(
            datetime(2018, 1, 1, 0, 0, 5).timetuple())

        for i in [7, 8, 9]:
            producer.produce(topic, i).result()

        for _ in range(3):
            batching_consumer._run_once()

        batching_consumer._shutdown()

        assert worker.processed == [1, 2, 3, 4, 5, 6, 7, 8, 9]
        assert worker.flushed == [[1, 2, 3, 4, 5, 6]]
        assert consumer.commit_offsets_calls == 1
        assert consumer.close_calls == 1
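Note that max_batch_time is expressed in milliseconds: the CLI examples further down pass max_batch_time_ms straight through to it, so the 2000 used here is the two-second window the mocked clock steps across (one second between the first two groups, which therefore stay in one batch; five seconds before the third group, which forces the flush of messages 1 through 6, leaving 7 through 9 unflushed at shutdown).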
Example #3
    def __build_batching_strategy_factory(
        self,
    ) -> BatchProcessingStrategyFactory[KafkaPayload]:
        return BatchProcessingStrategyFactory(
            worker=ConsumerWorker(
                storage=self.storage,
                producer=self.producer,
                replacements_topic=self.replacements_topic,
                metrics=self.metrics,
            ),
            max_batch_size=self.max_batch_size,
            max_batch_time=self.max_batch_time_ms,
            metrics=self.metrics,
        )
Example #4
    def build_snapshot_aware_consumer(
        self, snapshot_id: SnapshotId, transaction_data: TransactionData,
    ) -> StreamProcessor[KafkaPayload]:
        """
        Builds the consumer with a ConsumerWorker able to handle snapshots.
        """
        return self.__build_consumer(
            BatchProcessingStrategyFactory(
                worker=SnapshotAwareWorker(
                    storage=self.storage,
                    producer=self.producer,
                    snapshot_id=snapshot_id,
                    transaction_data=transaction_data,
                    metrics=self.metrics,
                    replacements_topic=self.replacements_topic,
                ),
                max_batch_size=self.max_batch_size,
                max_batch_time=self.max_batch_time_ms,
                metrics=self.metrics,
            )
        )
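The __build_consumer helper called here is not shown. Going only by how StreamProcessor is constructed in the other examples on this page, a hedged sketch of such a helper might look like the following; every self.* attribute it reads is an assumption about the surrounding builder class:

    def __build_consumer(
        self, strategy_factory: BatchProcessingStrategyFactory[KafkaPayload]
    ) -> StreamProcessor[KafkaPayload]:
        # Hypothetical sketch: wrap a KafkaConsumer and the given strategy
        # factory in a StreamProcessor, mirroring the CLI examples below.
        return StreamProcessor(
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    bootstrap_servers=self.bootstrap_servers,  # assumed attribute
                    group_id=self.group_id,                    # assumed attribute
                    auto_offset_reset=self.auto_offset_reset,  # assumed attribute
                ),
            ),
            Topic(self.raw_topic_name),                        # assumed attribute
            strategy_factory,
            metrics=self.metrics,
        )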
Example #5
def subscriptions(
    *,
    dataset_name: str,
    topic: Optional[str],
    partitions: Optional[int],
    commit_log_topic: Optional[str],
    commit_log_groups: Sequence[str],
    consumer_group: str,
    auto_offset_reset: str,
    bootstrap_servers: Sequence[str],
    max_batch_size: int,
    max_batch_time_ms: int,
    max_query_workers: Optional[int],
    schedule_ttl: int,
    result_topic: Optional[str],
    log_level: Optional[str],
    delay_seconds: Optional[int],
) -> None:
    """Evaluates subscribed queries for a dataset."""

    assert result_topic is not None

    setup_logging(log_level)
    setup_sentry()

    dataset = get_dataset(dataset_name)

    if not bootstrap_servers:
        storage = dataset.get_default_entity().get_writable_storage()
        assert storage is not None
        storage_key = storage.get_storage_key().value
        bootstrap_servers = settings.DEFAULT_STORAGE_BROKERS.get(
            storage_key, settings.DEFAULT_BROKERS)

    loader = enforce_table_writer(dataset).get_stream_loader()

    metrics = MetricsWrapper(
        environment.metrics,
        "subscriptions",
        tags={
            "group": consumer_group,
            "dataset": dataset_name
        },
    )

    consumer = TickConsumer(
        SynchronizedConsumer(
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    bootstrap_servers,
                    consumer_group,
                    auto_offset_reset=auto_offset_reset,
                ),
            ),
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    bootstrap_servers,
                    f"subscriptions-commit-log-{uuid.uuid1().hex}",
                    auto_offset_reset="earliest",
                ),
            ),
            (Topic(commit_log_topic) if commit_log_topic is not None else
             Topic(loader.get_commit_log_topic_spec().topic_name)),
            set(commit_log_groups),
        ),
        time_shift=(
            timedelta(seconds=delay_seconds * -1)
            if delay_seconds is not None
            else None
        ),
    )

    producer = ProducerEncodingWrapper(
        KafkaProducer({
            "bootstrap.servers": ",".join(bootstrap_servers),
            "partitioner": "consistent",
            "message.max.bytes": 50000000,  # 50MB, default is 1MB
        }),
        SubscriptionTaskResultEncoder(),
    )

    executor = ThreadPoolExecutor(max_workers=max_query_workers)
    logger.debug("Starting %r with %s workers...", executor,
                 executor._max_workers)
    metrics.gauge("executor.workers", executor._max_workers)

    with closing(consumer), executor, closing(producer):
        batching_consumer = StreamProcessor(
            consumer,
            (Topic(topic) if topic is not None else Topic(
                loader.get_default_topic_spec().topic_name)),
            BatchProcessingStrategyFactory(
                SubscriptionWorker(
                    dataset,
                    executor,
                    {
                        index: SubscriptionScheduler(
                            RedisSubscriptionDataStore(redis_client, dataset,
                                                       PartitionId(index)),
                            PartitionId(index),
                            cache_ttl=timedelta(seconds=schedule_ttl),
                            metrics=metrics,
                        )
                        for index in range(
                            partitions
                            if partitions is not None
                            else loader.get_default_topic_spec().partitions_number
                        )
                    },
                    producer,
                    Topic(result_topic),
                    metrics,
                ),
                max_batch_size,
                max_batch_time_ms,
                metrics,
            ),
            metrics=metrics,
        )

        def handler(signum, frame) -> None:
            batching_consumer.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        batching_consumer.run()
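The SIGINT/SIGTERM handler only calls signal_shutdown; the blocking batching_consumer.run() call then returns after a clean shutdown which, judging by the assertions in the first two examples, commits the consumer's offsets once and closes it once.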
Example #6
def replacer(
    *,
    replacements_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    log_level: Optional[str] = None,
) -> None:

    from snuba.replacer import ReplacerWorker
    from snuba.utils.streams import Topic
    from snuba.utils.streams.backends.kafka import (
        KafkaConsumer,
        TransportError,
        build_kafka_consumer_configuration,
    )
    from snuba.utils.streams.processing import StreamProcessor
    from snuba.utils.streams.processing.strategies.batching import (
        BatchProcessingStrategyFactory,
    )

    setup_logging(log_level)
    setup_sentry()

    storage_key = StorageKey(storage_name)
    storage = get_writable_storage(storage_key)
    metrics_tags = {"group": consumer_group, "storage": storage_name}

    stream_loader = storage.get_table_writer().get_stream_loader()
    default_replacement_topic_spec = stream_loader.get_replacement_topic_spec()
    assert (
        default_replacement_topic_spec is not None
    ), f"Storage {storage.get_storage_key().value} does not have a replacement topic."
    replacements_topic = replacements_topic or default_replacement_topic_spec.topic_name

    metrics = MetricsWrapper(
        environment.metrics,
        "replacer",
        tags=metrics_tags,
    )

    replacer = StreamProcessor(
        KafkaConsumer(
            build_kafka_consumer_configuration(
                bootstrap_servers=bootstrap_server,
                group_id=consumer_group,
                auto_offset_reset=auto_offset_reset,
                queued_max_messages_kbytes=queued_max_messages_kbytes,
                queued_min_messages=queued_min_messages,
            ),
        ),
        Topic(replacements_topic),
        BatchProcessingStrategyFactory(
            worker=ReplacerWorker(storage, metrics=metrics),
            max_batch_size=max_batch_size,
            max_batch_time=max_batch_time_ms,
            metrics=metrics,
        ),
        metrics=metrics,
        recoverable_errors=[TransportError],
    )

    def handler(signum: int, frame: Any) -> None:
        replacer.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    replacer.run()