Example #1
def bulk_load(
    *,
    dataset_name: Optional[str],
    dest_table: Optional[str],
    source: Optional[str],
    log_level: Optional[str] = None,
) -> None:
    setup_logging(log_level)
    setup_sentry()

    logger = logging.getLogger("snuba.load-snapshot")
    logger.info("Start bulk load process for dataset %s, from source %s",
                dataset_name, source)
    dataset = get_dataset(dataset_name)

    # TODO: Have a more abstract way to load sources if/when we support more than one.
    snapshot_source = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT,
        path=source,
    )

    loader = enforce_table_writer(dataset).get_bulk_loader(
        snapshot_source, dest_table)
    # TODO: see whether we need to pass options to the writer
    writer = BufferedWriterWrapper(
        enforce_table_writer(dataset).get_bulk_writer(table_name=dest_table),
        settings.BULK_CLICKHOUSE_BUFFER,
    )

    loader.load(writer)
Example #2
def optimize(
    *,
    clickhouse_host: Optional[str],
    clickhouse_port: Optional[int],
    storage_name: str,
    parallel: int,
    log_level: Optional[str] = None,
) -> None:
    from datetime import datetime

    from snuba.clickhouse.native import ClickhousePool
    from snuba.optimize import logger, run_optimize

    setup_logging(log_level)
    setup_sentry()

    storage: ReadableTableStorage

    storage_key = StorageKey(storage_name)
    storage = get_storage(storage_key)

    (clickhouse_user, clickhouse_password) = storage.get_cluster().get_credentials()

    today = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)

    database = storage.get_cluster().get_database()

    # TODO: In distributed mode, optimize currently must be run once for each node
    # with the host and port of that node provided via the CLI. In the future,
    # passing this information won't be necessary, and running this command once
    # will ensure that optimize is performed on all of the individual nodes for
    # that cluster.
    if clickhouse_host and clickhouse_port:
        connection = ClickhousePool(
            clickhouse_host,
            clickhouse_port,
            clickhouse_user,
            clickhouse_password,
            database,
            send_receive_timeout=ClickhouseClientSettings.OPTIMIZE.value.timeout,
        )
    elif not storage.get_cluster().is_single_node():
        raise click.ClickException("Provide Clickhouse host and port for optimize")
    else:
        connection = storage.get_cluster().get_query_connection(
            ClickhouseClientSettings.OPTIMIZE
        )

    num_dropped = run_optimize(
        connection,
        storage,
        database,
        before=today,
        parallel=parallel,
        clickhouse_host=clickhouse_host,
    )
    logger.info("Optimized %s partitions on %s" % (num_dropped, clickhouse_host))
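Note for the TODO above: until optimize handles distributed clusters natively, each node has to be targeted individually. A minimal sketch of how that could be scripted, assuming the same ClickhousePool and run_optimize signatures used in this example; the node list is a placeholder, not a real deployment.

from datetime import datetime

from snuba.clickhouse.native import ClickhousePool
from snuba.optimize import run_optimize

# Placeholder addresses; in practice these come from the cluster topology.
NODES = [("clickhouse-001", 9000), ("clickhouse-002", 9000)]


def optimize_all_nodes(storage, database, user, password, parallel=1):
    # Midnight UTC today, matching the `before` cutoff used above.
    today = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
    for host, port in NODES:
        connection = ClickhousePool(host, port, user, password, database)
        run_optimize(
            connection,
            storage,
            database,
            before=today,
            parallel=parallel,
            clickhouse_host=host,
        )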
Example #3
def pytest_configure() -> None:
    """
    Set up the Sentry SDK to avoid errors hidden by configuration.
    Ensure the snuba_test database exists
    """
    assert (
        settings.TESTING
    ), "settings.TESTING is False, try `SNUBA_SETTINGS=test` or `make test`"

    setup_sentry()

    for cluster in CLUSTERS:
        connection = cluster.get_query_connection(
            ClickhouseClientSettings.MIGRATE)
        database_name = cluster.get_database()
        connection.execute(f"DROP DATABASE IF EXISTS {database_name};")
        connection.execute(f"CREATE DATABASE {database_name};")
Example #4
def cleanup(
    *,
    clickhouse_host: Optional[str],
    clickhouse_port: Optional[int],
    dry_run: bool,
    storage_name: str,
    log_level: Optional[str] = None,
) -> None:
    """
    Deletes stale partitions for ClickHouse tables
    """

    setup_logging(log_level)
    setup_sentry()

    from snuba.cleanup import logger, run_cleanup
    from snuba.clickhouse.native import ClickhousePool

    storage = get_writable_storage(StorageKey(storage_name))

    (
        clickhouse_user,
        clickhouse_password,
    ) = storage.get_cluster().get_credentials()

    cluster = storage.get_cluster()
    database = cluster.get_database()

    if clickhouse_host and clickhouse_port:
        connection = ClickhousePool(
            clickhouse_host,
            clickhouse_port,
            clickhouse_user,
            clickhouse_password,
            database,
        )
    elif not cluster.is_single_node():
        raise click.ClickException(
            "Provide ClickHouse host and port for cleanup")
    else:
        connection = cluster.get_query_connection(
            ClickhouseClientSettings.CLEANUP)

    num_dropped = run_cleanup(connection, storage, database, dry_run=dry_run)
    logger.info("Dropped %s partitions on %s" % (num_dropped, cluster))
Example #5
def pytest_configure() -> None:
    """
    Set up the Sentry SDK to avoid errors hidden by configuration.
    Ensure the snuba_test database exists
    """
    assert (
        settings.TESTING
    ), "settings.TESTING is False, try `SNUBA_SETTINGS=test` or `make test`"

    setup_sentry()

    for cluster in settings.CLUSTERS:
        connection = ClickhousePool(
            cluster["host"], cluster["port"], "default", "", "default",
        )
        database_name = cluster["database"]
        connection.execute(f"DROP DATABASE IF EXISTS {database_name};")
        connection.execute(f"CREATE DATABASE {database_name};")
Example #6
def bulk_load(
    *,
    storage_name: str,
    dest_table: str,
    source: str,
    log_level: Optional[str] = None,
) -> None:
    setup_logging(log_level)
    setup_sentry()

    logger = logging.getLogger("snuba.load-snapshot")
    logger.info("Start bulk load process for storage %s, from source %s",
                storage_name, source)

    storage = get_cdc_storage(StorageKey(storage_name))
    table_writer = storage.get_table_writer()

    # TODO: Have a more abstract way to load sources if/when we support more than one.
    snapshot_source = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT,
        path=source,
    )

    loader = table_writer.get_bulk_loader(
        snapshot_source,
        storage.get_postgres_table(),
        dest_table,
        storage.get_row_processor(),
    )
    # TODO: see whether we need to pass options to the writer
    writer = BufferedWriterWrapper(
        table_writer.get_batch_writer(
            environment.metrics,
            table_name=dest_table,
            chunk_size=settings.BULK_CLICKHOUSE_BUFFER,
        ),
        settings.BULK_CLICKHOUSE_BUFFER,
        JSONRowEncoder(),
    )

    loader.load(writer)
Example #7
def pytest_configure() -> None:
    """
    Set up the Sentry SDK to avoid errors hidden by configuration.
    Ensure the snuba_test database exists
    """
    setup_sentry()

    # There is only one cluster in test, so fetch the host from there.
    cluster = settings.CLUSTERS[0]

    connection = ClickhousePool(
        cluster["host"],
        cluster["port"],
        "default",
        "",
        "default",
    )

    database_name = cluster["database"]
    connection.execute(f"DROP DATABASE IF EXISTS {database_name};")
    connection.execute(f"CREATE DATABASE {database_name};")
Example #8
def pytest_configure() -> None:
    """
    Set up the Sentry SDK to avoid errors hidden by configuration.
    Ensure the snuba_test database exists
    """
    assert (
        settings.TESTING
    ), "settings.TESTING is False, try `SNUBA_SETTINGS=test` or `make test`"

    setup_sentry()

    for cluster in settings.CLUSTERS:
        clickhouse_cluster = ClickhouseCluster(
            host=cluster["host"],
            port=cluster["port"],
            user="******",
            password="",
            database="default",
            http_port=cluster["http_port"],
            storage_sets=cluster["storage_sets"],
            single_node=cluster["single_node"],
            cluster_name=cluster["cluster_name"]
            if "cluster_name" in cluster else None,
            distributed_cluster_name=cluster["distributed_cluster_name"]
            if "distributed_cluster_name" in cluster else None,
        )

        database_name = cluster["database"]
        nodes = [
            *clickhouse_cluster.get_local_nodes(),
            *clickhouse_cluster.get_distributed_nodes(),
        ]

        for node in nodes:
            connection = clickhouse_cluster.get_node_connection(
                ClickhouseClientSettings.MIGRATE, node)
            connection.execute(f"DROP DATABASE IF EXISTS {database_name};")
            connection.execute(f"CREATE DATABASE {database_name};")
Example #9
def confirm_load(
    *,
    control_topic: Optional[str],
    bootstrap_server: Sequence[str],
    dataset_name: str,
    source: Optional[str],
    log_level: Optional[str] = None,
) -> None:
    """
    Confirms the snapshot has been loaded by sending the
    snapshot-loaded message on the control topic.
    """

    setup_logging(log_level)
    setup_sentry()

    logger = logging.getLogger("snuba.loaded-snapshot")
    logger.info(
        "Sending load completion message for dataset %s, from source %s",
        dataset_name,
        source,
    )

    dataset = get_dataset(dataset_name)

    storage = dataset.get_writable_storage()

    assert isinstance(
        storage, CdcStorage
    ), "Only CDC storages have a control topic thus are supported."

    control_topic = control_topic or storage.get_default_control_topic()

    snapshot_source = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT,
        path=source,
    )

    descriptor = snapshot_source.get_descriptor()

    if not bootstrap_server:
        bootstrap_server = settings.DEFAULT_DATASET_BROKERS.get(
            dataset,
            settings.DEFAULT_BROKERS,
        )

    producer = Producer({
        "bootstrap.servers": ",".join(bootstrap_server),
        "partitioner": "consistent",
        "message.max.bytes": 50000000,  # 50MB, default is 1MB
    })

    msg = SnapshotLoaded(
        id=descriptor.id,
        transaction_info=TransactionData(
            xmin=descriptor.xmin,
            xmax=descriptor.xmax,
            xip_list=descriptor.xip_list,
        ),
    )
    json_string = json.dumps(msg.to_dict())

    def delivery_callback(error, message) -> None:
        if error is not None:
            raise error
        else:
            logger.info("Message sent %r", message.value())

    producer.produce(
        control_topic,
        value=json_string,
        on_delivery=delivery_callback,
    )

    producer.flush()
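The produce-and-flush pattern above (raise on delivery failure, log on success) can be reproduced in isolation. A minimal sketch assuming the confluent-kafka-python Producer, a local broker and an illustrative topic name:

import json

from confluent_kafka import Producer

producer = Producer({"bootstrap.servers": "localhost:9092"})  # assumed broker


def delivery_callback(error, message) -> None:
    # Invoked from poll()/flush() once the broker acknowledges (or rejects)
    # the message; re-raising surfaces delivery failures to the caller.
    if error is not None:
        raise error
    print("Message sent %r" % message.value())


payload = json.dumps({"event": "snapshot-loaded"})  # illustrative payload
producer.produce("control-topic", value=payload, on_delivery=delivery_callback)
producer.flush()  # blocks until all delivery callbacks have fired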
Example #10
def replacer(
    *,
    replacements_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    dataset_name: Optional[str],
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    log_level: Optional[str] = None,
) -> None:

    from snuba.clickhouse.native import ClickhousePool
    from snuba.replacer import ReplacerWorker
    from snuba.utils.codecs import PassthroughCodec
    from snuba.utils.streams.batching import BatchingConsumer
    from snuba.utils.streams.kafka import (
        KafkaConsumer,
        KafkaPayload,
        TransportError,
        build_kafka_consumer_configuration,
    )
    from snuba.utils.streams.types import Topic

    setup_logging(log_level)
    setup_sentry()

    storage = get_writable_storage(storage_name)
    metrics_tags = {"group": consumer_group, "storage": storage_name}

    # If dataset_name is provided, use the writable storage from that dataset.
    # This can be removed once we are passing storage_name instead of
    # dataset_name everywhere
    if dataset_name:
        dataset = get_dataset(dataset_name)
        storage = dataset.get_writable_storage()
        metrics_tags = {"group": consumer_group, "dataset": dataset_name}

    stream_loader = storage.get_table_writer().get_stream_loader()
    default_replacement_topic_spec = stream_loader.get_replacement_topic_spec()
    assert (default_replacement_topic_spec is not None
            ), f"Storage {type(storage)} does not have a replacement topic."
    replacements_topic = replacements_topic or default_replacement_topic_spec.topic_name

    metrics = MetricsWrapper(
        environment.metrics,
        "replacer",
        tags=metrics_tags,
    )

    client_settings = {
        # Replacing existing rows requires reconstructing the entire tuple for each
        # event (via a SELECT), which is a Hard Thing (TM) for columnstores to do. With
        # the default settings it's common for ClickHouse to go over the default max_memory_usage
        # of 10GB per query. Lowering the max_block_size reduces memory usage, and increasing the
        # max_memory_usage gives the query more breathing room.
        "max_block_size": settings.REPLACER_MAX_BLOCK_SIZE,
        "max_memory_usage": settings.REPLACER_MAX_MEMORY_USAGE,
        # Don't use up production cache for the count() queries.
        "use_uncompressed_cache": 0,
    }

    clickhouse = ClickhousePool(
        settings.CLICKHOUSE_HOST,
        settings.CLICKHOUSE_PORT,
        client_settings=client_settings,
    )

    codec: PassthroughCodec[KafkaPayload] = PassthroughCodec()
    replacer = BatchingConsumer(
        KafkaConsumer(
            build_kafka_consumer_configuration(
                bootstrap_servers=bootstrap_server,
                group_id=consumer_group,
                auto_offset_reset=auto_offset_reset,
                queued_max_messages_kbytes=queued_max_messages_kbytes,
                queued_min_messages=queued_min_messages,
            ),
            codec=codec,
        ),
        Topic(replacements_topic),
        worker=ReplacerWorker(clickhouse, storage, metrics=metrics),
        max_batch_size=max_batch_size,
        max_batch_time=max_batch_time_ms,
        metrics=metrics,
        recoverable_errors=[TransportError],
    )

    def handler(signum, frame) -> None:
        replacer.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    replacer.run()
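The client_settings block above (smaller blocks, a larger memory ceiling, no uncompressed cache) maps onto ordinary per-query ClickHouse settings. A minimal sketch of the same idea using the clickhouse-driver client directly; the host and values are illustrative, not the production configuration:

from clickhouse_driver import Client

client = Client(
    host="localhost",  # assumed host
    settings={
        # Smaller blocks lower peak memory while rows are reconstructed.
        "max_block_size": 4096,
        # Give heavy replacement queries more headroom than the 10GB default.
        "max_memory_usage": 20_000_000_000,
        # Keep the count() probes out of the uncompressed cache.
        "use_uncompressed_cache": 0,
    },
)
print(client.execute("SELECT 1"))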
Example #11
def subscriptions(
    *,
    dataset_name: str,
    topic: Optional[str],
    partitions: Optional[int],
    commit_log_topic: Optional[str],
    commit_log_groups: Sequence[str],
    consumer_group: str,
    auto_offset_reset: str,
    bootstrap_servers: Sequence[str],
    max_batch_size: int,
    max_batch_time_ms: int,
    schedule_ttl: int,
    result_topic: Optional[str],
    log_level: Optional[str],
) -> None:
    """Evaluates subscribed queries for a dataset."""

    assert result_topic is not None

    setup_logging(log_level)
    setup_sentry()

    dataset = get_dataset(dataset_name)

    if not bootstrap_servers:
        bootstrap_servers = settings.DEFAULT_DATASET_BROKERS.get(
            dataset_name, settings.DEFAULT_BROKERS
        )

    loader = enforce_table_writer(dataset).get_stream_loader()

    consumer = TickConsumer(
        SynchronizedConsumer(
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    bootstrap_servers,
                    consumer_group,
                    auto_offset_reset=auto_offset_reset,
                ),
                PassthroughCodec(),
            ),
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    bootstrap_servers,
                    f"subscriptions-commit-log-{uuid.uuid1().hex}",
                    auto_offset_reset="earliest",
                ),
                CommitCodec(),
            ),
            (
                Topic(commit_log_topic)
                if commit_log_topic is not None
                else Topic(loader.get_commit_log_topic_spec().topic_name)
            ),
            set(commit_log_groups),
        )
    )

    producer = KafkaProducer(
        {
            "bootstrap.servers": ",".join(bootstrap_servers),
            "partitioner": "consistent",
            "message.max.bytes": 50000000,  # 50MB, default is 1MB
        },
        SubscriptionResultCodec(),
    )

    with closing(consumer), closing(producer):
        batching_consumer = BatchingConsumer(
            consumer,
            (
                Topic(topic)
                if topic is not None
                else Topic(loader.get_default_topic_spec().topic_name)
            ),
            SubscriptionWorker(
                SubscriptionExecutor(
                    dataset,
                    ThreadPoolExecutor(
                        max_workers=settings.SUBSCRIPTIONS_MAX_CONCURRENT_QUERIES
                    ),
                ),
                {
                    index: SubscriptionScheduler(
                        RedisSubscriptionDataStore(
                            redis_client, dataset, PartitionId(index)
                        ),
                        PartitionId(index),
                        cache_ttl=timedelta(seconds=schedule_ttl),
                    )
                    for index in range(
                        partitions
                        if partitions is not None
                        else loader.get_default_topic_spec().partitions_number
                    )
                },
                producer,
                Topic(result_topic),
            ),
            max_batch_size,
            max_batch_time_ms,
            create_metrics(
                "snuba.subscriptions",
                tags={"group": consumer_group, "dataset": dataset_name},
            ),
        )

        def handler(signum, frame) -> None:
            batching_consumer.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        batching_consumer.run()
Example #12
def subscriptions(
    *,
    dataset_name: str,
    topic: Optional[str],
    partitions: Optional[int],
    commit_log_topic: Optional[str],
    commit_log_groups: Sequence[str],
    consumer_group: str,
    auto_offset_reset: str,
    bootstrap_servers: Sequence[str],
    max_batch_size: int,
    max_batch_time_ms: int,
    max_query_workers: Optional[int],
    schedule_ttl: int,
    result_topic: Optional[str],
    log_level: Optional[str],
    delay_seconds: Optional[int],
) -> None:
    """Evaluates subscribed queries for a dataset."""

    setup_logging(log_level)
    setup_sentry()

    dataset = get_dataset(dataset_name)

    storage = dataset.get_default_entity().get_writable_storage()
    assert (
        storage is not None
    ), f"Dataset {dataset_name} does not have a writable storage by default."

    loader = enforce_table_writer(dataset).get_stream_loader()
    commit_log_topic_spec = loader.get_commit_log_topic_spec()
    assert commit_log_topic_spec is not None

    result_topic_spec = loader.get_subscription_result_topic_spec()
    assert result_topic_spec is not None

    metrics = MetricsWrapper(
        environment.metrics,
        "subscriptions",
        tags={
            "group": consumer_group,
            "dataset": dataset_name
        },
    )

    consumer = TickConsumer(
        SynchronizedConsumer(
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    loader.get_default_topic_spec().topic,
                    consumer_group,
                    auto_offset_reset=auto_offset_reset,
                    bootstrap_servers=bootstrap_servers,
                ), ),
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    commit_log_topic_spec.topic,
                    f"subscriptions-commit-log-{uuid.uuid1().hex}",
                    auto_offset_reset="earliest",
                    bootstrap_servers=bootstrap_servers,
                ), ),
            (Topic(commit_log_topic) if commit_log_topic is not None else
             Topic(commit_log_topic_spec.topic_name)),
            set(commit_log_groups),
        ),
        time_shift=(timedelta(seconds=delay_seconds *
                              -1) if delay_seconds is not None else None),
    )

    producer = ProducerEncodingWrapper(
        KafkaProducer(
            build_kafka_producer_configuration(
                loader.get_default_topic_spec().topic,
                bootstrap_servers=bootstrap_servers,
                override_params={
                    "partitioner": "consistent",
                    "message.max.bytes": 50000000,  # 50MB, default is 1MB
                },
            )),
        SubscriptionTaskResultEncoder(),
    )

    executor = ThreadPoolExecutor(max_workers=max_query_workers)
    logger.debug("Starting %r with %s workers...", executor,
                 getattr(executor, "_max_workers", 0))
    metrics.gauge("executor.workers", getattr(executor, "_max_workers", 0))

    with closing(consumer), executor, closing(producer):
        from arroyo import configure_metrics

        configure_metrics(StreamMetricsAdapter(metrics))
        batching_consumer = StreamProcessor(
            consumer,
            (Topic(topic) if topic is not None else Topic(
                loader.get_default_topic_spec().topic_name)),
            BatchProcessingStrategyFactory(
                SubscriptionWorker(
                    dataset,
                    executor,
                    {
                        index: SubscriptionScheduler(
                            RedisSubscriptionDataStore(redis_client, dataset,
                                                       PartitionId(index)),
                            PartitionId(index),
                            cache_ttl=timedelta(seconds=schedule_ttl),
                            metrics=metrics,
                        )
                        for index in
                        range(partitions if partitions is not None else loader.
                              get_default_topic_spec().partitions_number)
                    },
                    producer,
                    Topic(result_topic) if result_topic is not None else Topic(
                        result_topic_spec.topic_name),
                    metrics,
                ),
                max_batch_size,
                max_batch_time_ms,
            ),
        )

        def handler(signum: int, frame: Optional[Any]) -> None:
            batching_consumer.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        batching_consumer.run()
Example #13
def confirm_load(
    *,
    control_topic: Optional[str],
    bootstrap_server: Sequence[str],
    storage_name: str,
    source: str,
    log_level: Optional[str] = None,
) -> None:
    """
    Confirms the snapshot has been loaded by sending the
    snapshot-loaded message on the control topic.
    """

    setup_logging(log_level)
    setup_sentry()

    logger = logging.getLogger("snuba.loaded-snapshot")
    logger.info(
        "Sending load completion message for storage %s, from source %s",
        storage_name,
        source,
    )

    storage_key = StorageKey(storage_name)
    storage = get_cdc_storage(storage_key)

    stream_loader = storage.get_table_writer().get_stream_loader()

    control_topic = control_topic or storage.get_default_control_topic()

    snapshot_source = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT, path=source,
    )

    descriptor = snapshot_source.get_descriptor()

    producer = Producer(
        build_kafka_producer_configuration(
            stream_loader.get_default_topic_spec().topic,
            bootstrap_servers=bootstrap_server,
            override_params={
                "partitioner": "consistent",
                "message.max.bytes": 50000000,  # 50MB, default is 1MB
            },
        )
    )

    msg = SnapshotLoaded(
        id=descriptor.id,
        transaction_info=TransactionData(
            xmin=descriptor.xmin, xmax=descriptor.xmax, xip_list=descriptor.xip_list,
        ),
    )
    json_string = json.dumps(msg.to_dict())

    def delivery_callback(error: KafkaError, message: Message) -> None:
        if error is not None:
            raise error
        else:
            logger.info("Message sent %r", message.value())

    producer.produce(
        control_topic, value=json_string, on_delivery=delivery_callback,
    )

    producer.flush()
Example #14
def bulk_load(
    *,
    storage_name: str,
    dest_table: Optional[str],
    source: str,
    ignore_existing_data: bool,
    pre_processed: bool,
    show_progress: bool,
    log_level: Optional[str] = None,
) -> None:
    setup_logging(log_level)
    setup_sentry()

    logger = logging.getLogger("snuba.load-snapshot")
    logger.info(
        "Start bulk load process for storage %s, from source %s", storage_name, source
    )

    storage = get_cdc_storage(StorageKey(storage_name))
    table_writer = storage.get_table_writer()

    # TODO: Have a more abstract way to load sources if/when we support more than one.
    snapshot_source = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT, path=source,
    )

    loader = table_writer.get_bulk_loader(
        snapshot_source,
        storage.get_postgres_table(),
        storage.get_row_processor(),
        dest_table,
    )
    # TODO: see whether we need to pass options to the writer

    def progress_callback(bar: progressbar.ProgressBar, progress: int) -> None:
        bar.update(progress)

    if show_progress:
        progress = progressbar.ProgressBar(
            max_value=snapshot_source.get_table_file_size(storage.get_postgres_table())
        )
        progress_func: Optional[ProgressCallback] = partial(progress_callback, progress)
    else:
        progress_func = None

    table_descriptor = snapshot_source.get_descriptor().get_table(
        storage.get_postgres_table()
    )
    if pre_processed:
        writer = table_writer.get_bulk_writer(
            metrics=environment.metrics,
            encoding="gzip" if table_descriptor.zip else None,
            column_names=[c.name for c in table_descriptor.columns or []],
            table_name=dest_table,
        )
        loader.load_preprocessed(
            writer, ignore_existing_data, progress_callback=progress_func
        )
    else:
        buffer_writer = BufferedWriterWrapper(
            table_writer.get_batch_writer(
                environment.metrics,
                table_name=dest_table,
                chunk_size=settings.BULK_CLICKHOUSE_BUFFER,
            ),
            settings.BULK_CLICKHOUSE_BUFFER,
            JSONRowEncoder(),
        )
        loader.load(
            buffer_writer, ignore_existing_data, progress_callback=progress_func
        )
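The show_progress branch above binds a progress bar to the loader via functools.partial. A standalone sketch of that callback pattern, assuming the progressbar2 package; the byte counts are made up for illustration:

from functools import partial
from typing import Callable

import progressbar


def progress_callback(bar: progressbar.ProgressBar, progress: int) -> None:
    bar.update(progress)


total_bytes = 1_000_000  # stand-in for the snapshot table file size
bar = progressbar.ProgressBar(max_value=total_bytes)

# Bind the bar so the loader only has to call progress_func(bytes_consumed).
progress_func: Callable[[int], None] = partial(progress_callback, bar)

for consumed in range(0, total_bytes + 1, 100_000):
    progress_func(consumed)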
Example #15
def subscriptions_executor(
    *,
    dataset_name: str,
    entity_names: Sequence[str],
    consumer_group: str,
    max_concurrent_queries: int,
    total_concurrent_queries: int,
    auto_offset_reset: str,
    no_strict_offset_reset: bool,
    log_level: Optional[str],
    stale_threshold_seconds: Optional[int],
    cooperative_rebalancing: bool,
) -> None:
    """
    The subscriptions executor consumes scheduled subscriptions from the scheduled
    subscription topic for that entity, executes the queries on ClickHouse and publishes
    results on the results topic.
    """
    setup_logging(log_level)
    setup_sentry()

    metrics = MetricsWrapper(
        environment.metrics,
        "subscriptions.executor",
        tags={"dataset": dataset_name},
    )

    configure_metrics(StreamMetricsAdapter(metrics))

    # Just get the result topic configuration from the first entity. Later we
    # check they all have the same result topic anyway before building the consumer.
    entity_key = EntityKey(entity_names[0])

    storage = get_entity(entity_key).get_writable_storage()
    assert storage is not None
    stream_loader = storage.get_table_writer().get_stream_loader()
    result_topic_spec = stream_loader.get_subscription_result_topic_spec()
    assert result_topic_spec is not None

    producer = KafkaProducer(
        build_kafka_producer_configuration(
            result_topic_spec.topic,
            override_params={"partitioner": "consistent"},
        ))

    # TODO: Consider removing and always passing via CLI.
    # If a value is provided via config, it overrides the one provided via CLI.
    # This is so we can quickly change this in an emergency.
    stale_threshold_seconds = state.get_config(
        f"subscriptions_stale_threshold_sec_{dataset_name}",
        stale_threshold_seconds)

    processor = build_executor_consumer(
        dataset_name,
        entity_names,
        consumer_group,
        producer,
        max_concurrent_queries,
        total_concurrent_queries,
        auto_offset_reset,
        not no_strict_offset_reset,
        metrics,
        stale_threshold_seconds,
        cooperative_rebalancing,
    )

    def handler(signum: int, frame: Any) -> None:
        # TODO: Temporary code for debugging executor shutdown
        logger = logging.getLogger()
        logger.setLevel(logging.DEBUG)

        processor.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    with closing(producer), flush_querylog():
        processor.run()
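The state.get_config call above treats the CLI flag as a fallback: a runtime configuration value, when set, overrides the command-line option, which is what makes the emergency override possible without a redeploy. A minimal sketch of that lookup order with a plain dictionary standing in for the runtime config store (the store and key name are assumptions):

from typing import Optional

# Stand-in for a runtime configuration store; in production this would be
# backed by something operators can change on the fly.
RUNTIME_CONFIG: dict = {}


def get_config(key: str, default: Optional[int] = None) -> Optional[int]:
    # The runtime value, if present, wins over the caller-supplied default.
    return RUNTIME_CONFIG.get(key, default)


cli_value = 300  # value passed on the command line
assert get_config("subscriptions_stale_threshold_sec_events", cli_value) == 300

RUNTIME_CONFIG["subscriptions_stale_threshold_sec_events"] = 60
assert get_config("subscriptions_stale_threshold_sec_events", cli_value) == 60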
Example #16
def consumer(
    *,
    raw_events_topic: Optional[str],
    replacements_topic: Optional[str],
    commit_log_topic: Optional[str],
    control_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    stateful_consumer: bool,
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
    log_level: Optional[str] = None,
    profile_path: Optional[str] = None,
) -> None:

    setup_logging(log_level)
    setup_sentry()

    storage_key = StorageKey(storage_name)

    consumer_builder = ConsumerBuilder(
        storage_key=storage_key,
        raw_topic=raw_events_topic,
        replacements_topic=replacements_topic,
        max_batch_size=max_batch_size,
        max_batch_time_ms=max_batch_time_ms,
        bootstrap_servers=bootstrap_server,
        group_id=consumer_group,
        commit_log_topic=commit_log_topic,
        auto_offset_reset=auto_offset_reset,
        queued_max_messages_kbytes=queued_max_messages_kbytes,
        queued_min_messages=queued_min_messages,
        processes=processes,
        input_block_size=input_block_size,
        output_block_size=output_block_size,
        profile_path=profile_path,
    )

    if stateful_consumer:
        storage = get_cdc_storage(storage_key)
        assert (storage is not None
                ), "Only CDC storages have a control topic thus are supported."
        context = ConsumerStateMachine(
            consumer_builder=consumer_builder,
            topic=control_topic or storage.get_default_control_topic(),
            group_id=consumer_group,
            storage=storage,
        )

        def handler(signum: int, frame: Any) -> None:
            context.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        context.run()
    else:
        consumer = consumer_builder.build_base_consumer()

        def handler(signum: int, frame: Any) -> None:
            consumer.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        consumer.run()
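The shutdown idiom used throughout these commands registers SIGINT/SIGTERM handlers that only flag the consumer to stop, letting run() drain and return on its own. A self-contained sketch of that pattern with a toy worker standing in for the consumer:

import signal
import time
from typing import Any, Optional


class Worker:
    # Toy stand-in for a consumer exposing run() and signal_shutdown().
    def __init__(self) -> None:
        self._shutdown = False

    def signal_shutdown(self) -> None:
        self._shutdown = True

    def run(self) -> None:
        while not self._shutdown:
            time.sleep(0.1)  # a real consumer would poll and process here


worker = Worker()


def handler(signum: int, frame: Optional[Any]) -> None:
    # Only flag shutdown; the run loop exits cleanly on its next iteration.
    worker.signal_shutdown()


signal.signal(signal.SIGINT, handler)
signal.signal(signal.SIGTERM, handler)
worker.run()  # Ctrl-C or a TERM signal ends the loop gracefully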
Example #17
def test_consumer(
    *,
    commit_log_topic: Optional[str],
    consumer_group: str,
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    no_strict_offset_reset: bool,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
    avg_latency_ms: int,
    latency_std_deviation_ms: int,
    parallel_collect: bool,
    log_level: Optional[str] = None,
    profile_path: Optional[str] = None,
) -> None:

    setup_logging(log_level)
    setup_sentry()

    storage_key = StorageKey(storage_name)

    metrics = MetricsWrapper(
        environment.metrics,
        "test_consumer",
        tags={
            "group": consumer_group,
            "storage": storage_key.value
        },
    )
    configure_metrics(StreamMetricsAdapter(metrics))

    consumer_builder = ConsumerBuilder(
        storage_key=storage_key,
        kafka_params=KafkaParameters(
            raw_topic=None,
            replacements_topic=None,
            bootstrap_servers=None,
            group_id=consumer_group,
            commit_log_topic=commit_log_topic,
            auto_offset_reset=auto_offset_reset,
            strict_offset_reset=not no_strict_offset_reset,
            queued_max_messages_kbytes=queued_max_messages_kbytes,
            queued_min_messages=queued_min_messages,
        ),
        processing_params=ProcessingParameters(
            processes=processes,
            input_block_size=input_block_size,
            output_block_size=output_block_size,
        ),
        max_batch_size=max_batch_size,
        max_batch_time_ms=max_batch_time_ms,
        metrics=metrics,
        parallel_collect=parallel_collect,
        profile_path=profile_path,
        mock_parameters=MockParameters(
            avg_write_latency=avg_latency_ms,
            std_deviation=latency_std_deviation_ms,
        ),
    )

    consumer = consumer_builder.build_base_consumer()

    def handler(signum: int, frame: Any) -> None:
        consumer.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    consumer.run()
Example #18
def consumer(
    *,
    raw_events_topic: Optional[str],
    replacements_topic: Optional[str],
    commit_log_topic: Optional[str],
    control_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    dataset_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    stateful_consumer: bool,
    rapidjson_deserialize: bool,
    rapidjson_serialize: bool,
    log_level: Optional[str] = None,
) -> None:

    setup_logging(log_level)
    setup_sentry()

    dataset = get_dataset(dataset_name)

    consumer_builder = ConsumerBuilder(
        dataset_name=dataset_name,
        raw_topic=raw_events_topic,
        replacements_topic=replacements_topic,
        max_batch_size=max_batch_size,
        max_batch_time_ms=max_batch_time_ms,
        bootstrap_servers=bootstrap_server,
        group_id=consumer_group,
        commit_log_topic=commit_log_topic,
        auto_offset_reset=auto_offset_reset,
        queued_max_messages_kbytes=queued_max_messages_kbytes,
        queued_min_messages=queued_min_messages,
        rapidjson_deserialize=rapidjson_deserialize,
        rapidjson_serialize=rapidjson_serialize,
    )

    if stateful_consumer:
        assert isinstance(
            dataset, CdcDataset
        ), "Only CDC datasets have a control topic thus are supported."
        context = ConsumerStateMachine(
            consumer_builder=consumer_builder,
            topic=control_topic or dataset.get_default_control_topic(),
            group_id=consumer_group,
            dataset=dataset,
        )

        def handler(signum, frame) -> None:
            context.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        context.run()
    else:
        consumer = consumer_builder.build_base_consumer()

        def handler(signum, frame) -> None:
            consumer.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        consumer.run()
Example #19
def replacer(
    *,
    replacements_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    log_level: Optional[str] = None,
) -> None:

    from snuba.replacer import ReplacerWorker
    from snuba.utils.streams import Topic
    from snuba.utils.streams.backends.kafka import (
        KafkaConsumer,
        TransportError,
        build_kafka_consumer_configuration,
    )
    from snuba.utils.streams.processing import StreamProcessor
    from snuba.utils.streams.processing.strategies.batching import (
        BatchProcessingStrategyFactory, )

    setup_logging(log_level)
    setup_sentry()

    storage_key = StorageKey(storage_name)
    storage = get_writable_storage(storage_key)
    metrics_tags = {"group": consumer_group, "storage": storage_name}

    stream_loader = storage.get_table_writer().get_stream_loader()
    default_replacement_topic_spec = stream_loader.get_replacement_topic_spec()
    assert (
        default_replacement_topic_spec is not None
    ), f"Storage {storage.get_storage_key().value} does not have a replacement topic."
    replacements_topic = replacements_topic or default_replacement_topic_spec.topic_name

    metrics = MetricsWrapper(
        environment.metrics,
        "replacer",
        tags=metrics_tags,
    )

    replacer = StreamProcessor(
        KafkaConsumer(
            build_kafka_consumer_configuration(
                bootstrap_servers=bootstrap_server,
                group_id=consumer_group,
                auto_offset_reset=auto_offset_reset,
                queued_max_messages_kbytes=queued_max_messages_kbytes,
                queued_min_messages=queued_min_messages,
            ), ),
        Topic(replacements_topic),
        BatchProcessingStrategyFactory(
            worker=ReplacerWorker(storage, metrics=metrics),
            max_batch_size=max_batch_size,
            max_batch_time=max_batch_time_ms,
            metrics=metrics,
        ),
        metrics=metrics,
        recoverable_errors=[TransportError],
    )

    def handler(signum: int, frame: Any) -> None:
        replacer.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    replacer.run()
Example #20
File: wsgi.py Project: getsentry/snuba
from snuba.environment import setup_logging, setup_sentry

setup_logging()
setup_sentry()

from snuba.admin.views import application  # noqa
Example #21
def subscriptions_scheduler_executor(
    *,
    dataset_name: str,
    entity_names: Sequence[str],
    consumer_group: str,
    followed_consumer_group: str,
    max_concurrent_queries: int,
    total_concurrent_queries: int,
    auto_offset_reset: str,
    no_strict_offset_reset: bool,
    schedule_ttl: int,
    delay_seconds: Optional[int],
    stale_threshold_seconds: Optional[int],
    log_level: Optional[str],
    # TODO: Temporarily overrides the scheduling mode.
    # Required for single tenant since some partitions may be empty.
    # To be removed once transactions is no longer semantically partitioned.
    scheduling_mode: Optional[str],
) -> None:
    """
    Combined subscriptions scheduler and executor. Alternative to the separate scheduler and executor processes.
    """
    setup_logging(log_level)
    setup_sentry()

    metrics = MetricsWrapper(
        environment.metrics,
        "subscriptions.scheduler_executor",
        tags={"dataset": dataset_name},
    )

    configure_metrics(StreamMetricsAdapter(metrics))

    # Just get the result topic configuration from the first entity. Later we
    # check they all have the same result topic anyway before building the consumer.
    entity_key = EntityKey(entity_names[0])

    storage = get_entity(entity_key).get_writable_storage()
    assert storage is not None
    stream_loader = storage.get_table_writer().get_stream_loader()
    result_topic_spec = stream_loader.get_subscription_scheduled_topic_spec()
    assert result_topic_spec is not None

    producer = KafkaProducer(
        build_kafka_producer_configuration(
            result_topic_spec.topic,
            override_params={"partitioner": "consistent"},
        )
    )

    processor = build_scheduler_executor_consumer(
        dataset_name,
        entity_names,
        consumer_group,
        followed_consumer_group,
        producer,
        auto_offset_reset,
        not no_strict_offset_reset,
        schedule_ttl,
        delay_seconds,
        stale_threshold_seconds,
        max_concurrent_queries,
        total_concurrent_queries,
        metrics,
        SchedulingWatermarkMode(scheduling_mode)
        if scheduling_mode is not None
        else None,
    )

    def handler(signum: int, frame: Any) -> None:
        processor.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    with closing(producer), flush_querylog():
        processor.run()
Example #22
def multistorage_consumer(
    storage_names: Sequence[str],
    consumer_group: str,
    commit_log_topic: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    no_strict_offset_reset: bool,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    parallel_collect: bool,
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
    log_level: Optional[str] = None,
    dead_letter_topic: Optional[str] = None,
    cooperative_rebalancing: bool = False,
) -> None:

    DEFAULT_BLOCK_SIZE = int(32 * 1e6)

    if processes is not None:
        if input_block_size is None:
            input_block_size = DEFAULT_BLOCK_SIZE

        if output_block_size is None:
            output_block_size = DEFAULT_BLOCK_SIZE

    setup_logging(log_level)
    setup_sentry()

    logger.info("Consumer Starting")
    storages = {
        key: get_writable_storage(key)
        for key in (getattr(StorageKey, name.upper())
                    for name in storage_names)
    }

    topics = {
        storage.get_table_writer().get_stream_loader().get_default_topic_spec(
        ).topic_name
        for storage in storages.values()
    }

    # XXX: The ``StreamProcessor`` only supports a single topic at this time,
    # but is easily modified. The topic routing in the processing strategy is a
    # bit trickier (but also shouldn't be too bad.)
    topic = Topic(topics.pop())
    if topics:
        raise ValueError("only one topic is supported")

    commit_log: Optional[Topic]
    if commit_log_topic:
        commit_log = Topic(commit_log_topic)
    else:
        # XXX: The ``CommitLogConsumer`` also only supports a single topic at this
        # time. (It is less easily modified.) This also assumes the commit log
        # topic is on the same Kafka cluster as the input topic.
        commit_log_topics = {
            spec.topic_name
            for spec in (storage.get_table_writer().get_stream_loader(
            ).get_commit_log_topic_spec() for storage in storages.values())
            if spec is not None
        }

        if commit_log_topics:
            commit_log = Topic(commit_log_topics.pop())
        else:
            commit_log = None

        if commit_log_topics:
            raise ValueError("only one commit log topic is supported")

    # XXX: This requires that all storages are associated with the same Kafka
    # cluster so that they can be consumed by the same consumer instance.
    # Unfortunately, we don't have the concept of independently configurable
    # Kafka clusters in settings, only consumer configurations that are
    # associated with storages and/or global default configurations. To avoid
    # implementing yet another method of configuring Kafka clusters, this just
    # piggybacks on the existing configuration method(s), with the assumption
    # that most deployments are going to be using the default configuration.
    storage_keys = [*storages.keys()]

    kafka_topic = (storages[storage_keys[0]].get_table_writer().
                   get_stream_loader().get_default_topic_spec().topic)

    consumer_configuration = build_kafka_consumer_configuration(
        kafka_topic,
        consumer_group,
        auto_offset_reset=auto_offset_reset,
        strict_offset_reset=not no_strict_offset_reset,
        queued_max_messages_kbytes=queued_max_messages_kbytes,
        queued_min_messages=queued_min_messages,
    )

    if cooperative_rebalancing is True:
        consumer_configuration[
            "partition.assignment.strategy"] = "cooperative-sticky"

    for storage_key in storage_keys[1:]:
        if (build_kafka_consumer_configuration(
                storages[storage_key].get_table_writer().get_stream_loader().
                get_default_topic_spec().topic,
                consumer_group,
        )["bootstrap.servers"] != consumer_configuration["bootstrap.servers"]):
            raise ValueError(
                "storages cannot be located on different Kafka clusters")

    metrics = MetricsWrapper(
        environment.metrics,
        "consumer",
        tags={
            "group": consumer_group,
            "storage": "_".join([storage_keys[0].value, "m"]),
        },
    )
    # Collect metrics from librdkafka if we have stats_collection_freq_ms set
    # for the consumer group, or use the default.
    stats_collection_frequency_ms = get_config(
        f"stats_collection_freq_ms_{consumer_group}",
        get_config("stats_collection_freq_ms", 0),
    )

    if stats_collection_frequency_ms and stats_collection_frequency_ms > 0:

        def stats_callback(stats_json: str) -> None:
            stats = rapidjson.loads(stats_json)
            metrics.gauge("librdkafka.total_queue_size",
                          stats.get("replyq", 0))

        consumer_configuration.update({
            "statistics.interval.ms": stats_collection_frequency_ms,
            "stats_cb": stats_callback,
        })
    if commit_log is None:
        consumer = KafkaConsumer(consumer_configuration)
    else:
        # XXX: This relies on the assumptions that a.) all storages are
        # located on the same Kafka cluster (validated above.)

        commit_log_topic_spec = (storages[storage_keys[0]].get_table_writer(
        ).get_stream_loader().get_commit_log_topic_spec())
        assert commit_log_topic_spec is not None

        producer = ConfluentKafkaProducer(
            build_kafka_producer_configuration(commit_log_topic_spec.topic))
        consumer = KafkaConsumerWithCommitLog(
            consumer_configuration,
            producer=producer,
            commit_log_topic=commit_log,
        )

    dead_letter_producer: Optional[KafkaProducer] = None
    dead_letter_queue: Optional[Topic] = None
    if dead_letter_topic:
        dead_letter_queue = Topic(dead_letter_topic)

        dead_letter_producer = KafkaProducer(
            build_kafka_producer_configuration(
                StreamsTopic(dead_letter_topic)))

    configure_metrics(StreamMetricsAdapter(metrics))
    processor = StreamProcessor(
        consumer,
        topic,
        MultistorageConsumerProcessingStrategyFactory(
            [*storages.values()],
            max_batch_size,
            max_batch_time_ms / 1000.0,
            parallel_collect=parallel_collect,
            processes=processes,
            input_block_size=input_block_size,
            output_block_size=output_block_size,
            metrics=metrics,
            producer=dead_letter_producer,
            topic=dead_letter_queue,
        ),
    )

    def handler(signum: int, frame: Any) -> None:
        processor.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)
    if dead_letter_producer:
        with closing(dead_letter_producer):
            processor.run()
    else:
        processor.run()
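The stats_callback wiring above relies on librdkafka's statistics emission: setting statistics.interval.ms makes the client call stats_cb with a JSON document at that interval. A minimal sketch with confluent-kafka-python, using the standard-library json module in place of rapidjson; broker, group and topic names are placeholders:

import json

from confluent_kafka import Consumer


def stats_callback(stats_json: str) -> None:
    # librdkafka hands over a JSON document describing its internal state.
    stats = json.loads(stats_json)
    print("librdkafka.total_queue_size", stats.get("replyq", 0))


consumer = Consumer(
    {
        "bootstrap.servers": "localhost:9092",  # placeholder broker
        "group.id": "stats-demo",               # placeholder group
        "statistics.interval.ms": 10000,
        "stats_cb": stats_callback,
    }
)
consumer.subscribe(["events"])  # placeholder topic

try:
    for _ in range(100):
        consumer.poll(1.0)  # stats_cb fires from inside poll()
finally:
    consumer.close()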
Example #23
def consumer(
    *,
    raw_events_topic: Optional[str],
    replacements_topic: Optional[str],
    commit_log_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    no_strict_offset_reset: bool,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    parallel_collect: bool,
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
    log_level: Optional[str] = None,
    profile_path: Optional[str] = None,
    cooperative_rebalancing: bool = False,
) -> None:

    setup_logging(log_level)
    setup_sentry()
    logger.info("Consumer Starting")
    storage_key = StorageKey(storage_name)

    metrics = MetricsWrapper(
        environment.metrics,
        "consumer",
        tags={"group": consumer_group, "storage": storage_key.value},
    )
    configure_metrics(StreamMetricsAdapter(metrics))

    def stats_callback(stats_json: str) -> None:
        stats = rapidjson.loads(stats_json)
        metrics.gauge("librdkafka.total_queue_size", stats.get("replyq", 0))

    consumer_builder = ConsumerBuilder(
        storage_key=storage_key,
        kafka_params=KafkaParameters(
            raw_topic=raw_events_topic,
            replacements_topic=replacements_topic,
            bootstrap_servers=bootstrap_server,
            group_id=consumer_group,
            commit_log_topic=commit_log_topic,
            auto_offset_reset=auto_offset_reset,
            strict_offset_reset=not no_strict_offset_reset,
            queued_max_messages_kbytes=queued_max_messages_kbytes,
            queued_min_messages=queued_min_messages,
        ),
        processing_params=ProcessingParameters(
            processes=processes,
            input_block_size=input_block_size,
            output_block_size=output_block_size,
        ),
        max_batch_size=max_batch_size,
        max_batch_time_ms=max_batch_time_ms,
        metrics=metrics,
        profile_path=profile_path,
        stats_callback=stats_callback,
        parallel_collect=parallel_collect,
        cooperative_rebalancing=cooperative_rebalancing,
    )

    consumer = consumer_builder.build_base_consumer()

    def handler(signum: int, frame: Any) -> None:
        consumer.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    consumer.run()
Example #24
def consumer(
    *,
    raw_events_topic: Optional[str],
    replacements_topic: Optional[str],
    commit_log_topic: Optional[str],
    control_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    dataset_name: Optional[str],
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    stateful_consumer: bool,
    rapidjson_deserialize: bool,
    rapidjson_serialize: bool,
    log_level: Optional[str] = None,
) -> None:

    if not bootstrap_server:
        if dataset_name:
            bootstrap_server = settings.DEFAULT_DATASET_BROKERS.get(
                dataset_name,
                settings.DEFAULT_BROKERS,
            )
        else:
            bootstrap_server = settings.DEFAULT_STORAGE_BROKERS.get(
                storage_name,
                settings.DEFAULT_BROKERS,
            )

    setup_logging(log_level)
    setup_sentry()

    # TODO: Remove this once dataset_name is no longer being passed
    if dataset_name:
        dataset_writable_storage = get_dataset(
            dataset_name).get_writable_storage()
        if not dataset_writable_storage:
            raise click.ClickException(
                f"Dataset {dataset_name} has no writable storage")

        storage_name = {v: k
                        for k, v in WRITABLE_STORAGES.items()
                        }[dataset_writable_storage]

    consumer_builder = ConsumerBuilder(
        storage_name=storage_name,
        raw_topic=raw_events_topic,
        replacements_topic=replacements_topic,
        max_batch_size=max_batch_size,
        max_batch_time_ms=max_batch_time_ms,
        bootstrap_servers=bootstrap_server,
        group_id=consumer_group,
        commit_log_topic=commit_log_topic,
        auto_offset_reset=auto_offset_reset,
        queued_max_messages_kbytes=queued_max_messages_kbytes,
        queued_min_messages=queued_min_messages,
        rapidjson_deserialize=rapidjson_deserialize,
        rapidjson_serialize=rapidjson_serialize,
    )

    if stateful_consumer:
        storage = get_cdc_storage(storage_name)
        assert storage is not None, "Only CDC storages have a control topic thus are supported."
        context = ConsumerStateMachine(
            consumer_builder=consumer_builder,
            topic=control_topic or storage.get_default_control_topic(),
            group_id=consumer_group,
            storage=storage,
        )

        def handler(signum, frame) -> None:
            context.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        context.run()
    else:
        consumer = consumer_builder.build_base_consumer()

        def handler(signum, frame) -> None:
            consumer.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        consumer.run()
Example #25
def multistorage_consumer(
    storage_names: Sequence[str],
    consumer_group: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
    log_level: Optional[str] = None,
) -> None:

    DEFAULT_BLOCK_SIZE = int(32 * 1e6)

    if processes is not None:
        if input_block_size is None:
            input_block_size = DEFAULT_BLOCK_SIZE

        if output_block_size is None:
            output_block_size = DEFAULT_BLOCK_SIZE

    setup_logging(log_level)
    setup_sentry()

    storages = {
        key: get_writable_storage(key)
        for key in (getattr(StorageKey, name.upper())
                    for name in storage_names)
    }

    topics = {
        storage.get_table_writer().get_stream_loader().get_default_topic_spec(
        ).topic_name
        for storage in storages.values()
    }

    # XXX: The ``StreamProcessor`` only supports a single topic at this time,
    # but is easily modified. The topic routing in the processing strategy is a
    # bit trickier (but also shouldn't be too bad.)
    topic = Topic(topics.pop())
    if topics:
        raise ValueError("only one topic is supported")

    # XXX: The ``CommitLogConsumer`` also only supports a single topic at this
    # time. (It is less easily modified.) This also assumes the commit log
    # topic is on the same Kafka cluster as the input topic.
    commit_log_topics = {
        spec.topic_name
        for spec in (storage.get_table_writer().get_stream_loader(
        ).get_commit_log_topic_spec() for storage in storages.values())
        if spec is not None
    }

    commit_log_topic: Optional[Topic]
    if commit_log_topics:
        commit_log_topic = Topic(commit_log_topics.pop())
    else:
        commit_log_topic = None

    if commit_log_topics:
        raise ValueError("only one commit log topic is supported")

    # XXX: This requires that all storages are associated with the same Kafka
    # cluster so that they can be consumed by the same consumer instance.
    # Unfortunately, we don't have the concept of independently configurable
    # Kafka clusters in settings, only consumer configurations that are
    # associated with storages and/or global default configurations. To avoid
    # implementing yet another method of configuring Kafka clusters, this just
    # piggybacks on the existing configuration method(s), with the assumption
    # that most deployments are going to be using the default configuration.
    storage_keys = [*storages.keys()]

    kafka_topic = (storages[storage_keys[0]].get_table_writer().
                   get_stream_loader().get_default_topic_spec().topic)

    consumer_configuration = build_kafka_consumer_configuration(
        kafka_topic,
        consumer_group,
        auto_offset_reset=auto_offset_reset,
        queued_max_messages_kbytes=queued_max_messages_kbytes,
        queued_min_messages=queued_min_messages,
    )

    for storage_key in storage_keys[1:]:
        if (build_kafka_consumer_configuration(
                storages[storage_key].get_table_writer().get_stream_loader().
                get_default_topic_spec().topic,
                consumer_group,
        )["bootstrap.servers"] != consumer_configuration["bootstrap.servers"]):
            raise ValueError(
                "storages cannot be located on different Kafka clusters")

    if commit_log_topic is None:
        consumer = KafkaConsumer(consumer_configuration)
    else:
        # XXX: This relies on the assumptions that a.) all storages are
        # located on the same Kafka cluster (validated above.)

        commit_log_topic_spec = (storages[storage_keys[0]].get_table_writer(
        ).get_stream_loader().get_commit_log_topic_spec())
        assert commit_log_topic_spec is not None

        producer = ConfluentKafkaProducer(
            build_kafka_producer_configuration(commit_log_topic_spec.topic))
        consumer = KafkaConsumerWithCommitLog(
            consumer_configuration,
            producer=producer,
            commit_log_topic=commit_log_topic,
        )

    metrics = MetricsWrapper(environment.metrics, "consumer")

    configure_metrics(StreamMetricsAdapter(metrics))
    processor = StreamProcessor(
        consumer,
        topic,
        MultistorageConsumerProcessingStrategyFactory(
            [*storages.values()],
            max_batch_size,
            max_batch_time_ms / 1000.0,
            processes=processes,
            input_block_size=input_block_size,
            output_block_size=output_block_size,
            metrics=metrics,
        ),
    )

    def handler(signum: int, frame: Any) -> None:
        processor.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    processor.run()
Example #26
def subscriptions_scheduler(
    *,
    entity_name: str,
    consumer_group: str,
    followed_consumer_group: str,
    auto_offset_reset: str,
    no_strict_offset_reset: bool,
    schedule_ttl: int,
    log_level: Optional[str],
    delay_seconds: Optional[int],
    stale_threshold_seconds: Optional[int],
) -> None:
    """
    The subscriptions scheduler's job is to schedule subscriptions for a single entity.
    It consumes the commit log for that entity which is used as a clock and determines
    which subscriptions to run at each interval. It produces a message for each
    scheduled subscription task to the scheduled subscription topic for that entity, so
    it can be picked up and run by subscription executors.

    The subscriptions scheduler consists of a tick consumer and three processing steps.

    - The tick consumer consumes the commit log and reads the "orig_message_ts" header.
    It constructs a new `Tick` message representing the intervals between each of the
    original messages, which gets passed to the processing strategy. Note: A tick always
    corresponds to a single partition on the original topic (not the commit log topic
    as that is never partitioned).

    - The first processing step is a tick buffer. It buffers ticks where needed and
    determines when to submit them to the rest of the pipeline. The tick buffer behavior
    depends on the watermark mode specified by the entity. In PARTITION mode, ticks are
    never buffered and immediately submitted to the next step. In GLOBAL mode we wait
    (filling the buffer) until the timestamp of a tick has been reached on every
    partition before eventually submitting a tick to the next step. This guarantees that
    a subscription is never scheduled before data on every partition up to that
    timestamp is written to storage.

    - The second processing step provides the strategy for committing offsets. Ticks are
    marked with an `offset_to_commit` if processing that tick allows the committed
    offset to be advanced. Only the earliest commit log offset that has already been seen
    by the strategy will get committed. This guarantees at least once scheduling of
    subscriptions.

    - The third processing step checks the subscription store to determine which
    subscriptions need to be scheduled for each tick. Each scheduled subscription task
    is encoded and produced to the scheduled topic. Offsets are committed if the
    `should_commit` value provided by the previous strategy is true, and only once all
    prior scheduled subscriptions were successfully produced (and replicated).
    """

    setup_logging(log_level)
    setup_sentry()

    metrics = MetricsWrapper(environment.metrics,
                             "subscriptions.scheduler",
                             tags={"entity": entity_name})

    configure_metrics(StreamMetricsAdapter(metrics))

    entity_key = EntityKey(entity_name)

    storage = get_entity(entity_key).get_writable_storage()

    assert (
        storage is not None
    ), f"Entity {entity_name} does not have a writable storage by default."

    if stale_threshold_seconds is not None and delay_seconds is not None:
        assert (stale_threshold_seconds > delay_seconds
                ), "stale_threshold_seconds must be greater than delay_seconds"

    stream_loader = storage.get_table_writer().get_stream_loader()

    scheduled_topic_spec = stream_loader.get_subscription_scheduled_topic_spec(
    )
    assert scheduled_topic_spec is not None

    producer = KafkaProducer(
        build_kafka_producer_configuration(
            scheduled_topic_spec.topic,
            override_params={"partitioner": "consistent"},
        ))

    builder = SchedulerBuilder(
        entity_name,
        consumer_group,
        followed_consumer_group,
        producer,
        auto_offset_reset,
        not no_strict_offset_reset,
        schedule_ttl,
        delay_seconds,
        stale_threshold_seconds,
        metrics,
    )

    processor = builder.build_consumer()

    def handler(signum: int, frame: Any) -> None:
        processor.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    with closing(producer):
        processor.run()
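The GLOBAL watermark mode described in the Example #26 docstring (hold a tick until every partition has reached its timestamp) can be illustrated independently of the real Tick and processing-strategy types. A small sketch with plain dataclasses standing in for those types:

from collections import defaultdict, deque
from dataclasses import dataclass
from typing import Deque, Dict, List


@dataclass(frozen=True)
class Tick:
    partition: int
    timestamp: float  # upper bound of the interval this tick covers


class GlobalTickBuffer:
    # Release a tick only once every partition has reached its timestamp.
    def __init__(self, partition_count: int) -> None:
        self._partition_count = partition_count
        self._buffers: Dict[int, Deque[Tick]] = defaultdict(deque)
        self._watermarks: Dict[int, float] = {}

    def submit(self, tick: Tick) -> List[Tick]:
        self._buffers[tick.partition].append(tick)
        self._watermarks[tick.partition] = tick.timestamp
        if len(self._watermarks) < self._partition_count:
            return []  # some partition has not produced anything yet
        low_watermark = min(self._watermarks.values())
        ready: List[Tick] = []
        for buffer in self._buffers.values():
            while buffer and buffer[0].timestamp <= low_watermark:
                ready.append(buffer.popleft())
        return ready


buffer = GlobalTickBuffer(partition_count=2)
assert buffer.submit(Tick(partition=0, timestamp=10.0)) == []  # partition 1 lags
# Partition 1 reaching t=12 releases the partition-0 tick at t=10; the new
# tick itself waits until partition 0 also reaches t=12.
assert buffer.submit(Tick(partition=1, timestamp=12.0)) == [
    Tick(partition=0, timestamp=10.0)
]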