Example #1
def test_failing_query(
        override_cluster: Callable[[bool], FakeClickhouseCluster]) -> None:
    """
    Test the execution of replacement queries on single node
    when the query fails.
    """
    set_config("write_node_replacements_projects", "[1]")
    override_cluster(False)

    replacer = ReplacerWorker(
        get_writable_storage(StorageKey.ERRORS),
        "consumer_group",
        DummyMetricsBackend(),
    )

    with pytest.raises(ServerExplodedException):
        replacer.flush_batch([
            LegacyReplacement(
                COUNT_QUERY_TEMPLATE,
                INSERT_QUERY_TEMPLATE,
                FINAL_QUERY_TEMPLATE,
                (NEEDS_FINAL, 1),
                REPLACEMENT_TYPE,
                REPLACEMENT_MESSAGE_METADATA,
            )
        ])
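For orientation, here is a minimal sketch of the kind of fixture that could sit behind override_cluster. The FakeClusterStandIn class, its healthy flag, and the omitted storage wiring are assumptions for illustration, not snuba's real FakeClickhouseCluster API.

# Hedged sketch only: a stand-in fake cluster plus the fixture shape used above,
# assuming the real helper records queries per node and can be told to fail.
from typing import Callable, Dict, List

import pytest


class FakeClusterStandIn:
    """Illustrative stand-in, not snuba's FakeClickhouseCluster."""

    def __init__(self, healthy: bool) -> None:
        self.healthy = healthy
        self.queries: Dict[str, List[str]] = {}

    def execute(self, node: str, query: str) -> None:
        if not self.healthy:
            # Analogous to the ServerExplodedException expected in the test above.
            raise RuntimeError("node exploded")
        self.queries.setdefault(node, []).append(query)

    def get_queries(self) -> Dict[str, List[str]]:
        return self.queries


@pytest.fixture
def override_cluster() -> Callable[[bool], FakeClusterStandIn]:
    # The real fixture would also patch the ERRORS storage to point at the fake
    # cluster; that wiring is omitted here.
    def override(healthy: bool) -> FakeClusterStandIn:
        return FakeClusterStandIn(healthy)

    return override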
Example #2
def test_write_each_node(
    override_fixture: Callable[[bool], FakeClickhouseCluster],
    write_node_replacements_projects: str,
    expected_queries: Mapping[str, Sequence[str]],
    request: Any,
) -> None:
    """
    Test the execution of replacement queries on both storage nodes and
    query nodes.
    """
    set_config("write_node_replacements_projects",
               write_node_replacements_projects)
    override_func = request.getfixturevalue(override_fixture)
    test_cluster = override_func(True)

    replacer = ReplacerWorker(
        get_writable_storage(StorageKey.ERRORS),
        "consumer_group",
        DummyMetricsBackend(),
    )

    replacer.flush_batch([
        LegacyReplacement(
            COUNT_QUERY_TEMPLATE,
            INSERT_QUERY_TEMPLATE,
            FINAL_QUERY_TEMPLATE,
            (NEEDS_FINAL, 1),
            REPLACEMENT_TYPE,
            REPLACEMENT_MESSAGE_METADATA,
        )
    ])

    queries = test_cluster.get_queries()
    assert queries == expected_queries
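The extra parameters suggest this test is parametrized. Below is a hedged sketch of what that parametrization could look like; the fixture name, config values, expected query strings, and test ids are placeholders, not the repository's real test data.

# Hedged sketch: one possible pytest parametrization for test_write_each_node.
import pytest

TEST_CASES = [
    pytest.param(
        "override_cluster",  # fixture name resolved via request.getfixturevalue
        "[1]",               # write_node_replacements_projects runtime config
        {"query_node": ["SELECT count() FROM errors_dist FINAL ..."]},  # placeholder
        id="project_1_writes_to_each_storage_node",
    ),
    pytest.param(
        "override_cluster",
        "[]",
        {"query_node": ["SELECT count() FROM errors_dist FINAL ..."]},  # placeholder
        id="no_projects_write_through_query_node_only",
    ),
]


@pytest.mark.parametrize(
    "override_fixture, write_node_replacements_projects, expected_queries",
    TEST_CASES,
)
def test_write_each_node(
    override_fixture, write_node_replacements_projects, expected_queries, request
):
    ...  # body as in the example above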
Example #3
def test_load_balancing(
        override_cluster: Callable[[bool], FakeClickhouseCluster]) -> None:
    """
    Test running two replacements in a row and verify the queries
    are properly load balanced on different nodes.
    """
    set_config("write_node_replacements_projects", "[1]")
    cluster = override_cluster(True)

    replacer = ReplacerWorker(get_writable_storage(StorageKey.ERRORS),
                              DummyMetricsBackend())
    replacement = LegacyReplacement(
        COUNT_QUERY_TEMPLATE,
        INSERT_QUERY_TEMPLATE,
        FINAL_QUERY_TEMPLATE,
        (NEEDS_FINAL, 1),
    )
    replacer.flush_batch([replacement, replacement])

    assert cluster.get_queries() == {
        "query_node": [
            "SELECT count() FROM errors_dist FINAL WHERE event_id = '6f0ccc03-6efb-4f7c-8005-d0c992106b31'",
            "SELECT count() FROM errors_dist FINAL WHERE event_id = '6f0ccc03-6efb-4f7c-8005-d0c992106b31'",
        ],
        "storage-0-0": [LOCAL_QUERY],
        "storage-0-1": [LOCAL_QUERY],
        "storage-1-0": [LOCAL_QUERY],
        "storage-1-1": [LOCAL_QUERY],
        "storage-2-0": [LOCAL_QUERY],
        "storage-2-1": [LOCAL_QUERY],
    }
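The assertion above pins exact replica names. As a follow-up, here is a hedged sketch of the same check expressed as a reusable helper that only assumes the query node is named "query_node".

# Hedged sketch: an order-insensitive version of the load-balancing assertion.
from typing import Mapping, Sequence


def assert_load_balanced(
    queries: Mapping[str, Sequence[str]], local_query: str, replacements_flushed: int
) -> None:
    # The query node receives one distributed count() per replacement flushed.
    assert len(queries["query_node"]) == replacements_flushed
    # Every storage replica executes the local replacement exactly once.
    storage_nodes = {n: qs for n, qs in queries.items() if n != "query_node"}
    assert all(list(qs) == [local_query] for qs in storage_nodes.values())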
Example #4
    def eventstream(*, dataset: Dataset) -> RespTuple:
        record = json.loads(http_request.data)

        version = record[0]
        if version != 2:
            raise RuntimeError("Unsupported protocol version: %s" % record)

        message: Message[KafkaPayload] = Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, http_request.data, []),
            datetime.now(),
        )

        type_ = record[1]

        storage = dataset.get_default_entity().get_writable_storage()
        assert storage is not None

        if type_ == "insert":
            from arroyo.processing.strategies.streaming import (
                KafkaConsumerStrategyFactory,
            )

            from snuba.consumers.consumer import build_batch_writer, process_message

            table_writer = storage.get_table_writer()
            stream_loader = table_writer.get_stream_loader()
            strategy = KafkaConsumerStrategyFactory(
                stream_loader.get_pre_filter(),
                functools.partial(
                    process_message, stream_loader.get_processor(), "consumer_group"
                ),
                build_batch_writer(table_writer, metrics=metrics),
                max_batch_size=1,
                max_batch_time=1.0,
                processes=None,
                input_block_size=None,
                output_block_size=None,
            ).create(lambda offsets: None)
            strategy.submit(message)
            strategy.close()
            strategy.join()
        else:
            from snuba.replacer import ReplacerWorker

            worker = ReplacerWorker(storage, "consumer_group", metrics=metrics)
            processed = worker.process_message(message)
            if processed is not None:
                batch = [processed]
                worker.flush_batch(batch)

        return ("ok", 200, {"Content-Type": "text/plain"})
Example #5
    def eventstream(dataset_name):
        dataset = get_dataset(dataset_name)
        ensure_table_exists(dataset)
        record = json.loads(http_request.data)

        version = record[0]
        if version != 2:
            raise RuntimeError("Unsupported protocol version: %s" % record)

        message = KafkaMessage(
            TopicPartition('topic', 0),
            0,
            http_request.data,
        )

        type_ = record[1]
        metrics = DummyMetricsBackend()
        if type_ == 'insert':
            from snuba.consumer import ConsumerWorker
            worker = ConsumerWorker(dataset,
                                    producer=None,
                                    replacements_topic=None,
                                    metrics=metrics)
        else:
            from snuba.replacer import ReplacerWorker
            worker = ReplacerWorker(clickhouse_rw, dataset, metrics=metrics)

        processed = worker.process_message(message)
        if processed is not None:
            batch = [processed]
            worker.flush_batch(batch)

        return ('ok', 200, {'Content-Type': 'text/plain'})
Example #6
    def eventstream(*, dataset: Dataset):
        record = json.loads(http_request.data)

        version = record[0]
        if version != 2:
            raise RuntimeError("Unsupported protocol version: %s" % record)

        message: Message[KafkaPayload] = Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, http_request.data, []),
            datetime.now(),
        )

        type_ = record[1]

        storage = dataset.get_writable_storage()
        assert storage is not None

        if type_ == "insert":
            from snuba.consumer import ConsumerWorker

            worker = ConsumerWorker(storage, metrics=metrics)
        else:
            from snuba.replacer import ReplacerWorker

            worker = ReplacerWorker(storage, metrics=metrics)

        processed = worker.process_message(message)
        if processed is not None:
            batch = [processed]
            worker.flush_batch(batch)

        return ("ok", 200, {"Content-Type": "text/plain"})
Example #7
def replacer(replacements_topic, consumer_group, bootstrap_server,
             clickhouse_server, distributed_table_name, max_batch_size,
             max_batch_time_ms, auto_offset_reset, queued_max_messages_kbytes,
             queued_min_messages, log_level, dogstatsd_host, dogstatsd_port):

    import sentry_sdk
    from snuba import util
    from snuba.clickhouse import ClickhousePool
    from batching_kafka_consumer import BatchingKafkaConsumer
    from snuba.replacer import ReplacerWorker

    sentry_sdk.init(dsn=settings.SENTRY_DSN)

    logging.basicConfig(level=getattr(logging, log_level.upper()),
                        format='%(asctime)s %(message)s')
    metrics = util.create_metrics(dogstatsd_host,
                                  dogstatsd_port,
                                  'snuba.replacer',
                                  tags=["group:%s" % consumer_group])

    client_settings = {
        # Replacing existing rows requires reconstructing the entire tuple for each
        # event (via a SELECT), which is a Hard Thing (TM) for columnstores to do. With
        # the default settings it's common for ClickHouse to go over the default max_memory_usage
        # of 10GB per query. Lowering the max_block_size reduces memory usage, and increasing the
        # max_memory_usage gives the query more breathing room.
        'max_block_size': settings.REPLACER_MAX_BLOCK_SIZE,
        'max_memory_usage': settings.REPLACER_MAX_MEMORY_USAGE,
        # Don't use up production cache for the count() queries.
        'use_uncompressed_cache': 0,
    }

    clickhouse = ClickhousePool(
        host=clickhouse_server.split(':')[0],
        port=int(clickhouse_server.split(':')[1]),
        client_settings=client_settings,
    )

    replacer = BatchingKafkaConsumer(
        replacements_topic,
        worker=ReplacerWorker(clickhouse,
                              distributed_table_name,
                              metrics=metrics),
        max_batch_size=max_batch_size,
        max_batch_time=max_batch_time_ms,
        metrics=metrics,
        bootstrap_servers=bootstrap_server,
        group_id=consumer_group,
        producer=None,
        commit_log_topic=None,
        auto_offset_reset=auto_offset_reset,
    )

    def handler(signum, frame):
        replacer.signal_shutdown()

    signal.signal(signal.SIGINT, handler)

    replacer.run()
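This function is the body of a CLI command; snuba's command line is built on click, so its parameters would be supplied by option decorators roughly like the hedged sketch below. The option defaults shown are assumptions, not the real CLI definition.

# Hedged sketch: click decorators that could supply the replacer() parameters above.
import click


@click.command()
@click.option("--replacements-topic", default="event-replacements")
@click.option("--consumer-group", default="snuba-replacers")
@click.option("--bootstrap-server", multiple=True)
@click.option("--clickhouse-server", default="localhost:9000")
@click.option("--distributed-table-name", default="sentry_dist")
@click.option("--max-batch-size", type=int, default=1)
@click.option("--max-batch-time-ms", type=int, default=1000)
@click.option("--auto-offset-reset", default="error")
@click.option("--queued-max-messages-kbytes", type=int, default=1048576)
@click.option("--queued-min-messages", type=int, default=10000)
@click.option("--log-level", default="info")
@click.option("--dogstatsd-host", default=None)
@click.option("--dogstatsd-port", type=int, default=None)
def replacer(**options):
    ...  # body as in the example above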
Example #8
    def eventstream():
        record = json.loads(request.data)

        version = record[0]
        if version != 2:
            raise RuntimeError("Unsupported protocol version: %s" % record)

        class Message(object):
            def __init__(self, value):
                self._value = value

            def value(self):
                return self._value

            def partition(self):
                return None

            def offset(self):
                return None

        message = Message(request.data)

        type_ = record[1]
        if type_ == 'insert':
            from snuba.consumer import ConsumerWorker
            worker = ConsumerWorker(clickhouse_rw, settings.CLICKHOUSE_TABLE, producer=None, replacements_topic=None)
        else:
            from snuba.replacer import ReplacerWorker
            worker = ReplacerWorker(clickhouse_rw, settings.CLICKHOUSE_TABLE)

        processed = worker.process_message(message)
        if processed is not None:
            batch = [processed]
            worker.flush_batch(batch)

        return ('ok', 200, {'Content-Type': 'text/plain'})
Example #9
def replacer(
    *,
    replacements_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    log_level: Optional[str] = None,
) -> None:

    from snuba.replacer import ReplacerWorker
    from snuba.utils.streams import Topic
    from snuba.utils.streams.backends.kafka import (
        KafkaConsumer,
        TransportError,
        build_kafka_consumer_configuration,
    )
    from snuba.utils.streams.processing import StreamProcessor
    from snuba.utils.streams.processing.strategies.batching import (
        BatchProcessingStrategyFactory,
    )

    setup_logging(log_level)
    setup_sentry()

    storage_key = StorageKey(storage_name)
    storage = get_writable_storage(storage_key)
    metrics_tags = {"group": consumer_group, "storage": storage_name}

    stream_loader = storage.get_table_writer().get_stream_loader()
    default_replacement_topic_spec = stream_loader.get_replacement_topic_spec()
    assert (
        default_replacement_topic_spec is not None
    ), f"Storage {storage.get_storage_key().value} does not have a replacement topic."
    replacements_topic = replacements_topic or default_replacement_topic_spec.topic_name

    metrics = MetricsWrapper(
        environment.metrics,
        "replacer",
        tags=metrics_tags,
    )

    replacer = StreamProcessor(
        KafkaConsumer(
            build_kafka_consumer_configuration(
                bootstrap_servers=bootstrap_server,
                group_id=consumer_group,
                auto_offset_reset=auto_offset_reset,
                queued_max_messages_kbytes=queued_max_messages_kbytes,
                queued_min_messages=queued_min_messages,
            ),
        ),
        Topic(replacements_topic),
        BatchProcessingStrategyFactory(
            worker=ReplacerWorker(storage, metrics=metrics),
            max_batch_size=max_batch_size,
            max_batch_time=max_batch_time_ms,
            metrics=metrics,
        ),
        metrics=metrics,
        recoverable_errors=[TransportError],
    )

    def handler(signum: int, frame: Any) -> None:
        replacer.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    replacer.run()
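The shutdown wiring at the end relies on signal_shutdown() doing nothing more than setting a flag that run() checks between batches, which keeps the handler safe to call from a signal context. A self-contained stand-in illustrating that contract (not snuba's StreamProcessor) follows.

# Hedged sketch: a minimal stand-in for the run()/signal_shutdown() contract.
import signal
import time
from typing import Any


class ProcessorStandIn:
    """Illustrative stand-in, not snuba's StreamProcessor."""

    def __init__(self) -> None:
        self._shutdown_requested = False

    def signal_shutdown(self) -> None:
        # Only sets a flag, so it is safe to call from a signal handler.
        self._shutdown_requested = True

    def run(self) -> None:
        while not self._shutdown_requested:
            # Poll the consumer, process a batch, commit offsets.
            time.sleep(0.1)


processor = ProcessorStandIn()


def handler(signum: int, frame: Any) -> None:
    processor.signal_shutdown()


signal.signal(signal.SIGINT, handler)
signal.signal(signal.SIGTERM, handler)
processor.run()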
Example #10
def replacer(
    *,
    replacements_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    dataset_name: Optional[str],
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    log_level: Optional[str] = None,
) -> None:

    from snuba.clickhouse.native import ClickhousePool
    from snuba.replacer import ReplacerWorker
    from snuba.utils.codecs import PassthroughCodec
    from snuba.utils.streams.batching import BatchingConsumer
    from snuba.utils.streams.kafka import (
        KafkaConsumer,
        KafkaPayload,
        TransportError,
        build_kafka_consumer_configuration,
    )
    from snuba.utils.streams.types import Topic

    setup_logging(log_level)
    setup_sentry()

    storage = get_writable_storage(storage_name)
    metrics_tags = {"group": consumer_group, "storage": storage_name}

    # If dataset_name is provided, use the writable storage from that dataset.
    # This can be removed once we are passing storage_name instead of
    # dataset_name everywhere
    if dataset_name:
        dataset = get_dataset(dataset_name)
        storage = dataset.get_writable_storage()
        metrics_tags = {"group": consumer_group, "dataset": dataset_name}

    stream_loader = storage.get_table_writer().get_stream_loader()
    default_replacement_topic_spec = stream_loader.get_replacement_topic_spec()
    assert (
        default_replacement_topic_spec is not None
    ), f"Storage {type(storage)} does not have a replacement topic."
    replacements_topic = replacements_topic or default_replacement_topic_spec.topic_name

    metrics = MetricsWrapper(
        environment.metrics,
        "replacer",
        tags=metrics_tags,
    )

    client_settings = {
        # Replacing existing rows requires reconstructing the entire tuple for each
        # event (via a SELECT), which is a Hard Thing (TM) for columnstores to do. With
        # the default settings it's common for ClickHouse to go over the default max_memory_usage
        # of 10GB per query. Lowering the max_block_size reduces memory usage, and increasing the
        # max_memory_usage gives the query more breathing room.
        "max_block_size": settings.REPLACER_MAX_BLOCK_SIZE,
        "max_memory_usage": settings.REPLACER_MAX_MEMORY_USAGE,
        # Don't use up production cache for the count() queries.
        "use_uncompressed_cache": 0,
    }

    clickhouse = ClickhousePool(
        settings.CLICKHOUSE_HOST,
        settings.CLICKHOUSE_PORT,
        client_settings=client_settings,
    )

    codec: PassthroughCodec[KafkaPayload] = PassthroughCodec()
    replacer = BatchingConsumer(
        KafkaConsumer(
            build_kafka_consumer_configuration(
                bootstrap_servers=bootstrap_server,
                group_id=consumer_group,
                auto_offset_reset=auto_offset_reset,
                queued_max_messages_kbytes=queued_max_messages_kbytes,
                queued_min_messages=queued_min_messages,
            ),
            codec=codec,
        ),
        Topic(replacements_topic),
        worker=ReplacerWorker(clickhouse, storage, metrics=metrics),
        max_batch_size=max_batch_size,
        max_batch_time=max_batch_time_ms,
        metrics=metrics,
        recoverable_errors=[TransportError],
    )

    def handler(signum, frame) -> None:
        replacer.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    replacer.run()
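The client_settings block above trades block size for memory headroom on the replacement SELECTs. The hedged sketch below shows how such per-query settings can be applied with the clickhouse-driver package directly; snuba's ClickhousePool wraps this, and the host, table name, and values here are placeholders.

# Hedged sketch: applying per-query ClickHouse settings with clickhouse-driver.
from clickhouse_driver import Client

client_settings = {
    "max_block_size": 4096,          # placeholder for settings.REPLACER_MAX_BLOCK_SIZE
    "max_memory_usage": 10 * 2**30,  # placeholder for settings.REPLACER_MAX_MEMORY_USAGE
    "use_uncompressed_cache": 0,     # keep count() queries out of the production cache
}

client = Client(host="localhost", port=9000)
count = client.execute(
    "SELECT count() FROM errors_local FINAL WHERE project_id = 1",
    settings=client_settings,  # applied to this query only
)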
Example #11
def replacer(*, replacements_topic, consumer_group, bootstrap_server,
             clickhouse_host, clickhouse_port, dataset, max_batch_size,
             max_batch_time_ms, auto_offset_reset, queued_max_messages_kbytes,
             queued_min_messages, log_level, dogstatsd_host, dogstatsd_port):

    import sentry_sdk
    from snuba import util
    from snuba.clickhouse.native import ClickhousePool
    from snuba.replacer import ReplacerWorker
    from snuba.utils.streams.batching import BatchingConsumer
    from snuba.utils.streams.kafka import KafkaConsumer, TransportError, build_kafka_consumer_configuration

    sentry_sdk.init(dsn=settings.SENTRY_DSN)
    dataset = get_dataset(dataset)

    logging.basicConfig(level=getattr(logging, log_level.upper()),
                        format='%(asctime)s %(message)s')

    stream_loader = enforce_table_writer(dataset).get_stream_loader()
    default_replacement_topic_spec = stream_loader.get_replacement_topic_spec()
    assert default_replacement_topic_spec is not None, f"Dataset {dataset} does not have a replacement topic."
    replacements_topic = replacements_topic or default_replacement_topic_spec.topic_name

    metrics = util.create_metrics(dogstatsd_host,
                                  dogstatsd_port,
                                  'snuba.replacer',
                                  tags={"group": consumer_group})

    client_settings = {
        # Replacing existing rows requires reconstructing the entire tuple for each
        # event (via a SELECT), which is a Hard Thing (TM) for columnstores to do. With
        # the default settings it's common for ClickHouse to go over the default max_memory_usage
        # of 10GB per query. Lowering the max_block_size reduces memory usage, and increasing the
        # max_memory_usage gives the query more breathing room.
        'max_block_size': settings.REPLACER_MAX_BLOCK_SIZE,
        'max_memory_usage': settings.REPLACER_MAX_MEMORY_USAGE,
        # Don't use up production cache for the count() queries.
        'use_uncompressed_cache': 0,
    }

    clickhouse = ClickhousePool(
        host=clickhouse_host,
        port=clickhouse_port,
        client_settings=client_settings,
    )

    replacer = BatchingConsumer(
        KafkaConsumer(
            build_kafka_consumer_configuration(
                bootstrap_servers=bootstrap_server,
                group_id=consumer_group,
                auto_offset_reset=auto_offset_reset,
                queued_max_messages_kbytes=queued_max_messages_kbytes,
                queued_min_messages=queued_min_messages,
            ),
        ),
        replacements_topic,
        worker=ReplacerWorker(clickhouse, dataset, metrics=metrics),
        max_batch_size=max_batch_size,
        max_batch_time=max_batch_time_ms,
        metrics=metrics,
        recoverable_errors=[TransportError],
    )

    def handler(signum, frame):
        replacer.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    replacer.run()