Example #1
    def __init__(self, storage: WritableTableStorage,
                 metrics: MetricsBackend) -> None:
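        # Acquire a ClickHouse connection using the client settings reserved for replacements.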
        self.clickhouse = storage.get_cluster().get_query_connection(
            ClickhouseClientSettings.REPLACE)

        self.metrics = metrics
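        # Replacements require the storage to define a replacer processor; fail fast otherwise.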
        processor = storage.get_table_writer().get_replacer_processor()
        assert (
            processor
        ), f"This storage writer does not support replacements {storage.get_storage_key().value}"
        self.__replacer_processor = processor
        self.__table_name = (
            storage.get_table_writer().get_schema().get_local_table_name())
Example #2
    def __build_batch_writer(
            self,
            storage: WritableTableStorage) -> ProcessedMessageBatchWriter:
        replacement_batch_writer: Optional[ReplacementBatchWriter]
        stream_loader = storage.get_table_writer().get_stream_loader()
        replacement_topic_spec = stream_loader.get_replacement_topic_spec()
        default_topic_spec = stream_loader.get_default_topic_spec()
        if replacement_topic_spec is not None:
            # XXX: The producer is flushed when closed on strategy teardown
            # after an assignment is revoked, but never explicitly closed.
            # XXX: This assumes that the Kafka cluster used for the input topic
            # to the storage is the same as the replacement topic.
            replacement_batch_writer = ReplacementBatchWriter(
                ConfluentKafkaProducer(
                    build_kafka_producer_configuration(
                        default_topic_spec.topic,
                        override_params={
                            "partitioner": "consistent",
                            "message.max.bytes":
                            50000000,  # 50MB, default is 1MB
                        },
                    )),
                Topic(replacement_topic_spec.topic_name),
            )
        else:
            replacement_batch_writer = None

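        # Compose the metered insert writer with the optional replacement writer.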
        return ProcessedMessageBatchWriter(
            InsertBatchWriter(
                storage.get_table_writer().get_batch_writer(
                    self.__metrics,
                    {
                        "load_balancing": "in_order",
                        "insert_distributed_sync": 1
                    },
                ),
                MetricsWrapper(
                    self.__metrics,
                    "insertions",
                    {"storage": storage.get_storage_key().value},
                ),
            ),
            replacement_batch_writer,
        )
Example #3
    def __init__(self, clickhouse: ClickhousePool,
                 storage: WritableTableStorage,
                 metrics: MetricsBackend) -> None:
        self.clickhouse = clickhouse
        self.metrics = metrics
        # Fail fast if the storage does not define a replacer processor.
        processor = storage.get_table_writer().get_replacer_processor()
        assert (
            processor
        ), f"This storage writer does not support replacements {type(storage)}"
        self.__replacer_processor = processor
Example #4
def run_cleanup(
    clickhouse: ClickhousePool,
    storage: WritableTableStorage,
    database: str,
    dry_run: bool = True,
) -> int:

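    # Resolve the storage's local table, find its stale partitions, and
    # drop them; drop_partitions honors the dry_run flag.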
    table = storage.get_table_writer().get_schema().get_local_table_name()

    active_parts = get_active_partitions(clickhouse, storage, database, table)
    stale_parts = filter_stale_partitions(active_parts)
    drop_partitions(clickhouse, database, table, stale_parts, dry_run=dry_run)
    return len(stale_parts)
Example #5
    def __init__(self, storage: WritableTableStorage,
                 metrics: MetricsBackend) -> None:
        self.__storage = storage

        self.metrics = metrics
        processor = storage.get_table_writer().get_replacer_processor()
        assert (
            processor
        ), f"This storage writer does not support replacements {storage.get_storage_key().value}"
        self.__replacer_processor = processor
        self.__database_name = storage.get_cluster().get_database()

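        # RoundRobinConnectionPool rotates queries across the cluster's nodes.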
        self.__sharded_pool = RoundRobinConnectionPool(
            self.__storage.get_cluster())
Example #6
    def __init__(
        self,
        storage: WritableTableStorage,
        metrics: MetricsBackend,
        producer: Optional[ConfluentKafkaProducer] = None,
        replacements_topic: Optional[Topic] = None,
    ) -> None:
        self.__storage = storage
        self.producer = producer
        self.replacements_topic = replacements_topic
        self.metrics = metrics
        table_writer = storage.get_table_writer()
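        # JSON-encode each row batch before handing it to the storage's batch writer.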
        self.__writer = BatchWriterEncoderWrapper(
            table_writer.get_batch_writer(metrics, {
                "load_balancing": "in_order",
                "insert_distributed_sync": 1
            }),
            JSONRowEncoder(),
        )

        self.__processor: MessageProcessor
        self.__pre_filter = table_writer.get_stream_loader().get_pre_filter()
Example #7
    def __init__(
        self,
        storage: WritableTableStorage,
        consumer_group: str,
        metrics: MetricsBackend,
    ) -> None:
        self.__storage = storage

        self.metrics = metrics
        processor = storage.get_table_writer().get_replacer_processor()
        assert (
            processor
        ), f"This storage writer does not support replacements {storage.get_storage_key().value}"
        self.__replacer_processor = processor
        self.__database_name = storage.get_cluster().get_database()

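        # Rate-limit replacements and rotate them across the cluster's nodes.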
        self.__sharded_pool = RoundRobinConnectionPool(
            self.__storage.get_cluster())
        self.__rate_limiter = RateLimiter("replacements")

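        # Track the last offset processed for each partition.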
        self.__last_offset_processed_per_partition: MutableMapping[
            str, int] = dict()
        self.__consumer_group = consumer_group
Example #8
    def __init__(
        self,
        storage: WritableTableStorage,
        metrics: MetricsBackend,
        producer: Optional[ConfluentKafkaProducer] = None,
        replacements_topic: Optional[Topic] = None,
        rapidjson_deserialize: bool = False,
        rapidjson_serialize: bool = False,
    ) -> None:
        self.__storage = storage
        self.producer = producer
        self.replacements_topic = replacements_topic
        self.metrics = metrics
        table_writer = storage.get_table_writer()
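        # ClickHouse settings: pick replicas in configured order and wait for distributed inserts to complete.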
        self.__writer = table_writer.get_writer(
            {
                "load_balancing": "in_order",
                "insert_distributed_sync": 1
            },
            rapidjson_serialize=rapidjson_serialize,
        )

        self.__rapidjson_deserialize = rapidjson_deserialize
        self.__pre_filter = table_writer.get_stream_loader().get_pre_filter()