def __init__(self, storage: WritableTableStorage, metrics: MetricsBackend) -> None:
    self.clickhouse = storage.get_cluster().get_query_connection(
        ClickhouseClientSettings.REPLACE
    )
    self.metrics = metrics
    processor = storage.get_table_writer().get_replacer_processor()
    assert (
        processor
    ), f"This storage writer does not support replacements {storage.get_storage_key().value}"
    self.__replacer_processor = processor
    self.__table_name = (
        storage.get_table_writer().get_schema().get_local_table_name()
    )
def __build_batch_writer(
    self, storage: WritableTableStorage
) -> ProcessedMessageBatchWriter:
    replacement_batch_writer: Optional[ReplacementBatchWriter]
    stream_loader = storage.get_table_writer().get_stream_loader()
    replacement_topic_spec = stream_loader.get_replacement_topic_spec()
    default_topic_spec = stream_loader.get_default_topic_spec()
    if replacement_topic_spec is not None:
        # XXX: The producer is flushed when closed on strategy teardown
        # after an assignment is revoked, but never explicitly closed.
        # XXX: This assumes that the Kafka cluster used for the input topic
        # to the storage is the same as the replacement topic.
        replacement_batch_writer = ReplacementBatchWriter(
            ConfluentKafkaProducer(
                build_kafka_producer_configuration(
                    default_topic_spec.topic,
                    override_params={
                        "partitioner": "consistent",
                        "message.max.bytes": 50000000,  # 50MB, default is 1MB
                    },
                )
            ),
            Topic(replacement_topic_spec.topic_name),
        )
    else:
        replacement_batch_writer = None

    return ProcessedMessageBatchWriter(
        InsertBatchWriter(
            storage.get_table_writer().get_batch_writer(
                self.__metrics,
                {"load_balancing": "in_order", "insert_distributed_sync": 1},
            ),
            MetricsWrapper(
                self.__metrics,
                "insertions",
                {"storage": storage.get_storage_key().value},
            ),
        ),
        replacement_batch_writer,
    )
def __init__(
    self,
    clickhouse: ClickhousePool,
    storage: WritableTableStorage,
    metrics: MetricsBackend,
) -> None:
    self.clickhouse = clickhouse
    self.metrics = metrics
    processor = storage.get_table_writer().get_replacer_processor()
    assert (
        processor
    ), f"This storage writer does not support replacements {type(storage)}"
    self.__replacer_processor = processor
def run_cleanup(
    clickhouse: ClickhousePool,
    storage: WritableTableStorage,
    database: str,
    dry_run: bool = True,
) -> int:
    table = storage.get_table_writer().get_schema().get_local_table_name()
    active_parts = get_active_partitions(clickhouse, storage, database, table)
    stale_parts = filter_stale_partitions(active_parts)
    drop_partitions(clickhouse, database, table, stale_parts, dry_run=dry_run)
    return len(stale_parts)
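# Usage sketch (illustrative, not from the original code): one way run_cleanup is
# typically wired up. The helper name `cleanup_storage` is hypothetical, and the
# sketch assumes ClickhouseClientSettings exposes a CLEANUP entry; if the Snuba
# version at hand does not, substitute an appropriate client setting.
def cleanup_storage(storage: WritableTableStorage, dry_run: bool = True) -> int:
    cluster = storage.get_cluster()
    connection = cluster.get_query_connection(ClickhouseClientSettings.CLEANUP)
    # Returns the number of stale partitions that were (or would be) dropped.
    return run_cleanup(connection, storage, cluster.get_database(), dry_run=dry_run)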
def __init__(self, storage: WritableTableStorage, metrics: MetricsBackend) -> None:
    self.__storage = storage
    self.metrics = metrics
    processor = storage.get_table_writer().get_replacer_processor()
    assert (
        processor
    ), f"This storage writer does not support replacements {storage.get_storage_key().value}"
    self.__replacer_processor = processor
    self.__database_name = storage.get_cluster().get_database()
    self.__sharded_pool = RoundRobinConnectionPool(self.__storage.get_cluster())
def __init__(
    self,
    storage: WritableTableStorage,
    metrics: MetricsBackend,
    producer: Optional[ConfluentKafkaProducer] = None,
    replacements_topic: Optional[Topic] = None,
) -> None:
    self.__storage = storage
    self.producer = producer
    self.replacements_topic = replacements_topic
    self.metrics = metrics
    table_writer = storage.get_table_writer()
    self.__writer = BatchWriterEncoderWrapper(
        table_writer.get_batch_writer(
            metrics,
            {"load_balancing": "in_order", "insert_distributed_sync": 1},
        ),
        JSONRowEncoder(),
    )
    self.__processor: MessageProcessor
    self.__pre_filter = table_writer.get_stream_loader().get_pre_filter()
def __init__(
    self,
    storage: WritableTableStorage,
    consumer_group: str,
    metrics: MetricsBackend,
) -> None:
    self.__storage = storage
    self.metrics = metrics
    processor = storage.get_table_writer().get_replacer_processor()
    assert (
        processor
    ), f"This storage writer does not support replacements {storage.get_storage_key().value}"
    self.__replacer_processor = processor
    self.__database_name = storage.get_cluster().get_database()
    self.__sharded_pool = RoundRobinConnectionPool(self.__storage.get_cluster())
    self.__rate_limiter = RateLimiter("replacements")
    self.__last_offset_processed_per_partition: MutableMapping[str, int] = dict()
    self.__consumer_group = consumer_group
def __init__(
    self,
    storage: WritableTableStorage,
    metrics: MetricsBackend,
    producer: Optional[ConfluentKafkaProducer] = None,
    replacements_topic: Optional[Topic] = None,
    rapidjson_deserialize: bool = False,
    rapidjson_serialize: bool = False,
) -> None:
    self.__storage = storage
    self.producer = producer
    self.replacements_topic = replacements_topic
    self.metrics = metrics
    table_writer = storage.get_table_writer()
    self.__writer = table_writer.get_writer(
        {"load_balancing": "in_order", "insert_distributed_sync": 1},
        rapidjson_serialize=rapidjson_serialize,
    )
    self.__rapidjson_deserialize = rapidjson_deserialize
    self.__pre_filter = table_writer.get_stream_loader().get_pre_filter()