def __init__(self) -> None: storage = get_cdc_storage("groupassignees") schema = storage.get_table_writer().get_schema() super().__init__( storages=[storage], query_plan_builder=SingleStorageQueryPlanBuilder(storage=storage), abstract_column_set=schema.get_columns(), writable_storage=storage, )
def __init__(self) -> None:
    storage = get_cdc_storage(StorageKey.GROUPEDMESSAGES)
    schema = storage.get_table_writer().get_schema()

    super().__init__(
        storages=[storage],
        query_plan_builder=SingleStorageQueryPlanBuilder(storage=storage),
        abstract_column_set=schema.get_columns(),
        writable_storage=storage,
    )
def __init__(self) -> None:
    storage = get_cdc_storage(StorageKey.GROUPEDMESSAGES)
    schema = storage.get_table_writer().get_schema()

    super().__init__(
        storages=[storage],
        query_pipeline_builder=SimplePipelineBuilder(
            query_plan_builder=SingleStorageQueryPlanBuilder(storage=storage),
        ),
        abstract_column_set=schema.get_columns(),
        join_relationships={},
        writable_storage=storage,
    )
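# Hedged usage sketch for the constructors above: instantiating an entity and
# reading back the column set that was passed as abstract_column_set. The
# import path and the get_data_model() accessor are assumptions for
# illustration, not verified against the repo.
from snuba.datasets.entities.groupedmessage import GroupedMessageEntity  # hypothetical path

entity = GroupedMessageEntity()
columns = entity.get_data_model()  # the schema.get_columns() set from __init__
print(columns)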
def test_empty_topic(self, create_consumer: Mock) -> None:
    kafka_consumer = FakeConfluentKafkaConsumer()
    kafka_consumer.items = [
        build_confluent_kafka_message(0, 0, None, True),
    ]
    create_consumer.return_value = kafka_consumer

    bootstrap = BootstrapState(
        "cdc_control",
        self.broker_config,
        "something",
        get_cdc_storage(StorageKey.GROUPEDMESSAGES),
    )

    ret = bootstrap.handle(None)
    assert ret[0] == ConsumerStateCompletionEvent.NO_SNAPSHOT
    assert kafka_consumer.commit_calls == 0
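# The helpers used by these tests are test doubles; a hedged sketch of what
# they might look like (the real snuba test utilities may differ). A fake
# message only needs the parts of the confluent_kafka Message API that the
# bootstrap state reads: offset, partition, value, and an error() that marks
# end-of-partition.
from typing import Any, List, Optional
from unittest.mock import Mock

def build_confluent_kafka_message(
    offset: int, partition: int, value: Optional[bytes], eof: bool
) -> Mock:
    message = Mock()
    message.offset.return_value = offset
    message.partition.return_value = partition
    message.value.return_value = value
    # A truthy error() is how confluent-kafka signals conditions like EOF.
    message.error.return_value = Mock() if eof else None
    return message

class FakeConfluentKafkaConsumer:
    def __init__(self) -> None:
        self.items: List[Any] = []
        self.commit_calls = 0

    def poll(self, timeout: Optional[float] = None) -> Optional[Any]:
        return self.items.pop(0) if self.items else None

    def commit(self, *args: Any, **kwargs: Any) -> None:
        self.commit_calls += 1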
def bulk_load(
    *,
    storage_name: str,
    dest_table: str,
    source: str,
    log_level: Optional[str] = None,
) -> None:
    setup_logging(log_level)
    setup_sentry()

    logger = logging.getLogger("snuba.load-snapshot")
    logger.info(
        "Start bulk load process for storage %s, from source %s",
        storage_name,
        source,
    )

    storage = get_cdc_storage(StorageKey(storage_name))
    table_writer = storage.get_table_writer()

    # TODO: Have a more abstract way to load sources if/when we support more than one.
    snapshot_source = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT,
        path=source,
    )

    loader = table_writer.get_bulk_loader(
        snapshot_source,
        storage.get_postgres_table(),
        dest_table,
        storage.get_row_processor(),
    )

    # TODO: see whether we need to pass options to the writer
    writer = BufferedWriterWrapper(
        table_writer.get_batch_writer(
            environment.metrics,
            table_name=dest_table,
            chunk_size=settings.BULK_CLICKHOUSE_BUFFER,
        ),
        settings.BULK_CLICKHOUSE_BUFFER,
        JSONRowEncoder(),
    )

    loader.load(writer)
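# BufferedWriterWrapper batches rows before writing to ClickHouse. A minimal
# standalone sketch of that chunked-flush pattern (names and signatures here
# are illustrative, not snuba's actual classes):
from typing import Callable, List

class ChunkedWriter:
    def __init__(self, flush: Callable[[List[bytes]], None], chunk_size: int) -> None:
        self._flush = flush
        self._chunk_size = chunk_size
        self._buffer: List[bytes] = []

    def write(self, row: bytes) -> None:
        self._buffer.append(row)
        if len(self._buffer) >= self._chunk_size:
            self.flush()

    def flush(self) -> None:
        if self._buffer:
            self._flush(self._buffer)
            self._buffer = []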
def __init__(self) -> None:
    storage = get_cdc_storage(StorageKey.GROUPASSIGNEES)
    schema = storage.get_table_writer().get_schema()

    super().__init__(
        storages=[storage],
        query_pipeline_builder=SimplePipelineBuilder(
            query_plan_builder=SingleStorageQueryPlanBuilder(storage=storage),
        ),
        abstract_column_set=schema.get_columns(),
        join_relationships={
            "owns": JoinRelationship(
                rhs_entity=EntityKey.EVENTS,
                columns=[("project_id", "project_id"), ("group_id", "group_id")],
                join_type=JoinType.LEFT,
                equivalences=[],
            )
        },
        writable_storage=storage,
        required_filter_columns=None,
        required_time_column=None,
    )
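# Hedged illustration of what the "owns" relationship enables: a SnQL join
# from this entity (lhs) to events (the rhs_entity above) on the declared
# column pairs. The query text is an assumption drawn from the declaration,
# not verified against a running Snuba.
snql = """
MATCH (ga: groupassignee) -[owns]-> (e: events)
SELECT ga.group_id, e.project_id
WHERE e.project_id = 1
"""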
def test_snapshot_loaded(self, create_consumer: Mock) -> None:
    kafka_consumer = FakeConfluentKafkaConsumer()
    kafka_consumer.items = [
        build_confluent_kafka_message(
            0,
            0,
            b'{"snapshot-id":"abc123", "product":"somewhere-else", "tables": [], "event":"snapshot-init"}',
            False,
        ),
        build_confluent_kafka_message(
            1,
            0,
            b'{"snapshot-id":"abc123", "product":"snuba", "tables": ["sentry_groupedmessage"], "event":"snapshot-init"}',
            False,
        ),
        build_confluent_kafka_message(
            2,
            0,
            (
                b'{"snapshot-id":"abc123", "event":"snapshot-loaded",'
                b'"transaction-info": {"xmin":123, "xmax":124, "xip-list": []}'
                b"}"
            ),
            False,
        ),
        build_confluent_kafka_message(0, 0, None, True),
    ]
    create_consumer.return_value = kafka_consumer

    bootstrap = BootstrapState(
        "cdc_control",
        self.broker_config,
        "something",
        get_cdc_storage(StorageKey.GROUPEDMESSAGES),
    )

    ret = bootstrap.handle(None)
    assert ret[0] == ConsumerStateCompletionEvent.SNAPSHOT_READY_RECEIVED
    assert kafka_consumer.commit_calls == 2
def test_init_snapshot(self, create_consumer: Mock) -> None:
    kafka_consumer = FakeConfluentKafkaConsumer()
    kafka_consumer.items = [
        build_confluent_kafka_message(
            0,
            0,
            b'{"snapshot-id":"abc123", "tables": ["sentry_groupedmessage"], "product":"snuba", "event":"snapshot-init"}',
            False,
        ),
        build_confluent_kafka_message(0, 0, None, True),
    ]
    create_consumer.return_value = kafka_consumer

    bootstrap = BootstrapState(
        "cdc_control",
        self.broker_config,
        "something",
        get_cdc_storage(StorageKey.GROUPEDMESSAGES),
    )

    ret = bootstrap.handle(None)
    assert ret[0] == ConsumerStateCompletionEvent.SNAPSHOT_INIT_RECEIVED
    assert kafka_consumer.commit_calls == 0
def test_snapshot_for_other_table(self, create_consumer: Mock) -> None:
    kafka_consumer = FakeConfluentKafkaConsumer()
    kafka_consumer.items = [
        build_confluent_kafka_message(
            0,
            0,
            b'{"snapshot-id":"abc123", "tables": ["someone_else"], "product":"snuba", "event":"snapshot-init"}',
            False,
        ),
        build_confluent_kafka_message(0, 0, None, True),
    ]
    create_consumer.return_value = kafka_consumer

    bootstrap = BootstrapState(
        "cdc_control",
        self.broker_config,
        "something",
        get_cdc_storage(StorageKey.GROUPEDMESSAGES),
    )

    ret = bootstrap.handle(None)
    assert ret[0] == ConsumerStateCompletionEvent.NO_SNAPSHOT
    assert kafka_consumer.commit_calls == 1
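# A hedged distillation of the decision these tests exercise: how one
# control-topic payload maps to a completion event. The real BootstrapState
# logic is more involved; this only mirrors the observable behavior asserted
# above.
import json
from typing import Optional

def classify(payload: Optional[bytes], table: str = "sentry_groupedmessage") -> Optional[str]:
    if payload is None:
        return None  # end-of-partition marker carries no event
    event = json.loads(payload)
    if event["event"] == "snapshot-init":
        # Only snapshot-init messages for our product and table matter.
        if event.get("product") == "snuba" and table in event.get("tables", []):
            return "SNAPSHOT_INIT_RECEIVED"
        return "NO_SNAPSHOT"
    if event["event"] == "snapshot-loaded":
        return "SNAPSHOT_READY_RECEIVED"
    return "NO_SNAPSHOT"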
def confirm_load(
    *,
    control_topic: Optional[str],
    bootstrap_server: Sequence[str],
    storage_name: str,
    source: str,
    log_level: Optional[str] = None,
) -> None:
    """
    Confirms the snapshot has been loaded by sending the
    snapshot-loaded message on the control topic.
    """
    setup_logging(log_level)
    setup_sentry()

    logger = logging.getLogger("snuba.loaded-snapshot")
    logger.info(
        "Sending load completion message for storage %s, from source %s",
        storage_name,
        source,
    )

    storage_key = StorageKey(storage_name)
    storage = get_cdc_storage(storage_key)
    stream_loader = storage.get_table_writer().get_stream_loader()

    control_topic = control_topic or storage.get_default_control_topic()

    snapshot_source = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT,
        path=source,
    )
    descriptor = snapshot_source.get_descriptor()

    producer = Producer(
        build_kafka_producer_configuration(
            stream_loader.get_default_topic_spec().topic,
            bootstrap_servers=bootstrap_server,
            override_params={
                "partitioner": "consistent",
                "message.max.bytes": 50000000,  # 50MB, default is 1MB
            },
        )
    )

    msg = SnapshotLoaded(
        id=descriptor.id,
        transaction_info=TransactionData(
            xmin=descriptor.xmin,
            xmax=descriptor.xmax,
            xip_list=descriptor.xip_list,
        ),
    )
    json_string = json.dumps(msg.to_dict())

    def delivery_callback(error: Optional[KafkaError], message: Message) -> None:
        if error is not None:
            raise error
        else:
            logger.info("Message sent %r", message.value())

    producer.produce(
        control_topic,
        value=json_string,
        on_delivery=delivery_callback,
    )

    producer.flush()
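# Minimal standalone sketch of the produce/callback/flush sequence used in
# confirm_load, against the real confluent_kafka API; the broker address,
# topic, and payload here are placeholders.
from confluent_kafka import Producer

producer = Producer({"bootstrap.servers": "localhost:9092"})

def on_delivery(error, message) -> None:
    # Invoked by confluent_kafka once the broker acks (or rejects) the
    # message; delivery callbacks fire during poll()/flush().
    if error is not None:
        raise error

producer.produce("cdc_control", value=b'{"event": "snapshot-loaded"}', on_delivery=on_delivery)
producer.flush()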
def bulk_load(
    *,
    storage_name: str,
    dest_table: Optional[str],
    source: str,
    ignore_existing_data: bool,
    pre_processed: bool,
    show_progress: bool,
    log_level: Optional[str] = None,
) -> None:
    setup_logging(log_level)
    setup_sentry()

    logger = logging.getLogger("snuba.load-snapshot")
    logger.info(
        "Start bulk load process for storage %s, from source %s",
        storage_name,
        source,
    )

    storage = get_cdc_storage(StorageKey(storage_name))
    table_writer = storage.get_table_writer()

    # TODO: Have a more abstract way to load sources if/when we support more than one.
    snapshot_source = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT,
        path=source,
    )

    loader = table_writer.get_bulk_loader(
        snapshot_source,
        storage.get_postgres_table(),
        storage.get_row_processor(),
        dest_table,
    )

    # TODO: see whether we need to pass options to the writer
    def progress_callback(bar: progressbar.ProgressBar, progress: int) -> None:
        bar.update(progress)

    if show_progress:
        progress = progressbar.ProgressBar(
            max_value=snapshot_source.get_table_file_size(storage.get_postgres_table())
        )
        progress_func: Optional[ProgressCallback] = partial(progress_callback, progress)
    else:
        progress_func = None

    table_descriptor = snapshot_source.get_descriptor().get_table(
        storage.get_postgres_table()
    )

    if pre_processed:
        writer = table_writer.get_bulk_writer(
            metrics=environment.metrics,
            encoding="gzip" if table_descriptor.zip else None,
            column_names=[c.name for c in table_descriptor.columns or []],
            table_name=dest_table,
        )
        loader.load_preprocessed(
            writer, ignore_existing_data, progress_callback=progress_func
        )
    else:
        buffer_writer = BufferedWriterWrapper(
            table_writer.get_batch_writer(
                environment.metrics,
                table_name=dest_table,
                chunk_size=settings.BULK_CLICKHOUSE_BUFFER,
            ),
            settings.BULK_CLICKHOUSE_BUFFER,
            JSONRowEncoder(),
        )
        loader.load(
            buffer_writer, ignore_existing_data, progress_callback=progress_func
        )
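# The progress reporting above binds the bar into the callback with
# functools.partial; a standalone sketch of the same pattern, assuming the
# progressbar2 package (max_value is a placeholder):
from functools import partial
import progressbar

def report(bar: progressbar.ProgressBar, progress: int) -> None:
    bar.update(progress)

bar = progressbar.ProgressBar(max_value=1000)
on_progress = partial(report, bar)  # same Callable[[int], None] shape as ProgressCallback
on_progress(500)  # renders the bar at 50%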
def consumer(
    *,
    raw_events_topic: Optional[str],
    replacements_topic: Optional[str],
    commit_log_topic: Optional[str],
    control_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    dataset_name: Optional[str],
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    stateful_consumer: bool,
    rapidjson_deserialize: bool,
    rapidjson_serialize: bool,
    log_level: Optional[str] = None,
) -> None:
    if not bootstrap_server:
        if dataset_name:
            bootstrap_server = settings.DEFAULT_DATASET_BROKERS.get(
                dataset_name,
                settings.DEFAULT_BROKERS,
            )
        else:
            bootstrap_server = settings.DEFAULT_STORAGE_BROKERS.get(
                storage_name,
                settings.DEFAULT_BROKERS,
            )

    setup_logging(log_level)
    setup_sentry()

    # TODO: Remove this once dataset_name is no longer being passed
    if dataset_name:
        dataset_writable_storage = get_dataset(dataset_name).get_writable_storage()
        if not dataset_writable_storage:
            raise click.ClickException(
                f"Dataset {dataset_name} has no writable storage"
            )
        storage_name = {v: k for k, v in WRITABLE_STORAGES.items()}[
            dataset_writable_storage
        ]

    consumer_builder = ConsumerBuilder(
        storage_name=storage_name,
        raw_topic=raw_events_topic,
        replacements_topic=replacements_topic,
        max_batch_size=max_batch_size,
        max_batch_time_ms=max_batch_time_ms,
        bootstrap_servers=bootstrap_server,
        group_id=consumer_group,
        commit_log_topic=commit_log_topic,
        auto_offset_reset=auto_offset_reset,
        queued_max_messages_kbytes=queued_max_messages_kbytes,
        queued_min_messages=queued_min_messages,
        rapidjson_deserialize=rapidjson_deserialize,
        rapidjson_serialize=rapidjson_serialize,
    )

    if stateful_consumer:
        storage = get_cdc_storage(storage_name)
        assert (
            storage is not None
        ), "Only CDC storages have a control topic and are therefore supported."

        context = ConsumerStateMachine(
            consumer_builder=consumer_builder,
            topic=control_topic or storage.get_default_control_topic(),
            group_id=consumer_group,
            storage=storage,
        )

        def handler(signum: int, frame: Any) -> None:
            context.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        context.run()
    else:
        consumer = consumer_builder.build_base_consumer()

        def handler(signum: int, frame: Any) -> None:
            consumer.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        consumer.run()
def consumer(
    *,
    raw_events_topic: Optional[str],
    replacements_topic: Optional[str],
    commit_log_topic: Optional[str],
    control_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    stateful_consumer: bool,
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
    log_level: Optional[str] = None,
    profile_path: Optional[str] = None,
) -> None:
    setup_logging(log_level)
    setup_sentry()

    storage_key = StorageKey(storage_name)

    consumer_builder = ConsumerBuilder(
        storage_key=storage_key,
        raw_topic=raw_events_topic,
        replacements_topic=replacements_topic,
        max_batch_size=max_batch_size,
        max_batch_time_ms=max_batch_time_ms,
        bootstrap_servers=bootstrap_server,
        group_id=consumer_group,
        commit_log_topic=commit_log_topic,
        auto_offset_reset=auto_offset_reset,
        queued_max_messages_kbytes=queued_max_messages_kbytes,
        queued_min_messages=queued_min_messages,
        processes=processes,
        input_block_size=input_block_size,
        output_block_size=output_block_size,
        profile_path=profile_path,
    )

    if stateful_consumer:
        storage = get_cdc_storage(storage_key)
        assert (
            storage is not None
        ), "Only CDC storages have a control topic and are therefore supported."

        context = ConsumerStateMachine(
            consumer_builder=consumer_builder,
            topic=control_topic or storage.get_default_control_topic(),
            group_id=consumer_group,
            storage=storage,
        )

        def handler(signum: int, frame: Any) -> None:
            context.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        context.run()
    else:
        consumer = consumer_builder.build_base_consumer()

        def handler(signum: int, frame: Any) -> None:
            consumer.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        consumer.run()
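# Both consumer variants above install the same shutdown hook; a
# self-contained sketch of that pattern (Runnable is a stand-in for the
# consumer or state machine, not a snuba class):
import signal
from types import FrameType
from typing import Optional

class Runnable:
    def __init__(self) -> None:
        self.shutdown_requested = False

    def signal_shutdown(self) -> None:
        self.shutdown_requested = True

runnable = Runnable()

def handler(signum: int, frame: Optional[FrameType]) -> None:
    runnable.signal_shutdown()

# SIGINT (Ctrl-C) and SIGTERM (e.g. from a container runtime) both request a
# graceful stop; the run loop checks shutdown_requested between batches.
signal.signal(signal.SIGINT, handler)
signal.signal(signal.SIGTERM, handler)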