Example #1
    def __init__(self) -> None:
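        # Resolve the CDC storage by name (an older snuba API; later examples
        # pass a StorageKey) and derive the column set from its table writer.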
        storage = get_cdc_storage("groupassignees")
        schema = storage.get_table_writer().get_schema()

        super().__init__(
            storages=[storage],
            query_plan_builder=SingleStorageQueryPlanBuilder(storage=storage),
            abstract_column_set=schema.get_columns(),
            writable_storage=storage,
        )
Example #2
    def __init__(self) -> None:
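        # Same wiring as the previous example, with the storage resolved via
        # the typed StorageKey enum instead of a bare string.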
        storage = get_cdc_storage(StorageKey.GROUPEDMESSAGES)
        schema = storage.get_table_writer().get_schema()

        super().__init__(
            storages=[storage],
            query_plan_builder=SingleStorageQueryPlanBuilder(storage=storage),
            abstract_column_set=schema.get_columns(),
            writable_storage=storage,
        )
Example #3
    def __init__(self) -> None:
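        # A later revision of the same entity: the plan builder is wrapped in
        # a SimplePipelineBuilder and join_relationships is declared (empty).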
        storage = get_cdc_storage(StorageKey.GROUPEDMESSAGES)
        schema = storage.get_table_writer().get_schema()

        super().__init__(
            storages=[storage],
            query_pipeline_builder=SimplePipelineBuilder(
                query_plan_builder=SingleStorageQueryPlanBuilder(storage=storage),
            ),
            abstract_column_set=schema.get_columns(),
            join_relationships={},
            writable_storage=storage,
        )
Example #4
    def test_empty_topic(self, create_consumer) -> None:
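        # A topic containing only an end-of-partition marker yields
        # NO_SNAPSHOT and commits nothing.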
        kafka_consumer = FakeConfluentKafkaConsumer()
        kafka_consumer.items = [
            build_confluent_kafka_message(0, 0, None, True),
        ]
        create_consumer.return_value = kafka_consumer

        bootstrap = BootstrapState(
            "cdc_control",
            self.broker_config,
            "something",
            get_cdc_storage(StorageKey.GROUPEDMESSAGES),
        )

        ret = bootstrap.handle(None)
        assert ret[0] == ConsumerStateCompletionEvent.NO_SNAPSHOT
        assert kafka_consumer.commit_calls == 0
Example #5
def bulk_load(
    *,
    storage_name: str,
    dest_table: str,
    source: str,
    log_level: Optional[str] = None,
) -> None:
    setup_logging(log_level)
    setup_sentry()

    logger = logging.getLogger("snuba.load-snapshot")
    logger.info("Start bulk load process for storage %s, from source %s",
                storage_name, source)

    storage = get_cdc_storage(StorageKey(storage_name))
    table_writer = storage.get_table_writer()

    # TODO: Have a more abstract way to load sources if/when we support more than one.
    snapshot_source = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT,
        path=source,
    )

    loader = table_writer.get_bulk_loader(
        snapshot_source,
        storage.get_postgres_table(),
        dest_table,
        storage.get_row_processor(),
    )
    # TODO: see whether we need to pass options to the writer
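    # Rows are buffered and flushed to ClickHouse in chunks of
    # settings.BULK_CLICKHOUSE_BUFFER, encoded as JSON rows.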
    writer = BufferedWriterWrapper(
        table_writer.get_batch_writer(
            environment.metrics,
            table_name=dest_table,
            chunk_size=settings.BULK_CLICKHOUSE_BUFFER,
        ),
        settings.BULK_CLICKHOUSE_BUFFER,
        JSONRowEncoder(),
    )

    loader.load(writer)
Example #6
    def __init__(self) -> None:
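        # Full entity definition: the CDC storage plus a declared LEFT join
        # onto the events entity keyed by (project_id, group_id).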
        storage = get_cdc_storage(StorageKey.GROUPASSIGNEES)
        schema = storage.get_table_writer().get_schema()

        super().__init__(
            storages=[storage],
            query_pipeline_builder=SimplePipelineBuilder(
                query_plan_builder=SingleStorageQueryPlanBuilder(storage=storage),
            ),
            abstract_column_set=schema.get_columns(),
            join_relationships={
                "owns": JoinRelationship(
                    rhs_entity=EntityKey.EVENTS,
                    columns=[("project_id", "project_id"), ("group_id", "group_id")],
                    join_type=JoinType.LEFT,
                    equivalences=[],
                )
            },
            writable_storage=storage,
            required_filter_columns=None,
            required_time_column=None,
        )
Example #7
    def test_snapshot_loaded(self, create_consumer: Mock) -> None:
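        # A snapshot-init from another product is ignored; the init for this
        # table plus the final snapshot-loaded event drive the state machine
        # to SNAPSHOT_READY_RECEIVED, with a commit for each relevant message.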
        kafka_consumer = FakeConfluentKafkaConsumer()
        kafka_consumer.items = [
            build_confluent_kafka_message(
                0,
                0,
                b'{"snapshot-id":"abc123", "product":"somewhere-else", "tables": [], "event":"snapshot-init"}',
                False,
            ),
            build_confluent_kafka_message(
                1,
                0,
                b'{"snapshot-id":"abc123", "product":"snuba", "tables": ["sentry_groupedmessage"], "event":"snapshot-init"}',
                False,
            ),
            build_confluent_kafka_message(
                2,
                0,
                (
                    b'{"snapshot-id":"abc123", "event":"snapshot-loaded",'
                    b'"transaction-info": {"xmin":123, "xmax":124, "xip-list": []}'
                    b"}"
                ),
                False,
            ),
            build_confluent_kafka_message(0, 0, None, True),
        ]
        create_consumer.return_value = kafka_consumer

        bootstrap = BootstrapState(
            "cdc_control",
            self.broker_config,
            "something",
            get_cdc_storage(StorageKey.GROUPEDMESSAGES),
        )

        ret = bootstrap.handle(None)
        assert ret[0] == ConsumerStateCompletionEvent.SNAPSHOT_READY_RECEIVED
        assert kafka_consumer.commit_calls == 2
Example #8
    def test_init_snapshot(self, create_consumer) -> None:
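        # A snapshot-init naming this consumer's table parks the consumer in
        # SNAPSHOT_INIT_RECEIVED without committing the offset.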
        kafka_consumer = FakeConfluentKafkaConsumer()
        kafka_consumer.items = [
            build_confluent_kafka_message(
                0,
                0,
                b'{"snapshot-id":"abc123", "tables": ["sentry_groupedmessage"], "product":"snuba", "event":"snapshot-init"}',
                False,
            ),
            build_confluent_kafka_message(0, 0, None, True),
        ]
        create_consumer.return_value = kafka_consumer

        bootstrap = BootstrapState(
            "cdc_control",
            self.broker_config,
            "something",
            get_cdc_storage(StorageKey.GROUPEDMESSAGES),
        )

        ret = bootstrap.handle(None)
        assert ret[0] == ConsumerStateCompletionEvent.SNAPSHOT_INIT_RECEIVED
        assert kafka_consumer.commit_calls == 0
Example #9
    def test_snapshot_for_other_table(self, create_consumer) -> None:
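        # A snapshot-init for an unrelated table is skipped: the consumer
        # reports NO_SNAPSHOT and commits past the ignored message.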
        kafka_consumer = FakeConfluentKafkaConsumer()
        kafka_consumer.items = [
            build_confluent_kafka_message(
                0,
                0,
                b'{"snapshot-id":"abc123", "tables": ["someone_else"], "product":"snuba", "event":"snapshot-init"}',
                False,
            ),
            build_confluent_kafka_message(0, 0, None, True),
        ]
        create_consumer.return_value = kafka_consumer

        bootstrap = BootstrapState(
            "cdc_control",
            "somewhere",
            "something",
            get_cdc_storage("groupedmessages"),
        )

        ret = bootstrap.handle(None)
        assert ret[0] == ConsumerStateCompletionEvent.NO_SNAPSHOT
        assert kafka_consumer.commit_calls == 1
Example #10
def confirm_load(
    *,
    control_topic: Optional[str],
    bootstrap_server: Sequence[str],
    storage_name: str,
    source: str,
    log_level: Optional[str] = None,
) -> None:
    """
    Confirms the snapshot has been loaded by sending the
    snapshot-loaded message on the control topic.
    """

    setup_logging(log_level)
    setup_sentry()

    logger = logging.getLogger("snuba.loaded-snapshot")
    logger.info(
        "Sending load completion message for storage %s, from source %s",
        storage_name,
        source,
    )

    storage_key = StorageKey(storage_name)
    storage = get_cdc_storage(storage_key)

    stream_loader = storage.get_table_writer().get_stream_loader()

    control_topic = control_topic or storage.get_default_control_topic()

    snapshot_source = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT, path=source,
    )

    descriptor = snapshot_source.get_descriptor()

    producer = Producer(
        build_kafka_producer_configuration(
            stream_loader.get_default_topic_spec().topic,
            bootstrap_servers=bootstrap_server,
            override_params={
                "partitioner": "consistent",
                "message.max.bytes": 50000000,  # 50MB, default is 1MB
            },
        )
    )

    msg = SnapshotLoaded(
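        # Echo the snapshot id and the Postgres transaction window
        # (xmin/xmax/xip-list) read from the snapshot descriptor.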
        id=descriptor.id,
        transaction_info=TransactionData(
            xmin=descriptor.xmin, xmax=descriptor.xmax, xip_list=descriptor.xip_list,
        ),
    )
    json_string = json.dumps(msg.to_dict())

    def delivery_callback(error: KafkaError, message: Message) -> None:
        if error is not None:
            raise error
        else:
            logger.info("Message sent %r", message.value())

    producer.produce(
        control_topic, value=json_string, on_delivery=delivery_callback,
    )

    producer.flush()
Example #11
def bulk_load(
    *,
    storage_name: str,
    dest_table: Optional[str],
    source: str,
    ignore_existing_data: bool,
    pre_processed: bool,
    show_progress: bool,
    log_level: Optional[str] = None,
) -> None:
    setup_logging(log_level)
    setup_sentry()

    logger = logging.getLogger("snuba.load-snapshot")
    logger.info(
        "Start bulk load process for storage %s, from source %s", storage_name, source
    )

    storage = get_cdc_storage(StorageKey(storage_name))
    table_writer = storage.get_table_writer()

    # TODO: Have a more abstract way to load sources if/when we support more than one.
    snapshot_source = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT, path=source,
    )

    loader = table_writer.get_bulk_loader(
        snapshot_source,
        storage.get_postgres_table(),
        storage.get_row_processor(),
        dest_table,
    )
    # TODO: see whether we need to pass options to the writer

    def progress_callback(bar: progressbar.ProgressBar, progress: int) -> None:
        bar.update(progress)

    if show_progress:
        progress = progressbar.ProgressBar(
            max_value=snapshot_source.get_table_file_size(storage.get_postgres_table())
        )
        progress_func: Optional[ProgressCallback] = partial(progress_callback, progress)
    else:
        progress_func = None

    table_descriptor = snapshot_source.get_descriptor().get_table(
        storage.get_postgres_table()
    )
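    # Pre-processed snapshots are streamed (optionally gzip-encoded) straight
    # to ClickHouse; otherwise rows go through a buffered JSON batch writer.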
    if pre_processed:
        writer = table_writer.get_bulk_writer(
            metrics=environment.metrics,
            encoding="gzip" if table_descriptor.zip else None,
            column_names=[c.name for c in table_descriptor.columns or []],
            table_name=dest_table,
        )
        loader.load_preprocessed(
            writer, ignore_existing_data, progress_callback=progress_func
        )
    else:
        buffer_writer = BufferedWriterWrapper(
            table_writer.get_batch_writer(
                environment.metrics,
                table_name=dest_table,
                chunk_size=settings.BULK_CLICKHOUSE_BUFFER,
            ),
            settings.BULK_CLICKHOUSE_BUFFER,
            JSONRowEncoder(),
        )
        loader.load(
            buffer_writer, ignore_existing_data, progress_callback=progress_func
        )
Example #12
File: consumer.py  Project: ruezetle/snuba
def consumer(
    *,
    raw_events_topic: Optional[str],
    replacements_topic: Optional[str],
    commit_log_topic: Optional[str],
    control_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    dataset_name: Optional[str],
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    stateful_consumer: bool,
    rapidjson_deserialize: bool,
    rapidjson_serialize: bool,
    log_level: Optional[str] = None,
) -> None:

    if not bootstrap_server:
        if dataset_name:
            bootstrap_server = settings.DEFAULT_DATASET_BROKERS.get(
                dataset_name,
                settings.DEFAULT_BROKERS,
            )
        else:
            bootstrap_server = settings.DEFAULT_STORAGE_BROKERS.get(
                storage_name,
                settings.DEFAULT_BROKERS,
            )

    setup_logging(log_level)
    setup_sentry()

    # TODO: Remove this once dataset_name is no longer being passed
    if dataset_name:
        dataset_writable_storage = get_dataset(dataset_name).get_writable_storage()
        if not dataset_writable_storage:
            raise click.ClickException(
                f"Dataset {dataset_name} has no writable storage"
            )

        storage_name = {
            v: k for k, v in WRITABLE_STORAGES.items()
        }[dataset_writable_storage]

    consumer_builder = ConsumerBuilder(
        storage_name=storage_name,
        raw_topic=raw_events_topic,
        replacements_topic=replacements_topic,
        max_batch_size=max_batch_size,
        max_batch_time_ms=max_batch_time_ms,
        bootstrap_servers=bootstrap_server,
        group_id=consumer_group,
        commit_log_topic=commit_log_topic,
        auto_offset_reset=auto_offset_reset,
        queued_max_messages_kbytes=queued_max_messages_kbytes,
        queued_min_messages=queued_min_messages,
        rapidjson_deserialize=rapidjson_deserialize,
        rapidjson_serialize=rapidjson_serialize,
    )

    if stateful_consumer:
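        # CDC storages carry a control topic, so the stateful path runs a
        # ConsumerStateMachine; the plain path below builds a base consumer.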
        storage = get_cdc_storage(storage_name)
        assert (
            storage is not None
        ), "Only CDC storages have a control topic and are thus supported."
        context = ConsumerStateMachine(
            consumer_builder=consumer_builder,
            topic=control_topic or storage.get_default_control_topic(),
            group_id=consumer_group,
            storage=storage,
        )

        def handler(signum, frame) -> None:
            context.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        context.run()
    else:
        consumer = consumer_builder.build_base_consumer()

        def handler(signum, frame) -> None:
            consumer.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        consumer.run()
Example #13
def consumer(
    *,
    raw_events_topic: Optional[str],
    replacements_topic: Optional[str],
    commit_log_topic: Optional[str],
    control_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    stateful_consumer: bool,
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
    log_level: Optional[str] = None,
    profile_path: Optional[str] = None,
) -> None:

    setup_logging(log_level)
    setup_sentry()

    storage_key = StorageKey(storage_name)

    consumer_builder = ConsumerBuilder(
        storage_key=storage_key,
        raw_topic=raw_events_topic,
        replacements_topic=replacements_topic,
        max_batch_size=max_batch_size,
        max_batch_time_ms=max_batch_time_ms,
        bootstrap_servers=bootstrap_server,
        group_id=consumer_group,
        commit_log_topic=commit_log_topic,
        auto_offset_reset=auto_offset_reset,
        queued_max_messages_kbytes=queued_max_messages_kbytes,
        queued_min_messages=queued_min_messages,
        processes=processes,
        input_block_size=input_block_size,
        output_block_size=output_block_size,
        profile_path=profile_path,
    )

    if stateful_consumer:
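        # Same stateful/plain split as the previous example; this revision
        # resolves storage via StorageKey and adds multiprocessing options
        # (processes, input/output block sizes).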
        storage = get_cdc_storage(storage_key)
        assert (
            storage is not None
        ), "Only CDC storages have a control topic and are thus supported."
        context = ConsumerStateMachine(
            consumer_builder=consumer_builder,
            topic=control_topic or storage.get_default_control_topic(),
            group_id=consumer_group,
            storage=storage,
        )

        def handler(signum: int, frame: Any) -> None:
            context.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        context.run()
    else:
        consumer = consumer_builder.build_base_consumer()

        def handler(signum: int, frame: Any) -> None:
            consumer.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        consumer.run()
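
Every example above follows the same shape: resolve a CDC-capable storage once, then hand it to whatever needs its table writer, Postgres table, or control topic. A minimal standalone sketch of that lookup follows; the import paths are assumptions matching the snuba revisions these examples appear to come from, not verified against any single one:

# Minimal sketch; the two import paths below are assumptions.
from snuba.datasets.storages import StorageKey
from snuba.datasets.storages.factory import get_cdc_storage

storage = get_cdc_storage(StorageKey.GROUPEDMESSAGES)

# The pieces the examples above pull off the resolved storage:
print(storage.get_postgres_table())         # source Postgres table for the CDC stream
print(storage.get_default_control_topic())  # control topic used by BootstrapState
print(storage.get_table_writer().get_schema().get_columns())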