Exemplo n.º 1
0
def test_kafka_flush_on_big_message(kafka_cluster):
    """Check that no rows are lost when the Kafka engine consumes big messages.

    Produces ``kafka_messages`` messages to the ``flush`` topic, each one being
    the same JSON row repeated ``batch_messages`` times, routes them through a
    materialized view into ``test.view`` and asserts that
    ``kafka_messages * batch_messages`` rows arrived.
    """
    import time  # local import: needed only for the polling delays below

    # Create batches of messages of size ~100Kb
    kafka_messages = 1000
    batch_messages = 1000
    expected_rows = kafka_messages * batch_messages
    messages = [
        json.dumps({
            'key': i,
            'value': 'x' * 100
        }) * batch_messages for i in range(kafka_messages)
    ]
    kafka_produce('flush', messages)

    instance.query('''
        DROP TABLE IF EXISTS test.view;
        DROP TABLE IF EXISTS test.consumer;
        CREATE TABLE test.kafka (key UInt64, value String)
            ENGINE = Kafka
            SETTINGS kafka_broker_list = 'kafka1:19092',
                     kafka_topic_list = 'flush',
                     kafka_group_name = 'flush',
                     kafka_format = 'JSONEachRow',
                     kafka_max_block_size = 10;
        CREATE TABLE test.view (key UInt64, value String)
            ENGINE = MergeTree
            ORDER BY key;
        CREATE MATERIALIZED VIEW test.consumer TO test.view AS
            SELECT * FROM test.kafka;
    ''')

    # Wait until the 'flush' consumer group has committed an offset equal to
    # the number of produced messages.
    client = KafkaAdminClient(bootstrap_servers="localhost:9092")
    try:
        while True:
            try:
                offsets = client.list_consumer_group_offsets('flush')
            except kafka.errors.GroupCoordinatorNotAvailableError:
                # Coordinator not ready yet: treat as "nothing committed".
                offsets = {}
            if any(tp.topic == 'flush' and meta.offset == kafka_messages
                   for tp, meta in offsets.items()):
                break
            # Pause instead of spinning so the broker is not hammered.
            time.sleep(0.5)
    finally:
        client.close()

    # Bounded wait: if rows were lost the loop must end so that the assertion
    # below can report it, instead of polling forever.
    deadline = time.monotonic() + 20
    while True:
        result = instance.query('SELECT count() FROM test.view')
        if int(result) == expected_rows or time.monotonic() >= deadline:
            break
        time.sleep(1)

    # Clean up before asserting so the tables are dropped even on failure.
    instance.query('''
        DROP TABLE test.consumer;
        DROP TABLE test.view;
    ''')

    assert int(
        result
    ) == expected_rows, 'ClickHouse lost some messages: {}'.format(result)
Exemplo n.º 2
0
def describe_group(bootstrap_server, consumer_group_name,
                   group_coordinator_id=None):
    """Print and return the committed offsets of a consumer group.

    :param bootstrap_server: Kafka bootstrap server(s) to connect to.
    :param consumer_group_name: Id of the consumer group to inspect.
    :param group_coordinator_id: Optional node id of the group's coordinator
        broker. With ``None`` the admin client is left to locate the
        coordinator itself.
    :return: dict mapping ``(topic, partition)`` to the committed offset.
    """
    kafka_admin_client = KafkaAdminClient(bootstrap_servers=bootstrap_server)
    try:
        # One request is enough: the offsets of a group live on its
        # coordinator, so asking once per broker only repeated the same call.
        group_offsets = kafka_admin_client.list_consumer_group_offsets(
            group_id=consumer_group_name,
            group_coordinator_id=group_coordinator_id)
    finally:
        kafka_admin_client.close()

    # Attribute access instead of tuple unpacking, so the code does not depend
    # on how many fields the offset/metadata tuple carries. Keying by
    # (topic, partition) keeps partitions of different topics apart.
    consumer_offset = {
        (topic_partition.topic, topic_partition.partition):
            offset_and_metadata.offset
        for topic_partition, offset_and_metadata in group_offsets.items()
    }

    print(consumer_group_name)
    for (topic, partition), offset in sorted(consumer_offset.items()):
        print('{}[{}]: {}'.format(topic, partition, offset))
    return consumer_offset
Exemplo n.º 3
0
def test_kafka_flush_on_big_message(kafka_cluster):
    """Check that no rows are lost when the Kafka engine consumes big messages.

    Produces ``kafka_messages`` messages to the ``flush`` topic, each one being
    the same JSON row repeated ``batch_messages`` times, routes them through a
    materialized view into ``test.view`` and asserts that
    ``kafka_messages * batch_messages`` rows arrived.
    """
    # Create batches of messages of size ~100Kb
    kafka_messages = 10000
    batch_messages = 1000
    expected_rows = kafka_messages * batch_messages
    messages = [json.dumps({'key': i, 'value': 'x' * 100}) * batch_messages for i in range(kafka_messages)]
    kafka_produce('flush', messages)

    instance.query('''
        DROP TABLE IF EXISTS test.view;
        DROP TABLE IF EXISTS test.consumer;
        CREATE TABLE test.kafka (key UInt64, value String)
            ENGINE = Kafka
            SETTINGS
                kafka_broker_list = 'kafka1:19092',
                kafka_topic_list = 'flush',
                kafka_group_name = 'flush',
                kafka_format = 'JSONEachRow',
                kafka_max_block_size = 10;
        CREATE TABLE test.view (key UInt64, value String)
            ENGINE = MergeTree
            ORDER BY key;
        CREATE MATERIALIZED VIEW test.consumer TO test.view AS
            SELECT * FROM test.kafka;
    ''')

    # Wait until the 'flush' consumer group has committed an offset equal to
    # the number of produced messages.
    client = KafkaAdminClient(bootstrap_servers="localhost:9092")
    try:
        while True:
            try:
                offsets = client.list_consumer_group_offsets('flush')
            except kafka.errors.GroupCoordinatorNotAvailableError:
                # Coordinator not ready yet: treat as "nothing committed".
                offsets = {}
            if any(tp.topic == 'flush' and meta.offset == kafka_messages
                   for tp, meta in offsets.items()):
                break
            # Pause instead of spinning so the broker is not hammered.
            time.sleep(0.5)
    finally:
        client.close()

    # Give the view up to 20 one-second polls to show every row.
    for _ in range(20):
        time.sleep(1)
        result = instance.query('SELECT count() FROM test.view')
        if int(result) == expected_rows:
            break

    assert int(result) == expected_rows, 'ClickHouse lost some messages: {}'.format(result)
Exemplo n.º 4
0
def collect_topic_information(bootstrap_servers, old_consumer_group):
    """Gets a list of current topics being subscribed to by this consumer group that we may need to remove with the migration.

    Using the `list_consumer_group_offsets()` function since `describe_consumer_groups()` doesn't return proper data.

    :param bootstrap_servers: The Kafka brokers in the cluster to connect to.
    :param old_consumer_group: The consumer group we are migrating from.
    :return: List of distinct topic names, in the order they first appear in
        the offsets returned for the group.
    """
    adminClient = KafkaAdminClient(bootstrap_servers=bootstrap_servers)
    try:
        results = adminClient.list_consumer_group_offsets(old_consumer_group)
    finally:
        # Release the connection even when the offsets request fails.
        adminClient.close()

    topics = []
    seen = set()
    # Only the keys (topic/partition pairs) matter here; a topic shows up once
    # per partition, so de-duplicate while keeping first-seen order.
    for topic_partition in results:
        topic = topic_partition.topic
        if topic not in seen:
            seen.add(topic)
            topics.append(topic)
    return topics
Exemplo n.º 5
0
def collect_old_consumer_group_offsets(bootstrap_servers, old_consumer_group,
                                       removed_topics):
    """
    Connects to the brokers specified to gather current offset information of the consumer group we're migrating from.

    The offsets are written to ``OUTPUT_FILE``, one ``topic,partition,offset``
    line per partition.

    :param bootstrap_servers: The Kafka brokers in the cluster to connect to.
    :param old_consumer_group: The consumer group we are migrating from.
    :param removed_topics: Topics whose offsets must be left out of the file.
        ``None`` or an empty collection keeps every topic.
    """
    adminClient = KafkaAdminClient(bootstrap_servers=bootstrap_servers)
    try:
        results = adminClient.list_consumer_group_offsets(old_consumer_group)
    finally:
        # Release the connection even when the offsets request fails.
        adminClient.close()

    # Tolerate None so callers without removed topics do not have to pass [].
    excluded_topics = removed_topics or ()

    delimeter = ','
    with open(OUTPUT_FILE, 'w') as f:
        for topic_partition, offset_and_metadata in results.items():
            if topic_partition.topic in excluded_topics:
                continue

            fields = (topic_partition.topic, topic_partition.partition,
                      offset_and_metadata.offset)
            f.write(delimeter.join(str(field) for field in fields) + '\n')
Exemplo n.º 6
0
class KafkaUtils(object):
    """Helper bundling a Kafka producer, consumer and admin client.

    All operations target the single ``topic`` and consumer ``group_id``
    given at construction time.
    """

    def __init__(self, bootstrap_servers: list, topic: str, group_id: str):
        """Open the producer, consumer and admin client connections.

        :param bootstrap_servers: Kafka brokers to connect to.
        :param topic: Topic that records are produced to and inspected on.
        :param group_id: Consumer group whose offsets are inspected.
        """
        # NOTE(review): (5, 5, 1) looks like a Confluent Platform version
        # rather than a Kafka broker version -- confirm it is intended.
        self.producer = KafkaProducer(bootstrap_servers=bootstrap_servers,
                                      api_version=(5, 5, 1),
                                      request_timeout_ms=1000)
        self.consumer = KafkaConsumer(bootstrap_servers=bootstrap_servers)
        self.admin_client = KafkaAdminClient(
            bootstrap_servers=bootstrap_servers)
        self.bootstrap_servers = bootstrap_servers
        self.topic = topic
        self.group_id = group_id

    def close(self):
        """Close the producer, consumer and admin client connections."""
        self.producer.close()
        self.consumer.close()
        self.admin_client.close()

    def has_consumer_group(self) -> bool:
        """Return True if ``group_id`` is among the listed consumer groups."""
        # Each listed entry is indexable; its first element is the group id.
        return any(group[0] == self.group_id
                   for group in self.admin_client.list_consumer_groups())

    def wait_until_consumer_group(self):
        """Wait for the group to exist via ``do_until_true_with_timeout``."""
        do_until_true_with_timeout(self.has_consumer_group)

    def consume_messages_and_close(self):
        """Consume the topic from the earliest offset as ``group_id``.

        Every message value is logged. The temporary consumer is configured
        with ``consumer_timeout_ms=5000`` and is always closed afterwards.
        """
        tmp_consumer = KafkaConsumer(self.topic,
                                     bootstrap_servers=self.bootstrap_servers,
                                     auto_offset_reset='earliest',
                                     group_id=self.group_id,
                                     consumer_timeout_ms=5000,
                                     enable_auto_commit=True)
        try:
            for msg in tmp_consumer:
                log.info(f"Found message [ {msg.value} ]")
        finally:
            # Close even if iteration fails so the consumer does not linger.
            tmp_consumer.close()

    def ensure_topic_created(self):
        """Create the topic (2 partitions, replication 1) unless it exists."""
        try:
            self.admin_client.create_topics([NewTopic(self.topic, 2, 1)])
        except TopicAlreadyExistsError:
            pass

    def _produce_record_sync(self, key: str, value: str):
        """Send one record and block until it is acknowledged and flushed.

        :raises KafkaError: re-raised after logging when the send or the
            flush fails.
        """
        future = self.producer.send(self.topic, str.encode(value),
                                    str.encode(key))
        try:
            future.get(timeout=5)
            self.producer.flush(timeout=5)
        except KafkaError as e:
            # Same logger as the rest of the class.
            log.warning("Could not produce Kafka record!" + str(e))
            # Bare raise keeps the original traceback.
            raise

    def produce_element_with_delay(self, delay_ms: int):
        """Produce a record with a fresh key and ``delay_ms`` as its value."""
        key = uuid()
        log.info(
            f"Producing element with key [ {key} ] and delay [ {delay_ms} ]")
        self._produce_record_sync(key, str(delay_ms))

    def _get_topic_partitions(self) -> list[TopicPartition]:
        """Return a ``TopicPartition`` for every known partition of the topic.

        Returns an empty list when the consumer reports no partitions.
        """
        # partitions_for_topic may yield None for an unknown topic.
        partitions = self.consumer.partitions_for_topic(self.topic) or ()
        return [
            TopicPartition(self.topic, partition) for partition in partitions
        ]

    def get_latest_offsets(self) -> dict[int, int]:
        """Return the end offset of each partition, keyed by partition."""
        end_offsets = self.consumer.end_offsets(self._get_topic_partitions())
        return convert_to_ordered_dict({
            topic_partition.partition: offset
            for topic_partition, offset in end_offsets.items()
        })

    def get_latest_offset_for_partition(self, partition: int) -> int:
        """Return the end offset of ``partition``, or -1 if it is unknown."""
        return self.get_latest_offsets().get(partition, -1)

    def get_offsets(self) -> dict[int, int]:
        """Return the group's committed offsets on the topic, by partition."""
        group_offsets = self.admin_client.list_consumer_group_offsets(
            self.group_id)
        # Restrict to this topic: the result is keyed by partition number
        # only, so offsets of another topic would overwrite these entries.
        return convert_to_ordered_dict({
            topic_partition.partition: offset_meta.offset
            for topic_partition, offset_meta in group_offsets.items()
            if topic_partition.topic == self.topic
        })

    def get_offset_difference(self) -> OffsetDifference:
        """Pair the committed offsets with the latest offsets."""
        return OffsetDifference(self.get_offsets(), self.get_latest_offsets())

    def wait_for_offset_catchup(self, timeout_seconds: int = 60):
        """Poll once per second until the group is up to date.

        :param timeout_seconds: Maximum time to keep polling.
        :raises TimeoutError: if the group did not catch up in time.
        """
        # Monotonic clock: the deadline must not move with wall-clock changes.
        end_time = time.monotonic() + timeout_seconds
        while time.monotonic() < end_time:
            try:
                self.assert_group_up_to_date()
                return
            except Exception as e:
                # Best effort: any failure just means "not caught up yet".
                log.info(e)
            time.sleep(1)
        raise TimeoutError("Timed out!")

    def assert_group_up_to_date(self):
        """Raise ``AssertionError`` unless the group has consumed everything."""
        offset_difference = self.get_offset_difference()
        # Explicit raise instead of `assert`, which is removed under -O and
        # would make wait_for_offset_catchup() succeed unconditionally.
        if not offset_difference.is_up_to_date():
            raise AssertionError(
                f"Consumer group [ {self.group_id} ] is not up to date: "
                f"{offset_difference}")

    def ensure_not_up_to_date_for_n_seconds(self, seconds: int):
        """Check every two seconds that the group stays behind.

        :param seconds: How long the group must remain not up to date.
        :raises Exception: as soon as the offsets are seen to be up to date.
        """
        end_time = time.monotonic() + seconds
        while time.monotonic() < end_time:
            offset_difference = self.get_offset_difference()
            log.info("Offset difference: " + str(offset_difference))
            if offset_difference.is_up_to_date():
                raise Exception("Offsets are up to date!")
            time.sleep(2)