def test_any_method_after_close_throws_exception():
    """ Calling any consumer method after close should thorw a RuntimeError
    """
    c = Consumer({'group.id': 'test',
                  'enable.auto.commit': True,
                  'enable.auto.offset.store': False,
                  'socket.timeout.ms': 50,
                  'session.timeout.ms': 100})

    c.subscribe(["test"])
    c.unsubscribe()
    c.close()

    with pytest.raises(RuntimeError) as ex:
        c.subscribe(['test'])
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.unsubscribe()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.poll()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.consume()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.assign([TopicPartition('test', 0)])
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.unassign()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.assignment()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.commit()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.committed([TopicPartition("test", 0)])
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.position([TopicPartition("test", 0)])
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.seek([TopicPartition("test", 0, 0)])
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        lo, hi = c.get_watermark_offsets(TopicPartition("test", 0))
    assert 'Consumer closed' == str(ex.value)
def test_any_method_after_close_throws_exception():
    """ Calling any consumer method after close should thorw a RuntimeError
    """
    c = Consumer({'group.id': 'test',
                  'enable.auto.commit': True,
                  'enable.auto.offset.store': False,
                  'socket.timeout.ms': 50,
                  'session.timeout.ms': 100})

    c.subscribe(["test"])
    c.unsubscribe()
    c.close()

    with pytest.raises(RuntimeError) as ex:
        c.subscribe(['test'])
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.unsubscribe()
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.poll()
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.consume()
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.assign([TopicPartition('test', 0)])
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.unassign()
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.assignment()
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.commit()
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.committed([TopicPartition("test", 0)])
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.position([TopicPartition("test", 0)])
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.seek([TopicPartition("test", 0, 0)])
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        lo, hi = c.get_watermark_offsets(TopicPartition("test", 0))
    assert ex.match('Consumer closed')
Example #3
class KafkaConsumer(object):
    def __init__(self, group_id, topic):
        self.client = Consumer({
            'bootstrap.servers': KAFKA_SERVER_HOSTS,
            'group.id': group_id,
            'session.timeout.ms': 6000,
            'default.topic.config': {
                'auto.offset.reset': 'smallest'
            }
        })
        self.topic = topic

    def query_kafka(self, max_part):
        for p_id in range(0, max_part):
            tp = TopicPartition(self.topic, p_id)
            committed = self.client.committed([tp])
            watermark_offsets = self.client.get_watermark_offsets(tp)
            c_offset = committed[0].offset
            partition = committed[0].partition
            min_offset = watermark_offsets[0]
            max_offset = watermark_offsets[1]
            print("%d %d %d %d %d" % (partition, min_offset, c_offset,
                                      max_offset, max_offset - c_offset))

    def reset_kafka(self, tps):
        for tp in tps:
            self.client.assign([tp])
            print(tp)
            self.client.poll()

    def close(self):
        self.client.close()
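
A minimal usage sketch for the wrapper above (hypothetical: the group id, topic name
and partition count are placeholders, and KAFKA_SERVER_HOSTS is assumed to be defined
elsewhere in the module):

if __name__ == '__main__':
    kc = KafkaConsumer(group_id='offset-inspector', topic='test')
    # Prints "partition, low watermark, committed offset, high watermark, lag"
    # for partitions 0..3 of the topic.
    kc.query_kafka(max_part=4)
    kc.close()
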
def test_basic_api():
    """ Basic API tests, these wont really do anything since there is no
        broker configured. """

    try:
        kc = Consumer()
    except TypeError as e:
        assert str(e) == "expected configuration dict"

    def dummy_commit_cb(err, partitions):
        pass

    kc = Consumer({'group.id': 'test', 'socket.timeout.ms': '100',
                   'session.timeout.ms': 1000,  # Avoid close() blocking too long
                   'on_commit': dummy_commit_cb})

    kc.subscribe(["test"])
    kc.unsubscribe()

    def dummy_assign_revoke(consumer, partitions):
        pass

    kc.subscribe(["test"], on_assign=dummy_assign_revoke, on_revoke=dummy_assign_revoke)
    kc.unsubscribe()

    msg = kc.poll(timeout=0.001)
    if msg is None:
        print('OK: poll() timeout')
    elif msg.error():
        print('OK: consumer error: %s' % msg.error().str())
    else:
        print('OK: consumed message')

    partitions = list(map(lambda p: TopicPartition("test", p), range(0, 100, 3)))
    kc.assign(partitions)

    kc.unassign()

    kc.commit(asynchronous=True)

    try:
        kc.commit(asynchronous=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._NO_OFFSET)

    # Get current position, should all be invalid.
    kc.position(partitions)
    assert len([p for p in partitions if p.offset == -1001]) == len(partitions)

    try:
        offsets = kc.committed(partitions, timeout=0.001)
    except KafkaException as e:
        assert e.args[0].code() == KafkaError._TIMED_OUT


    kc.close()
Example #5
def test_send_offsets_committed_transaction(kafka_cluster):
    input_topic = kafka_cluster.create_topic("input_topic")
    output_topic = kafka_cluster.create_topic("output_topic")
    error_cb = prefixed_error_cb('test_send_offsets_committed_transaction')
    producer = kafka_cluster.producer({
        'client.id': 'producer1',
        'transactional.id': 'example_transactional_id',
        'error_cb': error_cb,
    })

    consumer_conf = {
        'group.id': str(uuid1()),
        'auto.offset.reset': 'earliest',
        'enable.auto.commit': False,
        'enable.partition.eof': True,
        'error_cb': error_cb
    }
    consumer_conf.update(kafka_cluster.client_conf())
    consumer = Consumer(consumer_conf)

    kafka_cluster.seed_topic(input_topic)
    consumer.subscribe([input_topic])

    read_all_msgs(consumer)

    producer.init_transactions()
    transactional_produce(producer, output_topic, 100)

    consumer_position = consumer.position(consumer.assignment())
    group_metadata = consumer.consumer_group_metadata()
    print(
        "=== Sending offsets {} to transaction ===".format(consumer_position))
    producer.send_offsets_to_transaction(consumer_position, group_metadata)
    producer.commit_transaction()

    producer2 = kafka_cluster.producer({
        'client.id': 'producer2',
        'transactional.id': 'example_transactional_id',
        'error_cb': error_cb
    })

    # ensure offset commits are visible prior to sending FetchOffsets request
    producer2.init_transactions()

    committed_offsets = consumer.committed(consumer.assignment())
    print("=== Committed offsets for {} ===".format(committed_offsets))

    assert [tp.offset for tp in committed_offsets] == [100]

    consumer.close()
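
The test above exercises the exactly-once, consume-transform-produce pattern. A hedged
sketch of that loop outside the test harness might look like the following; the broker
address, topic names and transactional.id are placeholders:

from uuid import uuid4

from confluent_kafka import Consumer, Producer

producer = Producer({'bootstrap.servers': 'localhost:9092',
                     'transactional.id': 'example_transactional_id'})
consumer = Consumer({'bootstrap.servers': 'localhost:9092',
                     'group.id': str(uuid4()),
                     'auto.offset.reset': 'earliest',
                     'enable.auto.commit': False})
consumer.subscribe(['input_topic'])

producer.init_transactions()
while True:
    msg = consumer.poll(1.0)
    if msg is None or msg.error():
        continue
    producer.begin_transaction()
    producer.produce('output_topic', msg.value())
    # Commit the consumer's offsets as part of the producer transaction so the
    # produced records and the consumed offsets are committed atomically.
    producer.send_offsets_to_transaction(
        consumer.position(consumer.assignment()),
        consumer.consumer_group_metadata())
    producer.commit_transaction()
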
Example #6
def get_consumer_offsets(
        self, topics: List[str], ignore_group_regex: str = IGNORE_GROUP_REGEX
) -> List[Offset]:
    broker_topics = self.client.list_topics().topics
    partitions = []
    for topic_name in topics:
        partitions.extend([TopicPartition(topic_name, k)
                           for k in broker_topics[topic_name].partitions])
    offsets = []
    for consumer_group in self.get_consumer_groups():
        if re.findall(ignore_group_regex, consumer_group):
            logger.debug(f'Ignoring consumer group: {consumer_group}')
            continue
        consumer = Consumer({**self.config, 'group.id': consumer_group})
        for tp in consumer.committed(partitions, timeout=10):
            if tp.offset == -1001:
                continue
            offset = Offset(consumer_group, tp.topic,
                            tp.partition, tp.offset)
            offsets.append(offset)
    return offsets
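
The method above assumes context that is not shown here: an Offset record, an
IGNORE_GROUP_REGEX constant, a logger, and a wrapper object exposing self.client
(a client with list_topics()), self.config and get_consumer_groups(). One plausible
shape for the missing pieces, purely as an assumption, is:

import logging
import re
from typing import List, NamedTuple

from confluent_kafka import Consumer, TopicPartition

logger = logging.getLogger(__name__)

# Hypothetical: skip internal / underscore-prefixed consumer groups.
IGNORE_GROUP_REGEX = r'^_'


class Offset(NamedTuple):
    consumer_group: str
    topic: str
    partition: int
    offset: int
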
Example #7
def test_basic_api():
    """ Basic API tests, these wont really do anything since there is no
        broker configured. """

    try:
        kc = Consumer()
    except TypeError as e:
        assert str(e) == "expected configuration dict"

    def dummy_commit_cb(err, partitions):
        pass

    kc = Consumer({
        'group.id': 'test',
        'socket.timeout.ms': '100',
        'session.timeout.ms': 1000,  # Avoid close() blocking too long
        'on_commit': dummy_commit_cb
    })

    kc.subscribe(["test"])
    kc.unsubscribe()

    def dummy_assign_revoke(consumer, partitions):
        pass

    kc.subscribe(["test"],
                 on_assign=dummy_assign_revoke,
                 on_revoke=dummy_assign_revoke)
    kc.unsubscribe()

    msg = kc.poll(timeout=0.001)
    if msg is None:
        print('OK: poll() timeout')
    elif msg.error():
        print('OK: consumer error: %s' % msg.error().str())
    else:
        print('OK: consumed message')

    if msg is not None:
        assert msg.timestamp() == (TIMESTAMP_NOT_AVAILABLE, -1)

    msglist = kc.consume(num_messages=10, timeout=0.001)
    assert len(msglist) == 0, "expected 0 messages, not %d" % len(msglist)

    with pytest.raises(ValueError) as ex:
        kc.consume(-100)
    assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value)

    with pytest.raises(ValueError) as ex:
        kc.consume(1000001)
    assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value)

    partitions = list(
        map(lambda part: TopicPartition("test", part), range(0, 100, 3)))
    kc.assign(partitions)

    with pytest.raises(KafkaException) as ex:
        kc.seek(TopicPartition("test", 0, 123))
    assert 'Erroneous state' in str(ex.value)

    # Verify assignment
    assignment = kc.assignment()
    assert partitions == assignment

    # Pause partitions
    kc.pause(partitions)

    # Resume partitions
    kc.resume(partitions)

    # Get cached watermarks, should all be invalid.
    lo, hi = kc.get_watermark_offsets(partitions[0], cached=True)
    assert lo == -1001 and hi == -1001
    assert lo == OFFSET_INVALID and hi == OFFSET_INVALID

    # Query broker for watermarks, should raise an exception.
    try:
        lo, hi = kc.get_watermark_offsets(partitions[0],
                                          timeout=0.5,
                                          cached=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._WAIT_COORD, KafkaError.LEADER_NOT_AVAILABLE),\
            str(e.args[0])

    kc.unassign()

    kc.commit(asynchronous=True)

    try:
        kc.commit(asynchronous=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._NO_OFFSET)

    # Get current position, should all be invalid.
    kc.position(partitions)
    assert len([p for p in partitions
                if p.offset == OFFSET_INVALID]) == len(partitions)

    try:
        kc.committed(partitions, timeout=0.001)
    except KafkaException as e:
        assert e.args[0].code() == KafkaError._TIMED_OUT

    try:
        kc.list_topics(timeout=0.2)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._TRANSPORT)

    try:
        kc.list_topics(topic="hi", timeout=0.1)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._TRANSPORT)

    kc.close()
Example #8
class SynchronizedConsumer(object):
    """
    This class implements the framework for a consumer that is intended to only
    consume messages that have already been consumed and committed by members
    of another consumer group.

    This works similarly to the Kafka built-in ``__consumer_offsets`` topic.
    The consumer group that is being "followed" (the one that must make
    progress for our consumer here to make progress, identified by the
    ``synchronize_commit_group`` constructor parameter/instance attribute) must
    report its offsets to a topic (identified by the ``commit_log_topic``
    constructor parameter/instance attribute). This consumer subscribes to both
    the commit log topic and the topic(s) that we are actually interested
    in consuming messages from. The messages received from the commit log topic
    control whether or not consumption from partitions belonging to the main
    topic is paused, resumed, or allowed to continue in its current state
    without changes.

    The furthest point in any partition that this consumer should ever consume
    to is the maximum offset that has been recorded to the commit log topic for
    that partition. If the offsets recorded to that topic move
    non-monotonically (due to an intentional offset rollback, for instance)
    this consumer *may* consume up to the highest watermark point. (The
    implementation here tries to pause consuming from the partition as soon as
    possible, but this makes no explicit guarantees about that behavior.)
    """
    initial_offset_reset_strategies = {
        'earliest': get_earliest_offset,
        'latest': get_latest_offset,
    }

    def __init__(self, bootstrap_servers, consumer_group, commit_log_topic,
                 synchronize_commit_group, initial_offset_reset='latest', on_commit=None):
        self.bootstrap_servers = bootstrap_servers
        self.consumer_group = consumer_group
        self.commit_log_topic = commit_log_topic
        self.synchronize_commit_group = synchronize_commit_group
        self.initial_offset_reset = self.initial_offset_reset_strategies[initial_offset_reset]

        self.__partition_state_manager = SynchronizedPartitionStateManager(
            self.__on_partition_state_change)
        self.__commit_log_consumer, self.__commit_log_consumer_stop_request = self.__start_commit_log_consumer()

        self.__positions = {}

        def commit_callback(error, partitions):
            if on_commit is not None:
                return on_commit(error, partitions)

        consumer_configuration = {
            'bootstrap.servers': self.bootstrap_servers,
            'group.id': self.consumer_group,
            'enable.auto.commit': 'false',
            'enable.auto.offset.store': 'true',
            'enable.partition.eof': 'false',
            'default.topic.config': {
                'auto.offset.reset': 'error',
            },
            'on_commit': commit_callback,
        }

        self.__consumer = Consumer(consumer_configuration)

    def __start_commit_log_consumer(self, timeout=None):
        """
        Starts running the commit log consumer.
        """
        stop_request_event = threading.Event()
        start_event = threading.Event()
        result = execute(
            functools.partial(
                run_commit_log_consumer,
                bootstrap_servers=self.bootstrap_servers,
                consumer_group='{}:sync:{}'.format(self.consumer_group, uuid.uuid1().hex),
                commit_log_topic=self.commit_log_topic,
                synchronize_commit_group=self.synchronize_commit_group,
                partition_state_manager=self.__partition_state_manager,
                start_event=start_event,
                stop_request_event=stop_request_event,
            ),
        )
        start_event.wait(timeout)
        return result, stop_request_event

    def __check_commit_log_consumer_running(self):
        if not self.__commit_log_consumer.running():
            try:
                result = self.__commit_log_consumer.result(timeout=0)  # noqa
            except TimeoutError:
                pass  # not helpful

            raise Exception('Commit log consumer unexpectedly exited!')

    def __on_partition_state_change(
            self, topic, partition, previous_state_and_offsets, current_state_and_offsets):
        """
        Callback that is invoked when a partition state changes.
        """
        logger.debug('State change for %r: %r to %r', (topic, partition),
                     previous_state_and_offsets, current_state_and_offsets)

        current_state, current_offsets = current_state_and_offsets
        if current_offsets.local is None:
            # It only makes sense to manipulate the consumer if we've got an
            # assignment. (This block should only be entered at startup if the
            # remote offsets are retrieved from the commit log before the local
            # consumer has received its assignment.)
            return

        # TODO: This will be called from the commit log consumer thread, so need
        # to verify that calling the ``consumer.{pause,resume}`` methods is
        # thread safe!
        if current_state in (SynchronizedPartitionState.UNKNOWN, SynchronizedPartitionState.SYNCHRONIZED,
                             SynchronizedPartitionState.REMOTE_BEHIND):
            self.__consumer.pause([TopicPartition(topic, partition, current_offsets.local)])
        elif current_state is SynchronizedPartitionState.LOCAL_BEHIND:
            self.__consumer.resume([TopicPartition(topic, partition, current_offsets.local)])
        else:
            raise NotImplementedError('Unexpected partition state: %s' % (current_state,))

    def subscribe(self, topics, on_assign=None, on_revoke=None):
        """
        Subscribe to a topic.
        """
        self.__check_commit_log_consumer_running()

        def assignment_callback(consumer, assignment):
            # Since ``auto.offset.reset`` is set to ``error`` to force human
            # interaction on an offset reset, we have to explicitly specify the
            # starting offset if no offset has been committed for this topic during
            # the ``__consumer_offsets`` topic retention period.
            assignment = {
                (i.topic, i.partition): self.__positions.get((i.topic, i.partition)) for i in assignment
            }

            for i in self.__consumer.committed([TopicPartition(topic, partition) for (
                    topic, partition), offset in assignment.items() if offset is None]):
                k = (i.topic, i.partition)
                if i.offset > -1:
                    assignment[k] = i.offset
                else:
                    assignment[k] = self.initial_offset_reset(consumer, i.topic, i.partition)

            self.__consumer.assign([TopicPartition(topic, partition, offset)
                                    for (topic, partition), offset in assignment.items()])

            for (topic, partition), offset in assignment.items():
                # Setting the local offsets will either cause the partition to be
                # paused (if the remote offset is unknown or the local offset is
                # not trailing the remote offset) or resumed.
                self.__partition_state_manager.set_local_offset(topic, partition, offset)
                self.__positions[(topic, partition)] = offset

            if on_assign is not None:
                on_assign(self, [TopicPartition(topic, partition)
                                 for topic, partition in assignment.keys()])

        def revocation_callback(consumer, assignment):
            for item in assignment:
                # TODO: This should probably also be removed from the state manager.
                self.__positions.pop((item.topic, item.partition))

            if on_revoke is not None:
                on_revoke(self, assignment)

        self.__consumer.subscribe(
            topics,
            on_assign=assignment_callback,
            on_revoke=revocation_callback)

    def poll(self, timeout):
        self.__check_commit_log_consumer_running()

        message = self.__consumer.poll(timeout)
        if message is None:
            return

        if message.error() is not None:
            return message

        self.__partition_state_manager.validate_local_message(
            message.topic(), message.partition(), message.offset())
        self.__partition_state_manager.set_local_offset(
            message.topic(), message.partition(), message.offset() + 1)
        self.__positions[(message.topic(), message.partition())] = message.offset() + 1

        return message

    def commit(self, *args, **kwargs):
        self.__check_commit_log_consumer_running()

        return self.__consumer.commit(*args, **kwargs)

    def close(self):
        self.__check_commit_log_consumer_running()

        self.__commit_log_consumer_stop_request.set()
        try:
            self.__consumer.close()
        finally:
            self.__commit_log_consumer.result()
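
A hedged usage sketch for SynchronizedConsumer; the broker address, group names and
topics below are placeholders, and the helper machinery it relies on (execute,
run_commit_log_consumer, SynchronizedPartitionStateManager) must already exist in the
surrounding module:

consumer = SynchronizedConsumer(
    bootstrap_servers='localhost:9092',
    consumer_group='follower-group',
    commit_log_topic='commit-log',
    synchronize_commit_group='leader-group',
    initial_offset_reset='latest',
)
consumer.subscribe(['events'])
try:
    while True:
        message = consumer.poll(1.0)
        if message is None or message.error() is not None:
            continue
        # ... process the message ...
        consumer.commit(asynchronous=False)
finally:
    consumer.close()
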
Example #9
class SynchronizedConsumer:
    """
    This class implements the framework for a consumer that is intended to only
    consume messages that have already been consumed and committed by members
    of another consumer group.

    This works similarly to the Kafka built-in ``__consumer_offsets`` topic.
    The consumer group that is being "followed" (the one that must make
    progress for our consumer here to make progress, identified by the
    ``synchronize_commit_group`` constructor parameter/instance attribute) must
    report its offsets to a topic (identified by the ``commit_log_topic``
    constructor parameter/instance attribute). This consumer subscribes to both
    the commit log topic and the topic(s) that we are actually interested
    in consuming messages from. The messages received from the commit log topic
    control whether or not consumption from partitions belonging to the main
    topic is paused, resumed, or allowed to continue in its current state
    without changes.

    The furthest point in any partition that this consumer should ever consume
    to is the maximum offset that has been recorded to the commit log topic for
    that partition. If the offsets recorded to that topic move
    non-monotonically (due to an intentional offset rollback, for instance)
    this consumer *may* consume up to the highest watermark point. (The
    implementation here tries to pause consuming from the partition as soon as
    possible, but this makes no explicit guarantees about that behavior.)
    """

    initial_offset_reset_strategies = {
        "earliest": get_earliest_offset,
        "latest": get_latest_offset
    }

    def __init__(
        self,
        cluster_name,
        consumer_group,
        commit_log_topic,
        synchronize_commit_group,
        initial_offset_reset="latest",
        on_commit=None,
    ):
        self.cluster_name = cluster_name
        self.consumer_group = consumer_group
        self.commit_log_topic = commit_log_topic
        self.synchronize_commit_group = synchronize_commit_group
        self.initial_offset_reset = self.initial_offset_reset_strategies[
            initial_offset_reset]

        self.__partition_state_manager = SynchronizedPartitionStateManager(
            self.__on_partition_state_change)
        (
            self.__commit_log_consumer,
            self.__commit_log_consumer_stop_request,
        ) = self.__start_commit_log_consumer()

        self.__positions = {}

        def commit_callback(error, partitions):
            if on_commit is not None:
                return on_commit(error, partitions)

        consumer_configuration = kafka_config.get_kafka_consumer_cluster_options(
            cluster_name,
            override_params={
                "group.id": self.consumer_group,
                "enable.auto.commit": "false",
                "enable.auto.offset.store": "true",
                "enable.partition.eof": "false",
                "default.topic.config": {
                    "auto.offset.reset": "error"
                },
                "on_commit": commit_callback,
            },
        )

        self.__consumer = Consumer(consumer_configuration)

    def __start_commit_log_consumer(self, timeout=None):
        """
        Starts running the commit log consumer.
        """
        stop_request_event = threading.Event()
        start_event = threading.Event()
        result = execute(
            functools.partial(
                run_commit_log_consumer,
                cluster_name=self.cluster_name,
                consumer_group=f"{self.consumer_group}:sync:{uuid.uuid1().hex}",
                commit_log_topic=self.commit_log_topic,
                synchronize_commit_group=self.synchronize_commit_group,
                partition_state_manager=self.__partition_state_manager,
                start_event=start_event,
                stop_request_event=stop_request_event,
            ))
        start_event.wait(timeout)
        return result, stop_request_event

    def __check_commit_log_consumer_running(self):
        if not self.__commit_log_consumer.running():
            try:
                result = self.__commit_log_consumer.result(timeout=0)  # noqa
            except TimeoutError:
                pass  # not helpful

            raise Exception("Commit log consumer unexpectedly exit!")

    def __on_partition_state_change(self, topic, partition,
                                    previous_state_and_offsets,
                                    current_state_and_offsets):
        """
        Callback that is invoked when a partition state changes.
        """
        logger.debug(
            "State change for %r: %r to %r",
            (topic, partition),
            previous_state_and_offsets,
            current_state_and_offsets,
        )

        current_state, current_offsets = current_state_and_offsets
        if current_offsets.local is None:
            # It only makes sense to manipulate the consumer if we've got an
            # assignment. (This block should only be entered at startup if the
            # remote offsets are retrieved from the commit log before the local
            # consumer has received its assignment.)
            return

        # TODO: This will be called from the commit log consumer thread, so need
        # to verify that calling the ``consumer.{pause,resume}`` methods is
        # thread safe!
        if current_state in (
                SynchronizedPartitionState.UNKNOWN,
                SynchronizedPartitionState.SYNCHRONIZED,
                SynchronizedPartitionState.REMOTE_BEHIND,
        ):
            self.__consumer.pause(
                [TopicPartition(topic, partition, current_offsets.local)])
        elif current_state is SynchronizedPartitionState.LOCAL_BEHIND:
            self.__consumer.resume(
                [TopicPartition(topic, partition, current_offsets.local)])
        else:
            raise NotImplementedError(
                f"Unexpected partition state: {current_state}")

    def subscribe(self, topics, on_assign=None, on_revoke=None):
        """
        Subscribe to a topic.
        """
        self.__check_commit_log_consumer_running()

        def assignment_callback(consumer, assignment):
            # Since ``auto.offset.reset`` is set to ``error`` to force human
            # interaction on an offset reset, we have to explicitly specify the
            # starting offset if no offset has been committed for this topic during
            # the ``__consumer_offsets`` topic retention period.
            assignment = {(i.topic, i.partition): self.__positions.get(
                (i.topic, i.partition))
                          for i in assignment}

            for i in self.__consumer.committed([
                    TopicPartition(topic, partition)
                    for (topic, partition), offset in assignment.items()
                    if offset is None
            ]):
                k = (i.topic, i.partition)
                if i.offset > -1:
                    assignment[k] = i.offset
                else:
                    assignment[k] = self.initial_offset_reset(
                        consumer, i.topic, i.partition)

            self.__consumer.assign([
                TopicPartition(topic, partition, offset)
                for (topic, partition), offset in assignment.items()
            ])

            for (topic, partition), offset in assignment.items():
                # Setting the local offsets will either cause the partition to be
                # paused (if the remote offset is unknown or the local offset is
                # not trailing the remote offset) or resumed.
                self.__partition_state_manager.set_local_offset(
                    topic, partition, offset)
                self.__positions[(topic, partition)] = offset

            if on_assign is not None:
                on_assign(
                    self,
                    [
                        TopicPartition(topic, partition)
                        for topic, partition in assignment.keys()
                    ],
                )

        def revocation_callback(consumer, assignment):
            for item in assignment:
                # TODO: This should probably also be removed from the state manager.
                self.__positions.pop((item.topic, item.partition))

            if on_revoke is not None:
                on_revoke(self, assignment)

        self.__consumer.subscribe(topics,
                                  on_assign=assignment_callback,
                                  on_revoke=revocation_callback)

    def poll(self, timeout):
        self.__check_commit_log_consumer_running()

        message = self.__consumer.poll(timeout)
        if message is None:
            return

        if message.error() is not None:
            return message

        self.__partition_state_manager.validate_local_message(
            message.topic(), message.partition(), message.offset())
        self.__partition_state_manager.set_local_offset(
            message.topic(), message.partition(),
            message.offset() + 1)
        self.__positions[(message.topic(),
                          message.partition())] = message.offset() + 1

        return message

    def commit(self, *args, **kwargs):
        self.__check_commit_log_consumer_running()

        return self.__consumer.commit(*args, **kwargs)

    def close(self):
        self.__check_commit_log_consumer_running()

        self.__commit_log_consumer_stop_request.set()
        try:
            self.__consumer.close()
        finally:
            self.__commit_log_consumer.result()
Example #10
class KafkaConsumer(Consumer[TPayload]):
    """
    The behavior of this consumer differs slightly from the Confluent
    consumer during rebalancing operations. Whenever a partition is assigned
    to this consumer, offsets are *always* automatically reset to the
    committed offset for that partition (or if no offsets have been committed
    for that partition, the offset is reset in accordance with the
    ``auto.offset.reset`` configuration value.) This causes partitions that
    are maintained across a rebalance to have the same offset management
    behavior as a partition that is moved from one consumer to another. To
    prevent uncommitted messages from being consumed multiple times,
    ``commit`` should be called in the partition revocation callback.

    The behavior of ``auto.offset.reset`` also differs slightly from the
    Confluent consumer as well: offsets are only reset during initial
    assignment or subsequent rebalancing operations. Any other circumstances
    that would otherwise lead to preemptive offset reset (e.g. the consumer
    tries to read a message that is before the earliest offset, or the
    consumer attempts to read a message that is after the latest offset) will
    cause an exception to be thrown, rather than resetting the offset, as
    this could lead to chunks of messages being replayed or skipped, depending
    on the circumstances. This also means that if the committed offset is no
    longer available (such as when reading older messages from the log and
    those messages expire, or reading newer messages from the log and the
    leader crashes and partition ownership fails over to an out-of-date
    replica), the consumer will fail-stop rather than reset to the value of
    ``auto.offset.reset``.
    """

    # Set of logical offsets that do not correspond to actual log positions.
    # These offsets should be considered an implementation detail of the Kafka
    # consumer and not used publicly.
    # https://github.com/confluentinc/confluent-kafka-python/blob/443177e1c83d9b66ce30f5eb8775e062453a738b/tests/test_enums.py#L22-L25
    LOGICAL_OFFSETS = frozenset(
        [OFFSET_BEGINNING, OFFSET_END, OFFSET_STORED, OFFSET_INVALID])

    def __init__(
        self,
        configuration: Mapping[str, Any],
        codec: Codec[KafkaPayload, TPayload],
        *,
        commit_retry_policy: Optional[RetryPolicy] = None,
    ) -> None:
        if commit_retry_policy is None:
            commit_retry_policy = NoRetryPolicy()

        auto_offset_reset = configuration.get("auto.offset.reset", "largest")
        if auto_offset_reset in {"smallest", "earliest", "beginning"}:
            self.__resolve_partition_starting_offset = (
                self.__resolve_partition_offset_earliest)
        elif auto_offset_reset in {"largest", "latest", "end"}:
            self.__resolve_partition_starting_offset = (
                self.__resolve_partition_offset_latest)
        elif auto_offset_reset == "error":
            self.__resolve_partition_starting_offset = (
                self.__resolve_partition_offset_error)
        else:
            raise ValueError(
                "invalid value for 'auto.offset.reset' configuration")

        if (as_kafka_configuration_bool(
                configuration.get("enable.auto.commit", "true")) is not False):
            raise ValueError(
                "invalid value for 'enable.auto.commit' configuration")

        if (as_kafka_configuration_bool(
                configuration.get("enable.auto.offset.store", "true"))
                is not False):
            raise ValueError(
                "invalid value for 'enable.auto.offset.store' configuration")

        # NOTE: Offsets are explicitly managed as part of the assignment
        # callback, so preemptively resetting offsets is not enabled.
        self.__consumer = ConfluentConsumer({
            **configuration, "auto.offset.reset":
            "error"
        })

        self.__codec = codec

        self.__offsets: MutableMapping[Partition, int] = {}
        self.__staged_offsets: MutableMapping[Partition, int] = {}
        self.__paused: Set[Partition] = set()

        self.__commit_retry_policy = commit_retry_policy

        self.__state = KafkaConsumerState.CONSUMING

    def __resolve_partition_offset_earliest(
            self,
            partition: ConfluentTopicPartition) -> ConfluentTopicPartition:
        low, high = self.__consumer.get_watermark_offsets(partition)
        return ConfluentTopicPartition(partition.topic, partition.partition,
                                       low)

    def __resolve_partition_offset_latest(
            self,
            partition: ConfluentTopicPartition) -> ConfluentTopicPartition:
        low, high = self.__consumer.get_watermark_offsets(partition)
        return ConfluentTopicPartition(partition.topic, partition.partition,
                                       high)

    def __resolve_partition_offset_error(
            self,
            partition: ConfluentTopicPartition) -> ConfluentTopicPartition:
        raise ConsumerError("unable to resolve partition offsets")

    def subscribe(
        self,
        topics: Sequence[Topic],
        on_assign: Optional[Callable[[Mapping[Partition, int]], None]] = None,
        on_revoke: Optional[Callable[[Sequence[Partition]], None]] = None,
    ) -> None:
        """
        Subscribe to topics. This replaces a previous subscription.

        This method does not block. The subscription may not be fulfilled
        immediately: instead, the ``on_assign`` and ``on_revoke`` callbacks
        are called when the subscription state changes with the updated
        assignment for this consumer.

        If provided, the ``on_assign`` callback is called with a mapping of
        partitions to their offsets (at this point, the working offset and the
        committed offset are the same for each partition) on each subscription
        change. Similarly, the ``on_revoke`` callback (if provided) is called
        with a sequence of partitions that are being removed from this
        consumer's assignment. (This callback does not include the offsets,
        as the working offset and committed offset may differ, in some cases
        by substantial margin.)

        Raises an ``InvalidState`` exception if called on a closed consumer.
        """
        if self.__state is not KafkaConsumerState.CONSUMING:
            raise InvalidState(self.__state)

        def assignment_callback(
                consumer: ConfluentConsumer,
                partitions: Sequence[ConfluentTopicPartition]) -> None:
            self.__state = KafkaConsumerState.ASSIGNING

            try:
                assignment: MutableSequence[ConfluentTopicPartition] = []

                for partition in self.__consumer.committed(partitions):
                    if partition.offset >= 0:
                        assignment.append(partition)
                    elif partition.offset == OFFSET_INVALID:
                        assignment.append(
                            self.__resolve_partition_starting_offset(
                                partition))
                    else:
                        raise ValueError("received unexpected offset")

                offsets: MutableMapping[Partition, int] = {
                    Partition(Topic(i.topic), i.partition): i.offset
                    for i in assignment
                }
                self.__seek(offsets)

                # Ensure that all partitions are resumed on assignment to avoid
                # carrying over state from a previous assignment.
                self.__consumer.resume([
                    ConfluentTopicPartition(partition.topic.name,
                                            partition.index, offset)
                    for partition, offset in offsets.items()
                ])

                for partition in offsets:
                    self.__paused.discard(partition)
            except Exception:
                self.__state = KafkaConsumerState.ERROR
                raise

            try:
                if on_assign is not None:
                    on_assign(offsets)
            finally:
                self.__state = KafkaConsumerState.CONSUMING

        def revocation_callback(
                consumer: ConfluentConsumer,
                partitions: Sequence[ConfluentTopicPartition]) -> None:
            self.__state = KafkaConsumerState.REVOKING

            partitions = [
                Partition(Topic(i.topic), i.partition) for i in partitions
            ]

            try:
                if on_revoke is not None:
                    on_revoke(partitions)
            finally:
                for partition in partitions:
                    # Staged offsets are deleted during partition revocation to
                    # prevent later committing offsets for partitions that are
                    # no longer owned by this consumer.
                    if partition in self.__staged_offsets:
                        logger.warning(
                            "Dropping staged offset for revoked partition (%r)!",
                            partition,
                        )
                        del self.__staged_offsets[partition]

                    try:
                        self.__offsets.pop(partition)
                    except KeyError:
                        # If there was an error during assignment, this
                        # partition may have never been added to the offsets
                        # mapping.
                        logger.warning(
                            "failed to delete offset for unknown partition: %r",
                            partition,
                        )

                    self.__paused.discard(partition)

                self.__state = KafkaConsumerState.CONSUMING

        self.__consumer.subscribe(
            [topic.name for topic in topics],
            on_assign=assignment_callback,
            on_revoke=revocation_callback,
        )

    def unsubscribe(self) -> None:
        """
        Unsubscribe from topics.

        Raises an ``InvalidState`` exception if called on a closed consumer.
        """
        if self.__state is not KafkaConsumerState.CONSUMING:
            raise InvalidState(self.__state)

        self.__consumer.unsubscribe()

    def poll(self,
             timeout: Optional[float] = None) -> Optional[Message[TPayload]]:
        """
        Return the next message available to be consumed, if one is
        available. If no message is available, this method will block up to
        the ``timeout`` value before returning ``None``. A timeout of
        ``0.0`` represents "do not block", while a timeout of ``None``
        represents "block until a message is available (or forever)".

        Calling this method may also invoke subscription state change
        callbacks.

        This method may also raise an ``EndOfPartition`` error (a subtype of
        ``ConsumerError``) when the consumer has reached the end of a
        partition that it is subscribed to and no additional messages are
        available. The ``partition`` attribute of the raised exception
        specifies which partition has reached its end. (Since this
        consumer is multiplexing a set of partitions, this exception does not
        mean that *all* of the partitions that the consumer is subscribed to
        do not have any messages, just that it has reached the end of one of
        them. This also does not mean that additional messages won't be
        available in future poll calls.) Not every backend implementation
        supports this feature or is configured to raise in this scenario.

        Raises an ``InvalidState`` exception if called on a closed consumer.

        Raises a ``TransportError`` for various other consumption-related
        errors.
        """
        if self.__state is not KafkaConsumerState.CONSUMING:
            raise InvalidState(self.__state)

        message: Optional[ConfluentMessage] = self.__consumer.poll(
            *[timeout] if timeout is not None else [])
        if message is None:
            return None

        error: Optional[KafkaError] = message.error()
        if error is not None:
            code = error.code()
            if code == KafkaError._PARTITION_EOF:
                raise EndOfPartition(
                    Partition(Topic(message.topic()), message.partition()),
                    message.offset(),
                )
            elif code == KafkaError._TRANSPORT:
                raise TransportError(str(error))
            else:
                raise ConsumerError(str(error))

        headers: Optional[Headers] = message.headers()
        result = Message(
            Partition(Topic(message.topic()), message.partition()),
            message.offset(),
            self.__codec.decode(
                KafkaPayload(
                    message.key(),
                    message.value(),
                    headers if headers is not None else [],
                )),
            datetime.utcfromtimestamp(message.timestamp()[1] / 1000.0),
        )

        self.__offsets[result.partition] = result.get_next_offset()

        return result

    def tell(self) -> Mapping[Partition, int]:
        """
        Return the read offsets for all assigned partitions.

        Raises an ``InvalidState`` if called on a closed consumer.
        """
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        return self.__offsets

    def __validate_offsets(self, offsets: Mapping[Partition, int]) -> None:
        invalid_offsets: Mapping[Partition, int] = {
            partition: offset
            for partition, offset in offsets.items() if offset < 0
        }

        if invalid_offsets:
            raise ConsumerError(f"invalid offsets: {invalid_offsets!r}")

    def __seek(self, offsets: Mapping[Partition, int]) -> None:
        self.__validate_offsets(offsets)

        if self.__state is KafkaConsumerState.ASSIGNING:
            # Calling ``seek`` on the Confluent consumer from an assignment
            # callback will throw an "Erroneous state" error. Instead,
            # partition offsets have to be initialized by calling ``assign``.
            self.__consumer.assign([
                ConfluentTopicPartition(partition.topic.name, partition.index,
                                        offset)
                for partition, offset in offsets.items()
            ])
        else:
            for partition, offset in offsets.items():
                self.__consumer.seek(
                    ConfluentTopicPartition(partition.topic.name,
                                            partition.index, offset))

        self.__offsets.update(offsets)

    def seek(self, offsets: Mapping[Partition, int]) -> None:
        """
        Change the read offsets for the provided partitions.

        Raises an ``InvalidState`` if called on a closed consumer.
        """
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        if offsets.keys() - self.__offsets.keys():
            raise ConsumerError("cannot seek on unassigned partitions")

        self.__seek(offsets)

    def pause(self, partitions: Sequence[Partition]) -> None:
        """
        Pause the consumption of messages for the provided partitions.

        Raises an ``InvalidState`` if called on a closed consumer.
        """
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        if set(partitions) - self.__offsets.keys():
            raise ConsumerError("cannot pause unassigned partitions")

        self.__consumer.pause([
            ConfluentTopicPartition(partition.topic.name, partition.index)
            for partition in partitions
        ])

        self.__paused.update(partitions)

        # XXX: Seeking to a specific partition offset and immediately pausing
        # that partition causes the seek to be ignored for some reason.
        self.seek({
            partition: offset
            for partition, offset in self.__offsets.items()
            if partition in partitions
        })

    def resume(self, partitions: Sequence[Partition]) -> None:
        """
        Resume the consumption of messages for the provided partitions.

        Raises an ``InvalidState`` if called on a closed consumer.
        """
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        if set(partitions) - self.__offsets.keys():
            raise ConsumerError("cannot resume unassigned partitions")

        self.__consumer.resume([
            ConfluentTopicPartition(partition.topic.name, partition.index)
            for partition in partitions
        ])

        for partition in partitions:
            self.__paused.discard(partition)

    def paused(self) -> Sequence[Partition]:
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        return [*self.__paused]

    def stage_offsets(self, offsets: Mapping[Partition, int]) -> None:
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        if offsets.keys() - self.__offsets.keys():
            raise ConsumerError(
                "cannot stage offsets for unassigned partitions")

        self.__validate_offsets(offsets)

        # TODO: Maybe log a warning if these offsets exceed the current
        # offsets, since that's probably a side effect of an incorrect usage
        # pattern?
        self.__staged_offsets.update(offsets)

    def __commit(self) -> Mapping[Partition, int]:
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        result: Optional[Sequence[ConfluentTopicPartition]]

        if self.__staged_offsets:
            result = self.__consumer.commit(
                offsets=[
                    ConfluentTopicPartition(partition.topic.name,
                                            partition.index, offset)
                    for partition, offset in self.__staged_offsets.items()
                ],
                asynchronous=False,
            )
        else:
            result = []

        assert result is not None  # synchronous commit should return result immediately

        self.__staged_offsets.clear()

        offsets: MutableMapping[Partition, int] = {}

        for value in result:
            # The Confluent Kafka Consumer will include logical offsets in the
            # sequence of ``Partition`` objects returned by ``commit``. These
            # are an implementation detail of the Kafka Consumer, so we don't
            # expose them here.
            # NOTE: These should no longer be seen now that we are forcing
            # offsets to be set as part of the assignment callback.
            if value.offset in self.LOGICAL_OFFSETS:
                continue

            assert value.offset >= 0, "expected non-negative offset"
            offsets[Partition(Topic(value.topic),
                              value.partition)] = value.offset

        return offsets

    def commit_offsets(self) -> Mapping[Partition, int]:
        """
        Commit staged offsets for all partitions that this consumer is
        assigned to. The return value of this method is a mapping of
        partitions with their committed offsets as values.

        Raises an ``InvalidState`` if called on a closed consumer.
        """
        return self.__commit_retry_policy.call(self.__commit)

    def close(self, timeout: Optional[float] = None) -> None:
        """
        Close the consumer. This stops consuming messages, *may* commit
        staged offsets (depending on the configuration), and ends its
        subscription.

        Raises a ``InvalidState`` if the consumer is unable to be closed
        before the timeout is reached.
        """
        try:
            self.__consumer.close()
        except RuntimeError:
            pass

        self.__state = KafkaConsumerState.CLOSED

    @property
    def closed(self) -> bool:
        return self.__state is KafkaConsumerState.CLOSED
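
A hedged usage sketch for the typed consumer above; the configuration values are
placeholders and PassthroughCodec stands in for whatever Codec[KafkaPayload, TPayload]
the surrounding codebase provides:

consumer = KafkaConsumer(
    {
        'bootstrap.servers': 'localhost:9092',
        'group.id': 'example-group',
        'auto.offset.reset': 'earliest',
        'enable.auto.commit': 'false',
        'enable.auto.offset.store': 'false',
    },
    codec=PassthroughCodec(),  # hypothetical codec
)
consumer.subscribe([Topic('events')])
try:
    while True:
        message = consumer.poll(1.0)
        if message is None:
            continue
        # Stage the offset following the message just handled, then commit the
        # staged offsets synchronously.
        consumer.stage_offsets({message.partition: message.get_next_offset()})
        consumer.commit_offsets()
finally:
    consumer.close()
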
Example #11
def test_basic_api():
    """ Basic API tests, these wont really do anything since there is no
        broker configured. """

    try:
        kc = Consumer()
    except TypeError as e:
        assert str(e) == "expected configuration dict"

    def dummy_commit_cb(err, partitions):
        pass

    kc = Consumer({
        'group.id': 'test',
        'socket.timeout.ms': '100',
        'session.timeout.ms': 1000,  # Avoid close() blocking too long
        'on_commit': dummy_commit_cb
    })

    kc.subscribe(["test"])
    kc.unsubscribe()

    def dummy_assign_revoke(consumer, partitions):
        pass

    kc.subscribe(["test"],
                 on_assign=dummy_assign_revoke,
                 on_revoke=dummy_assign_revoke)
    kc.unsubscribe()

    msg = kc.poll(timeout=0.001)
    if msg is None:
        print('OK: poll() timeout')
    elif msg.error():
        print('OK: consumer error: %s' % msg.error().str())
    else:
        print('OK: consumed message')

    if msg is not None:
        assert msg.timestamp() == (TIMESTAMP_NOT_AVAILABLE, -1)

    partitions = list(
        map(lambda p: TopicPartition("test", p), range(0, 100, 3)))
    kc.assign(partitions)

    kc.unassign()

    kc.commit(asynchronous=True)

    try:
        kc.commit(asynchronous=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._NO_OFFSET)

    # Get current position, should all be invalid.
    kc.position(partitions)
    assert len([p for p in partitions if p.offset == -1001]) == len(partitions)

    try:
        offsets = kc.committed(partitions, timeout=0.001)
    except KafkaException as e:
        assert e.args[0].code() == KafkaError._TIMED_OUT

    kc.close()
Example #12
# Get the low and high watermark offsets for a partition
consumer.get_watermark_offsets(TopicPartition('test', 4))
# (0, 19)

# For a brand-new group.id you must consume at least one message first, otherwise the
# offset reset below has no effect; without consuming, the offset reported both before
# and after the reset is always -1001 (OFFSET_INVALID).
# Get the current offset position
consumer.position([TopicPartition('test', 3)])

# Reset the offset to an arbitrary position. The committed offset (tracked per group)
# determines where the *next* connection will start; it does not affect the current
# connection, whose position is determined by position() and changed with seek().
# After resetting the committed offset, close and reconnect for it to take effect.
consumer.seek(TopicPartition('test', 3, 1))
consumer.commit(offsets=[TopicPartition('test', 3, 7)])

# Check the reset position
msg = consumer.committed([TopicPartition('test', 3)])
print(msg)

# offset: Either an absolute offset (>=0) or a logical offset: OFFSET_BEGINNING, OFFSET_END, OFFSET_STORED, OFFSET_INVALID
while True:
    msg = consumer.poll(3.0)
    if msg is None:
        continue
    if msg.error():
        if msg.error().code() == KafkaError._PARTITION_EOF:
            continue
        else:
            print(msg.error())
            break
    print('Received message: {}'.format(msg.value().decode('utf-8')))
Example #13
config = {
    'bootstrap.servers': 'localhost',
    'group.id': 'my-group2',
    'enable.auto.commit': True,
    'default.topic.config': {
        'auto.offset.reset': 'smallest'
    }
}
consumer = Consumer(config)
tp = TopicPartition(topic, 0)

consumer.subscribe([topic])

_, offset_max = consumer.get_watermark_offsets(tp)
offset_min = consumer.committed([tp])[0].offset

print(offset_min, offset_max)
sleep(2)

number = offset_max - max(offset_min, 0)

print(f"Debería leer un total de {number} mensajes")

messages = consumer.consume(num_messages=number, timeout=10)

if not messages:
    raise ValueError('Could not read any messages')

print("Hay un total de " + str(len(messages)) + " mensajes.")
def test_basic_api():
    """ Basic API tests, these wont really do anything since there is no
        broker configured. """

    try:
        kc = Consumer()
    except TypeError as e:
        assert str(e) == "expected configuration dict"

    def dummy_commit_cb(err, partitions):
        pass

    kc = Consumer({'group.id': 'test', 'socket.timeout.ms': '100',
                   'session.timeout.ms': 1000,  # Avoid close() blocking too long
                   'on_commit': dummy_commit_cb})

    kc.subscribe(["test"])
    kc.unsubscribe()

    def dummy_assign_revoke(consumer, partitions):
        pass

    kc.subscribe(["test"], on_assign=dummy_assign_revoke, on_revoke=dummy_assign_revoke)
    kc.unsubscribe()

    msg = kc.poll(timeout=0.001)
    if msg is None:
        print('OK: poll() timeout')
    elif msg.error():
        print('OK: consumer error: %s' % msg.error().str())
    else:
        print('OK: consumed message')

    if msg is not None:
        assert msg.timestamp() == (TIMESTAMP_NOT_AVAILABLE, -1)

    msglist = kc.consume(num_messages=10, timeout=0.001)
    assert len(msglist) == 0, "expected 0 messages, not %d" % len(msglist)

    with pytest.raises(ValueError) as ex:
        kc.consume(-100)
    assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value)

    with pytest.raises(ValueError) as ex:
        kc.consume(1000001)
    assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value)

    partitions = list(map(lambda part: TopicPartition("test", part), range(0, 100, 3)))
    kc.assign(partitions)

    with pytest.raises(KafkaException) as ex:
        kc.seek(TopicPartition("test", 0, 123))
    assert 'Erroneous state' in str(ex.value)

    # Verify assignment
    assignment = kc.assignment()
    assert partitions == assignment

    # Pause partitions
    kc.pause(partitions)

    # Resume partitions
    kc.resume(partitions)

    # Get cached watermarks, should all be invalid.
    lo, hi = kc.get_watermark_offsets(partitions[0], cached=True)
    assert lo == -1001 and hi == -1001
    assert lo == OFFSET_INVALID and hi == OFFSET_INVALID

    # Query broker for watermarks, should raise an exception.
    try:
        lo, hi = kc.get_watermark_offsets(partitions[0], timeout=0.5, cached=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._WAIT_COORD, KafkaError.LEADER_NOT_AVAILABLE),\
            str(e.args[0])

    kc.unassign()

    kc.commit(asynchronous=True)

    try:
        kc.commit(asynchronous=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._NO_OFFSET)

    # Get current position, should all be invalid.
    kc.position(partitions)
    assert len([p for p in partitions if p.offset == OFFSET_INVALID]) == len(partitions)

    try:
        kc.committed(partitions, timeout=0.001)
    except KafkaException as e:
        assert e.args[0].code() == KafkaError._TIMED_OUT

    try:
        kc.list_topics(timeout=0.2)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._TRANSPORT)

    try:
        kc.list_topics(topic="hi", timeout=0.1)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._TRANSPORT)

    kc.close()
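
When a broker is reachable, list_topics() returns a ClusterMetadata object; a minimal sketch of walking it, assuming a consumer kc that is still open:

md = kc.list_topics(timeout=5)
for name, topic_md in md.topics.items():
    print(name, 'partitions:', sorted(topic_md.partitions.keys()))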
Exemplo n.º 15
File: kafka.py Project: Appva/snuba
class KafkaConsumer(Consumer[TopicPartition, int, bytes]):
    """
    The behavior of this consumer differs slightly from the Confluent
    consumer during rebalancing operations. Whenever a partition is assigned
    to this consumer, offsets are *always* automatically reset to the
    committed offset for that partition (or if no offsets have been committed
    for that partition, the offset is reset in accordance with the
    ``auto.offset.reset`` configuration value.) This causes partitions that
    are maintained across a rebalance to have the same offset management
    behavior as a partition that is moved from one consumer to another. To
    prevent uncommitted messages from being consumed multiple times,
    ``commit`` should be called in the partition revocation callback.

    The behavior of ``auto.offset.reset`` also differs slightly from the
    Confluent consumer as well: offsets are only reset during initial
    assignment or subsequent rebalancing operations. Any other circumstances
    that would otherwise lead to preemptive offset reset (e.g. the consumer
    tries to read a message that is before the earliest offset, or the
    consumer attempts to read a message that is after the latest offset) will
    cause an exception to be thrown, rather than resetting the offset, as
    this could lead to chunks messages being replayed or skipped, depending
    on the circumstances. This also means that if the committed offset is no
    longer available (such as when reading older messages from the log and
    those messages expire, or reading newer messages from the log and the
    leader crashes and partition ownership fails over to an out-of-date
    replica), the consumer will fail-stop rather than reset to the value of
    ``auto.offset.reset``.
    """

    # Set of logical offsets that do not correspond to actual log positions.
    # These offsets should be considered an implementation detail of the Kafka
    # consumer and not used publicly.
    # https://github.com/confluentinc/confluent-kafka-python/blob/443177e1c83d9b66ce30f5eb8775e062453a738b/tests/test_enums.py#L22-L25
    LOGICAL_OFFSETS = frozenset(
        [OFFSET_BEGINNING, OFFSET_END, OFFSET_STORED, OFFSET_INVALID])

    def __init__(self, configuration: Mapping[str, Any]) -> None:
        auto_offset_reset = configuration.get("auto.offset.reset", "largest")
        if auto_offset_reset in {"smallest", "earliest", "beginning"}:
            self.__resolve_partition_starting_offset = (
                self.__resolve_partition_offset_earliest)
        elif auto_offset_reset in {"largest", "latest", "end"}:
            self.__resolve_partition_starting_offset = (
                self.__resolve_partition_offset_latest)
        elif auto_offset_reset == "error":
            self.__resolve_partition_starting_offset = (
                self.__resolve_partition_offset_error)
        else:
            raise ValueError(
                "invalid value for 'auto.offset.reset' configuration")

        # NOTE: Offsets are explicitly managed as part of the assignment
        # callback, so preemptively resetting offsets is not enabled.
        self.__consumer = ConfluentConsumer({
            **configuration, "auto.offset.reset":
            "error"
        })

        self.__offsets: MutableMapping[TopicPartition, int] = {}

        self.__state = KafkaConsumerState.CONSUMING

    def __resolve_partition_offset_earliest(
            self,
            partition: ConfluentTopicPartition) -> ConfluentTopicPartition:
        low, high = self.__consumer.get_watermark_offsets(partition)
        return ConfluentTopicPartition(partition.topic, partition.partition,
                                       low)

    def __resolve_partition_offset_latest(
            self,
            partition: ConfluentTopicPartition) -> ConfluentTopicPartition:
        low, high = self.__consumer.get_watermark_offsets(partition)
        return ConfluentTopicPartition(partition.topic, partition.partition,
                                       high)

    def __resolve_partition_offset_error(
            self,
            partition: ConfluentTopicPartition) -> ConfluentTopicPartition:
        raise ConsumerError("unable to resolve partition offsets")

    def subscribe(
        self,
        topics: Sequence[str],
        on_assign: Optional[Callable[[Sequence[TopicPartition]], None]] = None,
        on_revoke: Optional[Callable[[Sequence[TopicPartition]], None]] = None,
    ) -> None:
        if self.__state is not KafkaConsumerState.CONSUMING:
            raise InvalidState(self.__state)

        def assignment_callback(
                consumer: ConfluentConsumer,
                partitions: Sequence[ConfluentTopicPartition]) -> None:
            self.__state = KafkaConsumerState.ASSIGNING

            try:
                assignment: MutableSequence[ConfluentTopicPartition] = []

                for partition in self.__consumer.committed(partitions):
                    if partition.offset >= 0:
                        assignment.append(partition)
                    elif partition.offset == OFFSET_INVALID:
                        assignment.append(
                            self.__resolve_partition_starting_offset(
                                partition))
                    else:
                        raise ValueError("received unexpected offset")

                offsets: MutableMapping[TopicPartition, int] = {
                    TopicPartition(i.topic, i.partition): i.offset
                    for i in assignment
                }
                self.__seek(offsets)
            except Exception:
                self.__state = KafkaConsumerState.ERROR
                raise

            try:
                if on_assign is not None:
                    on_assign(list(offsets.keys()))
            finally:
                self.__state = KafkaConsumerState.CONSUMING

        def revocation_callback(
                consumer: ConfluentConsumer,
                partitions: Sequence[ConfluentTopicPartition]) -> None:
            self.__state = KafkaConsumerState.REVOKING

            streams = [
                TopicPartition(i.topic, i.partition) for i in partitions
            ]

            try:
                if on_revoke is not None:
                    on_revoke(streams)
            finally:
                for stream in streams:
                    try:
                        self.__offsets.pop(stream)
                    except KeyError:
                        # If there was an error during assignment, this stream
                        # may have never been added to the offsets mapping.
                        logger.warning(
                            "failed to delete offset for unknown stream: %r",
                            stream)

                self.__state = KafkaConsumerState.CONSUMING

        self.__consumer.subscribe(topics,
                                  on_assign=assignment_callback,
                                  on_revoke=revocation_callback)

    def unsubscribe(self) -> None:
        if self.__state is not KafkaConsumerState.CONSUMING:
            raise InvalidState(self.__state)

        self.__consumer.unsubscribe()

    def poll(self, timeout: Optional[float] = None) -> Optional[KafkaMessage]:
        if self.__state is not KafkaConsumerState.CONSUMING:
            raise InvalidState(self.__state)

        message: Optional[ConfluentMessage] = self.__consumer.poll(
            *[timeout] if timeout is not None else [])
        if message is None:
            return None

        error: Optional[KafkaError] = message.error()
        if error is not None:
            code = error.code()
            if code == KafkaError._PARTITION_EOF:
                raise EndOfStream(
                    TopicPartition(message.topic(), message.partition()),
                    message.offset(),
                )
            elif code == KafkaError._TRANSPORT:
                raise TransportError(str(error))
            else:
                raise ConsumerError(str(error))

        result = KafkaMessage(
            TopicPartition(message.topic(), message.partition()),
            message.offset(),
            message.value(),
        )

        self.__offsets[result.stream] = result.get_next_offset()

        return result

    def tell(self) -> Mapping[TopicPartition, int]:
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        return self.__offsets

    def __seek(self, offsets: Mapping[TopicPartition, int]) -> None:
        if self.__state is KafkaConsumerState.ASSIGNING:
            # Calling ``seek`` on the Confluent consumer from an assignment
            # callback will throw an "Erroneous state" error. Instead,
            # partition offsets have to be initialized by calling ``assign``.
            self.__consumer.assign([
                ConfluentTopicPartition(stream.topic, stream.partition, offset)
                for stream, offset in offsets.items()
            ])
        else:
            for stream, offset in offsets.items():
                self.__consumer.seek(
                    ConfluentTopicPartition(stream.topic, stream.partition,
                                            offset))

        self.__offsets.update(offsets)

    def seek(self, offsets: Mapping[TopicPartition, int]) -> None:
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        if offsets.keys() - self.__offsets.keys():
            raise ConsumerError("cannot seek on unassigned streams")

        self.__seek(offsets)

    def commit(self) -> Mapping[TopicPartition, int]:
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        result: Optional[Sequence[ConfluentTopicPartition]] = None

        retries_remaining = 3
        while result is None:
            try:
                result = self.__consumer.commit(asynchronous=False)
                assert result is not None
            except KafkaException as e:
                if not e.args[0].code() in (
                        KafkaError.REQUEST_TIMED_OUT,
                        KafkaError.NOT_COORDINATOR_FOR_GROUP,
                        KafkaError._WAIT_COORD,
                ):
                    raise

                if not retries_remaining:
                    raise

                logger.warning(
                    "Commit failed: %s (%d retries remaining)",
                    str(e),
                    retries_remaining,
                )
                retries_remaining -= 1
                time.sleep(1)

        offsets: MutableMapping[TopicPartition, int] = {}

        for value in result:
            # The Confluent Kafka Consumer will include logical offsets in the
            # sequence of ``TopicPartition`` objects returned by ``commit``.
            # These are an implementation detail of the Kafka Consumer, so we
            # don't expose them here.
            # NOTE: These should no longer be seen now that we are forcing
            # offsets to be set as part of the assignment callback.
            if value.offset in self.LOGICAL_OFFSETS:
                continue

            assert value.offset >= 0, "expected non-negative offset"
            offsets[TopicPartition(value.topic,
                                   value.partition)] = value.offset

        return offsets

    def close(self, timeout: Optional[float] = None) -> None:
        try:
            self.__consumer.close()
        except RuntimeError:
            pass

        self.__state = KafkaConsumerState.CLOSED
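
A hedged usage sketch of the wrapper above, following the docstring's advice to commit inside the revocation callback; the broker address, topic name, and process() handler are placeholders, not part of the original project:

consumer = KafkaConsumer({
    "bootstrap.servers": "localhost:9092",  # placeholder
    "group.id": "snuba-example",            # placeholder
    "auto.offset.reset": "earliest",
})

def handle_revoke(streams):
    # Commit before partitions are handed off so messages consumed since
    # the last commit are not replayed by the next assignee.
    consumer.commit()

consumer.subscribe(["events"], on_revoke=handle_revoke)

while True:
    try:
        message = consumer.poll(1.0)
    except EndOfStream:
        continue
    if message is None:
        continue
    process(message)  # placeholder: message is a KafkaMessage(stream, offset, value)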
Exemplo n.º 16
class Base_Consumer:
    def __init__(self, topic: str, bootstrap_server: str, sess_timeout: int,
                 retries: int, group_id: str, assign: bool) -> None:
        """
        Config Consumer properties, 
        Args:
            topic (str): topic of meassage
            bootstrap_server (str): broker connection host:port
            sess_timeout (int): detect failures when using Kafka’s group management facilities
            retries (int): retry for error and exception
            group (str): consumer group id 
            partition (int, optional): Defaults to 0.
            offset (int, optional): message next ready to read position. Defaults to 0.
        """
        self.topic = topic
        self.need_assign_ = assign
        self.consumer = Consumer({
            "bootstrap.servers": bootstrap_server,
            "group.id": group_id,
            "default.topic.config": {
                "auto.offset.reset": "earliest",
                "acks": 1
            },  #EOS
            "api.version.request": True,
            "session.timeout.ms": sess_timeout,  #heartbeat
            "max.poll.interval.ms": 20000,  #processing thread
            "enable.auto.commit": False,
            "auto.commit.interval.ms": 10000,
            "enable.auto.offset.store": True,
            'topic.metadata.refresh.interval.ms': 20000,
            "partition.assignment.strategy": "range",  #default
            "retries": retries,
            "debug": "all"
        })

    def get_partitions_(self, partition_id: int) -> dict:
        part = TopicPartition(self.topic, partition_id)
        partitions = self.consumer.committed([part])
        pprint(f"Current Partition: {partition_id} - {partitions}")
        return partitions

    def get_topics(self) -> str:
        return self.consumer.list_topics(self.topic)

    # @staticmethod
    def on_assign(self, consumer, partitions: List[TopicPartition]) -> None:
        for p in partitions:
            p.offset = 100
        pprint(f"Assign: {partitions}")
        consumer.assign(partitions)

    async def consume(self):
        """Asynchronously consume in a loop"""
        while True:
            results = 1
            while results > 0:
                # receive_msgs returns a DataFrame; keep polling while it
                # still yields records.
                results = len(self.receive_msgs(self.on_assign))
            await sleep(1)

    def receive_msgs(self, func_assign: Callable) -> Union[Text, pd.DataFrame]:
        running = True
        c = self.consumer
        if self.need_assign_:
            try:
                c.subscribe([self.topic], on_assign=func_assign)
            except KafkaException as e:
                pprint(e)
        else:
            try:
                c.subscribe([self.topic])
            except Exception as e:
                pprint(e)

        message_values = list()
        offsets = list()
        keys = list()
        partitions = list()
        try:
            while running:
                msg = c.poll(10)
                if msg is None:
                    continue
                if msg.error():
                    print("Consumer error: {}".format(msg.error()))
                    continue
                # ==== processing ====
                payload_ = msg.value().decode("utf-8")
                key_ = msg.key().decode("utf-8")
                partition_ = msg.partition()
                offset_ = msg.offset()
                pprint(f"Receive messages: {payload_}: {offset_}")

                message_values.append(payload_)
                keys.append(key_)
                partitions.append(partition_)
                offsets.append(offset_)

        except Exception as e:
            pprint(f"Error: {str(e)}")
        except KeyboardInterrupt:
            running = False
            print("Interrupted while polling messages, exiting...")
        finally:
            # Close the consumer before returning the collected records.
            c.close()
            return pd.DataFrame({
                "keys": keys,
                "lon_val": [v.split("\t-")[0] for v in message_values],
                "lat_val": [v.split("\t-")[1] for v in message_values],
                "partitions": partitions,
                "offsets": offsets
            })
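
A hedged usage sketch of Base_Consumer; the broker address, topic, and timeout values are placeholders:

consumer = Base_Consumer(topic="gps-positions",          # placeholder topic
                         bootstrap_server="localhost:9092",
                         sess_timeout=6000,
                         retries=3,
                         group_id="base-consumer-demo",
                         assign=True)

# Polls until an error or interrupt stops the loop, then returns the
# collected records as a DataFrame (keys, lon_val, lat_val, partitions, offsets).
df = consumer.receive_msgs(consumer.on_assign)
print(df.head())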
Exemplo n.º 17
def replicate(topic, rerun, delete, source, src_groupid, target, trg_groupid,
              trg_partitions):
    global source_partitions

    # Connect to source kafka cluster
    src = Consumer({
        'bootstrap.servers': source,
        'group.id': src_groupid,
        'auto.offset.reset': 'smallest',
        'enable.auto.commit': False
    })

    # Connect to target kafka cluster
    trg = Consumer({
        'bootstrap.servers': target,
        'group.id': trg_groupid,
    })

    admin_client = KafkaAdminClient(bootstrap_servers=TRG_BOOTSTRAP_SERVERS,
                                    client_id=TRG_GROUP_ID)

    if delete:
        logger.warning(
            f"DELETING topic {topic} on {TRG_BOOTSTRAP_SERVERS} as requested")
        admin_client.delete_topics([topic])
        logger.warning(f"DELETION of {topic} completed.")

    logger.info(f"source cluster: {source}  source group_id: {src_groupid}")
    logger.info(f"target cluster: {target}  target group_id: {trg_groupid}")

    # Determine if latest source topic is at least partially loaded to target
    trg_topics, the_topic, offset_sum_delta = determine_topic(
        topic, src, trg, rerun)

    src_cm = src.list_topics()  # returns ClusterMetadata
    if the_topic not in src_cm.topics:
        logger.error(
            f"Current topics in {source} with group id {src_groupid} are:")
        logger.error(f"{src_cm.topics}")
        logger.error(
            f"Topic {topic} not in cluster {source} with group id {src_groupid}"
        )
        sys.exit(1)

    src_partition_count = len(src_cm.topics[the_topic].partitions)

    logger.info(
        f"topic: {the_topic} has # of partitions: {src_partition_count}")
    # Calculate multiplier for demuxing
    # Example:
    #    source = 4 target = 9 then multiplier is 9/4=2.25
    #    int(2.25) = 2
    multiplier = int(trg_partitions / src_partition_count)
    trg_partition_count = src_partition_count * multiplier
    logger.info(
        f"multiplier={multiplier} target_partition_count={trg_partition_count}"
    )

    # Add the new topic in target cluster
    if the_topic not in trg_topics:
        logger.info(
            f"replicate {the_topic} to {TRG_BOOTSTRAP_SERVERS} with source group id: {src_groupid}"
        )

        topic_list = [
            NewTopic(name=the_topic,
                     num_partitions=trg_partition_count,
                     replication_factor=1)
        ]
        try:
            logger.info(
                f"Creating topic {the_topic} with {trg_partition_count} partitions"
            )
            admin_client.create_topics(new_topics=topic_list,
                                       validate_only=False)
        except kafka.errors.TopicAlreadyExistsError:
            logger.info(f"Topic already exists in {TRG_BOOTSTRAP_SERVERS} ")
    part_map = create_part_map(src_partition_count, multiplier)

    # Get offset status for each partition
    logger.info(f"Source broker partitions for topic {the_topic}")
    logger.info(
        "-------------------------------------------------------------------------"
    )
    parts = {}
    total_committed = 0
    total_offsets = 0

    for part in src_cm.topics[the_topic].partitions:
        tp = TopicPartition(the_topic, part)
        tp.offset = confluent_kafka.OFFSET_BEGINNING
        src.assign([tp])
        any_committed = src.committed([tp])
        committed = any_committed[0].offset
        total_committed += committed
        end_offset = src.get_watermark_offsets(tp, cached=False)[1]
        position = src.position([tp])[0].offset
        if position == confluent_kafka.OFFSET_BEGINNING:
            position = 0
        elif position == confluent_kafka.OFFSET_END:
            position = end_offset
        elif position == confluent_kafka.OFFSET_INVALID:
            position = 0

        parts[str(part)] = end_offset
        total_offsets += end_offset
        logger.info(
            "Source topic: %s partition: %s end offset: %s committed: %s position: %s lag: %s"
            % (the_topic, part, end_offset, committed, position,
               (position - committed)))

    src.close()
    logger.info(
        f"Source: total_committed={total_committed} total_offsets={total_offsets}"
    )
    logger.info(
        "========================================================================="
    )

    logger.info(
        f"Starting multi-process: the_topic={the_topic} rerun={rerun} src_partition_count={src_partition_count}"
    )
    procs = [
        mp.Process(target=proc_replicate,
                   args=(the_topic, part, parts[str(part)], part_map, rerun))
        for part in range(0, src_partition_count)
    ]

    for proc in procs:
        proc.start()
    for proc in procs:
        proc.join()

    logger.info(f"END")
Exemplo n.º 18
class OffsetTranslator():
    """Translates consumer group offsets as part of a migration to a new cluster.
  Given a consumer group, source and destination cluster, it will find the topics
  involved in the consumer group and the committed offsets.
  For there it uses OffsetsForTimes() to find the offset for a message with an equal
  or greater time in the destination cluster and compares a hash of the message value
  to confirm if the offset relates to the same message. If not, it advances the timestamp
  by one millisecond and finds the next offset - this becomes the range of offsets it
  will traverse over to find a matching hash.
  If there were no more recent timestamps on the topic partition, it will call
  getWatermarkOffsets() to get the last offset and traverse accordingly.
  If the number of messages to traverse is stupidly large (currently set at 500) it throws
  a warning.
  There is every possibility that the message simply doesn't exist, in which case 
  it will throw an exception.
  """
    def __init__(self, src_bootstrap_server, src_group_id, src_topic,
                 dest_bootstrap_server, dest_group_id):

        self._admin = AdminClient({"bootstrap.servers": src_bootstrap_server})

        # For reading offsets/messages in the source cluster
        self._consumer = Consumer({
            "bootstrap.servers": src_bootstrap_server,
            "group.id": src_group_id,
            "enable.auto.commit": "false"
        })

        # For reading offsets/messages in the destination cluster
        self._dest_consumer = Consumer({
            "bootstrap.servers": dest_bootstrap_server,
            "group.id": dest_group_id,
            "enable.auto.commit": "false"
        })

        # Handy instance variables
        self._src_group_id = src_group_id
        self._src_topic = src_topic
        self._src_bootstrap_servers = src_bootstrap_server
        self._dest_group_id = dest_group_id
        self._dest_bootstrap_servers = dest_bootstrap_server
        self._metadata = defaultdict(dict)

        self.logger = logging.getLogger('translator')
        self.logger.info("Offset Translator object instantiated.")
        self.logger.info(
            f"  Source bootstrap servers: {self._src_bootstrap_servers}")
        self.logger.info(
            f"  Destination  bootstrap servers: {self._src_bootstrap_servers}")
        self.logger.info(f"  Consumer group: {self._src_group_id}")

    def metadataKeyFromTPO(self, tpo):
        """Return a string key from TopicPartition object for use in metadata hash
    """
        return f"{tpo.topic}::{tpo.partition}"

    def buildMetadataMap(self, tpos):
        """Use TopicPartition data to build internal metadata hash for comparing offsets, timestamps etc between source
       and destination clusters.
    """

        self.logger.info(f"Building metadata map...")

        for tpo in tpos:
            key = self.metadataKeyFromTPO(tpo)
            self._metadata[key] = {
                "src_offset": tpo.offset,
                "src_timestamp": 0,
                "src_hash": None,
                "src_tpo": tpo,
                "src_message": None,
                "dest_offset": None,
                "dest_timestamp": None,
                "dest_hash": None,
                "dest_tpo": None,
                "dest_message": None
            }

        self.logger.info(f"Built metadata for {len(tpos)} TPOs")
        return self._metadata

    def getTPOs(self, topics):
        """Use the AdminAPI to return a list of TopicParition objects for a list of topics
    """

        self.logger.info(
            f"Getting TPOs for {len(topics)} topics via admin API...")
        tpos = []
        for t in topics:
            for p in self._admin.list_topics(t).topics[t].partitions:
                tpos.append(TopicPartition(t, p))

        self.logger.info(f"Found {len(tpos)} TPOs for {len(topics)} topics.")
        return tpos

    def updateMetadata(self, metadata):
        """Takes output of inspectTPOMessages() and updates metadata.
    We don't do this automatically within inspectTPOMessages, as we may want
    to use inspectTPOMessages on the destination cluster and compare to the 
    source, so updating the object's metadata would render that useless.
    """

        self.logger.info("Updating metadata...")
        for key in metadata.keys():
            for inner_key in metadata[key]:
                self._metadata[key][inner_key] = metadata[key][inner_key]

        # Grab the first key and check if it relates to src_ or dest_ data..
        sample = metadata[next(iter(metadata.keys()))]
        if 'src_offset' in sample.keys():
            cluster = "source"
        elif 'dest_offset' in sample.keys():
            cluster = "destination"
        else:
            raise Exception(
                "Metadata doesn't clearly indicate which cluster it is from.. no src_offset or dest_offset key present..."
            )

        self.logger.info(
            f"{len(metadata)} updates to metadata from {cluster} cluster.")
        return self._metadata

    def inspectTPOMessages(self, tpos, cluster="source"):
        """ Given a list of TopicPartition objects, for each partition read the message at the
      required offset and extract the timestamp, hash the message value
      """

        self.logger.info(f"Inspecting {len(tpos)} TPOs in {cluster} cluster.")

        # Default to the source cluster consumer; we will also use this
        # to inspect destination cluster messages
        if cluster == "source":
            consumer = self._consumer
        elif cluster == "destination":
            consumer = self._dest_consumer
        else:
            raise Exception(
                "cluster argument to inspectTPOMessages must be one of 'source' or 'destination'"
            )

        circuit_breaker_retry_count = 0

        metadata = defaultdict(dict)

        # This seems a slow way to just read one message at a time from a partition, but I'm not aware
        # of a better way of reading a single message for each partition when there may be further messages
        # on the partition.
        for tpo in tpos:

            # If the tpo.offset is < 0, then the consumer hasn't read anything
            # from the topic partition, so skip it.
            if tpo.offset < 0:
                continue

            consumer.assign([tpo])

            while True:
                # Poll for data on this specific TopicPartition
                m = consumer.poll(1)
                if m is None:
                    circuit_breaker_retry_count += 1
                    if circuit_breaker_retry_count > 10:
                        print(
                            "Too many iterations polling for data and getting nothing."
                        )
                        break
                    else:
                        continue
                elif m.error() is None:
                    # We'll build a local copy of metadata
                    md = {}
                    if cluster == "source":
                        md['src_offset'] = m.offset()
                        md['src_timestamp'] = m.timestamp()[1]
                        md['src_hash'] = self.sha256Object(m.value())
                        md['src_tpo'] = tpo
                        md['src_message'] = m
                    elif cluster == "destination":
                        md['dest_offset'] = m.offset()
                        md['dest_timestamp'] = m.timestamp()[1]
                        md['dest_hash'] = self.sha256Object(m.value())
                        md['dest_tpo'] = tpo
                        md['dest_message'] = m

                    key = self.metadataKeyFromTPO(tpo)
                    metadata[key] = md
                    circuit_breaker_retry_count = 0

                    # Break the while loop, we've got our data for this topic/partition
                    break
                else:
                    raise Exception(
                        f"Error reading offset {tpo.offset} from {tpo.topic}/{tpo.partition}: {m.error()}"
                    )

        self.logger.info(f"Returning metadata for {len(metadata)} TPOs")
        return metadata

    def sha256Object(self, obj):
        """Return the sha256 digest for a supplied object"""
        return hashlib.sha256(bytes(obj)).hexdigest()

    def getTPOsByTime(self, metadata=None):
        """ Build a list of TopicPartitions using message timestamps instead of offsets
    """

        if metadata is None:
            metadata = self._metadata

        self.logger.info(
            f"Getting offsets from timestamps for {len(metadata)} metadata entries.."
        )

        tpos_by_time = list()
        for key in metadata.keys():
            md = self._metadata[key]
            if md['src_timestamp'] > 0:
                tpo = md['src_tpo']
                tpos_by_time.append(
                    TopicPartition(tpo.topic, tpo.partition,
                                   md['src_timestamp']))

        # This returns the earliest offset for a given timestamp
        tpos = self._dest_consumer.offsets_for_times(tpos_by_time)

        # Check for errors
        for t in [t for t in tpos if t.error is not None]:
            raise Exception(
                f"Error getting offset from timestamp: Topic {t.topic}, Partition {t.partition}, Offset {t.offset}: Error {t.error}"
            )

        self.logger.info(
            f"Returning {len(tpos)} offsets from destination cluster.")
        return tpos

    def findMatchingMessages(self):
        """Iterate over metadata and find matching source/destination messages and
    separate into matched / unmatched buckets, returning a tuple
    """

        self.logger.info(
            "Searching for destination messages that match via message hash..."
        )

        # Iterate over the source cluster metadata and compare to destination cluster
        translated_offsets = list()
        unmatched_offsets = list()

        for key in self._metadata.keys():
            metadata = self._metadata[key]
            src_tpo = metadata['src_tpo']
            dest_message = metadata['dest_message']
            dest_timestamp = metadata['dest_timestamp']
            dest_tpo = metadata['dest_tpo']

            self.logger.info(
                f"  Working with TopicPartition({src_tpo.topic},{src_tpo.partition},{src_tpo.offset}) @ {metadata['src_timestamp']}"
            )

            # We found the destination cluster message by offsets_for_times and compared hashes
            # If they match, then the destination offset
            if metadata['src_hash'] == metadata['dest_hash']:
                self.logger.info(
                    f"   FOUND:      TopicPartition({dest_tpo.topic},{dest_tpo.partition},{dest_tpo.offset}) @ {dest_timestamp} in destination cluster"
                )
                self._metadata[key]['matched'] = True
                translated_offsets.append(dest_tpo)
            else:
                self.logger.info(
                    f"   NOT FOUND:  TopicPartition({dest_tpo.topic},{dest_tpo.partition},{dest_tpo.offset}) @ {dest_timestamp} does not have same hash."
                )
                self.logger.info(
                    f"   will traverse messages and attempt to find a match.")
                self._metadata[key]['matched'] = False
                unmatched_offsets.append(metadata)

        self.logger.info(
            f"Found {len(translated_offsets)} matching offsets and {len(unmatched_offsets)} that don't match."
        )
        return (translated_offsets, unmatched_offsets)

    def findOffsetRangeToScan(self, md):
        """Using a metadata record as a base, identify how many records (maximum) to scan through to find a match

    We are here because we didn't find a match for source cluster timestamp, which means it is either not there, or 
    multiple messages were produced during that millisecond and our offsets_for_times() call provided the lowest offset 
    for that millisecond.  We will add 1 ms to the timestamp and get the offset (if possible) and then iterate over 
    each message and compare hashes to determine what the exact offset should be.
    """

        self.logger.info(
            "Find the start/end offsets to iterate over to find a match based on message value hash."
        )

        timestamp_end = md['src_timestamp'] + 1  # add one millisecond
        tpo = md['dest_tpo']
        starting_offset = md['dest_offset']

        end_offset = self._dest_consumer.offsets_for_times(
            [TopicPartition(tpo.topic, tpo.partition, timestamp_end)])
        self.logger.info(
            f"Shifting timestamp by 1ms, from {md['src_timestamp']} to {timestamp_end}"
        )
        self.logger.info(
            f"                           yields an offset of {end_offset[0]}")

        target_offset = -1
        if end_offset[0].offset == -1:
            # There are no more recent timestamps for the topic/partition
            # Set the ending offset at the end of partition
            low, high = self._dest_consumer.get_watermark_offsets(
                TopicPartition(tpo.topic, tpo.partition))
            target_offset = high
            self.logger.info(
                f"Reading to end of the partition... {target_offset}")
            if target_offset - tpo.offset > 500:
                self.logger.warning(
                    f"    Note: that involves reading and hashing {target_offset - tpo.offet} messages.. might take some time."
                )
        else:
            # There was a more recent timestamped message, so we'll use that as our target offset
            target_offset = end_offset[0].offset

        self.logger.info(
            f"Starting offset for scan is {starting_offset} (inclusive)")
        self.logger.info(
            f"Ending   offset for scan is {target_offset} (exclusive)")

        return (starting_offset, target_offset)

    def compareOffsets(self):
        """For the list of tpos in the source cluster, look them up in the destination
    and compare value hashes; if they match all good; if not, iterate over records
    until a match is found (where duration is one millisecond, based on the 
    assumption that multiple messages have been produced during the same millisecond)
    """

        self.logger.info(
            "Comparing offsets between source and destination cluster...")

        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(self._metadata)

        # Check that we have destination cluster offsets and hashes before proceeding - if not, we
        # have incomplete data and should explode into a ball of flames to the sound of a distorted
        # sitar being played backwards.
        counter = 0
        for k in self._metadata.keys():
            if self._metadata[k]['dest_hash'] is None or    \
               self._metadata[k]['dest_offset'] is None or  \
               self._metadata[k]['src_hash'] is None:
                counter += 1

        if counter > 0:
            raise Exception(
                f"{counter} out of {len(self._metadata)} topic partitions have insufficient data. Exiting."
            )

        translated_offsets, unmatched_offsets = self.findMatchingMessages()

        self.logger.info("Working on unmatched offsets...")

        messages_found = 0
        for md in unmatched_offsets:
            tpo = md['dest_tpo']
            (starting_offset, target_offset) = self.findOffsetRangeToScan(md)

            for offset in range(starting_offset, target_offset):
                self.logger.info(
                    f"Inspecting destination cluster message at offset {offset}..."
                )
                results = self.inspectTPOMessages(
                    [TopicPartition(tpo.topic, tpo.partition, offset)],
                    cluster="destination")
                if len(results) == 0:
                    raise Exception(
                        "Didn't get any metadata from call to inspectTPOMessages(). This implies we read data from the source cluster, but couldn't inspect any messages in the destination cluster. Stopping."
                    )
                elif len(results) > 1:
                    raise Exception(
                        f"Expecting only one result from call to inspectTPOMessages, but got {len(results)}. Stopping"
                    )
                else:
                    # Get the (only) key from the dict
                    key = next(iter(results))
                    dest_hash = results[key]['dest_hash']
                    dest_tpo = results[key]['dest_tpo']
                    dest_message = results[key]['dest_message']

                    if dest_hash == md['src_hash']:
                        self.logger.info("   FOUND matching record: ")
                        self.logger.info(
                            f"                         source hash was {md['src_hash']}, and"
                        )
                        self.logger.info(
                            f"                         dest_hash is    {dest_hash}"
                        )
                        self.logger.info(
                            f".                        destination     {dest_tpo}"
                        )
                        self._metadata[key]['matched'] = True

                        # Update our metadata to accurately reflect the correct destination message
                        self._metadata[key][
                            'dest_offset'] = dest_message.offset()
                        self._metadata[key]['dest_hash'] = dest_hash
                        self._metadata[key][
                            'dest_timestamp'] = dest_message.timestamp()[1]
                        self._metadata[key]['dest_tpo'] = dest_tpo
                        self._metadata[key]['dest_message'] = dest_message

                        translated_offsets.append(dest_tpo)
                        messages_found += 1

                        # Found it so stop iterating
                        break

        self.logger.info(
            f"Found {messages_found} out of {len(unmatched_offsets)} unmatched objects."
        )
        # Sort the offset map by partition number, which may have become out of
        # order if we needed to read and hash messages to find a hash match
        return sorted(translated_offsets, key=lambda k: k.partition)

    def getMetadata(self):
        """Return our offset metadata object"""
        return self._metadata

    def getMessage(self, consumer, tpo):
        """Read a message at a tpo, return it"""
        consumer.assign([tpo])
        res = consumer.consume(num_messages=1, timeout=3)
        if len(res) == 1:
            return res[0]
        else:
            return None

    def commitTranslatedOffsets(self, tpos):
        """Given a list of TopicPartition objects, set the consumer group offsets"""

        self.logger.info("Committing offsets for supplied TPOs...")

        # Our offsets have been the last message consumed; need to set all offsets to +1
        # so that they represent the next message to consume.
        for t in tpos:
            t.offset += 1

        self.logger.info(
            " TPO offsets are incremented by one so that next message consumed is correct."
        )

        errored_commits = list()
        retries = 3
        while retries > 0:
            self.logger.info(
                f" Calling commit() for {len(tpos)} topic/partitions to destination cluster."
            )
            committed = self._dest_consumer.commit(offsets=tpos,
                                                   asynchronous=False)

            errored_commits = [t for t in committed if t.error is not None]

            if len(errored_commits) > 0:
                self.logger.warning("  Errors committing offsets:")
                for t in errored_commits:
                    self.logger.info(
                        f"       Partition({t.partition}), Offset({t.offset}): {t.error}"
                    )
                self.logger.info("  Trying again in 2 seconds...")
                time.sleep(2)
                tpos = errored_commits
                retries -= 1
            else:
                self.logger.info(
                    "Offsets committed successfully to destination cluster")
                break

        if len(errored_commits) > 0:
            self.logger.warning("Still had errors after 3 tries:")
            for t in errored_commits:
                self.logger.info(
                    f"     Partition({t.partition}), Offset({t.offset}): {t.error}"
                )
            self.logger.info("Returning with a job not finished!!")

        return committed

    def printMetadata(self, metadata=None):
        if metadata is None:
            metadata = self._metadata

        #print("================================================================================")
        #print("================================================================================")
        #print("================================================================================")
        #pp = pprint.PrettyPrinter(indent=4)
        #pp.pprint(metadata)
        #print("================================================================================")
        #print("================================================================================")
        #print("================================================================================")

        topic = None
        for key in metadata.keys():
            md = metadata[key]
            tpo = md['src_tpo']

            if tpo.topic != topic:
                topic = tpo.topic
                self.logger.info(f"topic: {tpo.topic}:")

            src_offset = md['src_offset']
            src_timestamp = md['src_timestamp']
            src_hash = md['src_hash']

            # We might be passed a metadata object that doesn't set dest_* fields
            if 'dest_tpo' in md:
                if md['dest_tpo'] is not None:
                    dest_offset = md['dest_tpo'].offset
                else:
                    dest_offset = ''
            else:
                dest_offset = ''

            if 'dest_message' in md:
                if md['dest_message'] is not None:
                    dest_timestamp = md['dest_message'].timestamp()[1]
                else:
                    dest_timestamp = ''
            else:
                dest_timestamp = ''

            if 'dest_hash' in md:
                dest_hash = md['dest_hash']
            else:
                dest_hash = ''

            self.logger.info(f"  p[{tpo.partition:1}]")
            self.logger.info(
                f"     source       last message offset ({src_offset:1}), timestamp({src_timestamp:12}), hash({src_hash})"
            )
            self.logger.info(
                f"     destination  last message offset ({dest_offset:1}), timestamp({dest_timestamp:12}), hash({dest_hash})"
            )

            #if 'src_message' in md and md['src_message'] is not None:
            #  pp.pprint(str(md['src_message'].value(),'utf-8'))
            #if 'dest_message' in md and md['dest_message'] is not None:
            #  pp.pprint(str(md['dest_message'].value(),'utf-8'))
            #print("<<<<<< DONE")

    def getConsumerGroupOffsets(self, topics):
        """Return the latest offset for the consumer group defined at
    object initialisation time.
    Moves offset by -1 so that we can re-read the last message consumed.
    """

        self.logger.info(
            f"Getting consumer group offsets for {len(topics)} topics...")

        tpos = self.getTPOs(topics)
        tpos = self._consumer.committed(tpos)

        self.logger.info(
            "  Decrementing offsets so that we can inspect the last message consumed (for hashing, timestamps, etc)"
        )
        # Wind back one offset so that we can re-read the messages
        for t in tpos:
            t.offset -= 1

        self.logger.info(f"Found offsets for {len(tpos)} topic partitions.")
        return tpos

    def allOffsetsMatched(self):
        """Test that all metadata has a matched == True value """
        self.logger.info(
            "Checking that all metadata records were matched in the destination cluster..."
        )
        for md in self._metadata:
            if self._metadata[md]['matched'] == False:
                self.logger.info("Unmatched metadata records found.")
                return False
        self.logger.info("All metadata was matched.")
        return True

    def findTopicsForConsumerGroup(self, cg=None):
        """Given a consumer group name, Find the topics associated with the consumer group.
    We use the shell because the confluent_kafka_python package doesn't yet provide this,
    see: https://github.com/confluentinc/confluent-kafka-python/issues/223
    """

        self.logger.info(
            f"Finding topics associated with {self._src_group_id}...")

        # Test that we have a kafka-consumer-groups handy...
        if subprocess.run(['which', 'kafka-consumer-groups']).returncode == 1:
            raise OSError("No 'kafka-consumer-groups' command found in $PATH")

        if cg is None:
            cg = self._src_group_id

        cmd = f"kafka-consumer-groups --bootstrap-server {self._src_bootstrap_servers} --describe --group {cg}  2>/dev/null| grep {cg} | grep -v 'Error: Consumer group '| awk '{{print $2}}' | sort -u"
        self.logger.info(f"Running {cmd}")
        res = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE)

        cg_topics = list()
        for topic in str(res.stdout, 'utf-8').split('\n'):
            if topic != '':
                cg_topics.append(topic)

        print(f">>>>>>{cg_topics}>>>>>")
        if len(cg_topics) == 0:
            raise Exception(
                f"No topics found for consumer group {cg}. Nothing to do. Stopping."
            )

        # If we were configured to run for just one topic in a CG; then return just that topic,
        # but only if it exists in the CG
        if self._src_topic is not None:
            if self._src_topic in cg_topics:
                self.logger.info(
                    "Overriding topic list from CG tool with supplied topic.")
                cg_topics = [self._src_topic]
            else:
                raise Exception(
                    f"{self._src_topic} is not associated with {cg}. Stopping."
                )

        self.logger.info(f"Returning {cg_topics}...")
        return (cg_topics)
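
A hedged driver sketch chaining the methods above in the order the class docstring describes; bootstrap servers and group ids are placeholders:

ot = OffsetTranslator(src_bootstrap_server="src-broker:9092",
                      src_group_id="my-group",
                      src_topic=None,
                      dest_bootstrap_server="dest-broker:9092",
                      dest_group_id="my-group")

topics = ot.findTopicsForConsumerGroup()
tpos = ot.getConsumerGroupOffsets(topics)
ot.buildMetadataMap(tpos)

# Hash the last consumed message per partition in the source cluster...
ot.updateMetadata(ot.inspectTPOMessages(tpos, cluster="source"))

# ...look up candidate offsets by timestamp in the destination cluster...
dest_tpos = ot.getTPOsByTime()
ot.updateMetadata(ot.inspectTPOMessages(dest_tpos, cluster="destination"))

# ...then resolve any hash mismatches and commit the translated offsets.
translated = ot.compareOffsets()
if ot.allOffsetsMatched():
    ot.commitTranslatedOffsets(translated)
ot.printMetadata()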
Exemplo n.º 19
def get_metrics_for_topic(consumer: Consumer, topic_name: str) -> list:
    # get topic metadata for topic name
    metadata = consumer.list_topics(topic=topic_name, timeout=10)
    committed_partitions = consumer.committed(get_partitions_for_topics(metadata), timeout=10)
    metrics = get_metrics_for_partitions(consumer, committed_partitions)
    return metrics
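
get_partitions_for_topics() and get_metrics_for_partitions() are not shown; a hypothetical sketch of the first helper, assuming it expands the ClusterMetadata returned by list_topics() into TopicPartition objects:

from confluent_kafka import TopicPartition

def get_partitions_for_topics(metadata) -> list:
    # Hypothetical helper: one TopicPartition per partition of every topic
    # in the ClusterMetadata object.
    return [
        TopicPartition(topic_name, partition_id)
        for topic_name, topic_md in metadata.topics.items()
        for partition_id in topic_md.partitions
    ]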
Exemplo n.º 20
class KafkaConsumer(object):
    """
    Consumer: can consume multiple topics, but cannot reset offsets of multiple topics at the same time
    """
    def __init__(self,
                 topic,
                 cfg,
                 cid=None,
                 logger=None,
                 normal=True,
                 debug=False,
                 **kwargs):
        """
        :param topic: [(topic, partition)]
        :param cfg: shared configuration
        :param cid: id of this consumer
        :param logger: externally supplied logger instance
        :param normal: consumer mode
        :param kwargs: mainly used to pass internal parameters
                  auto_commit: whether to auto-commit, True/False
                  block: whether to block when fetching, defaults to False
        """
        self._cfg = dict(
            cfg,
            **{
                'enable.auto.commit': True,
                'auto.commit.interval.ms': 1000,
                'fetch.min.bytes': 1024 * 1024,  # how much data to fetch at once
                'fetch.wait.max.ms': 1000,  # how long to wait to fill a fetch
                'fetch.message.max.bytes': 1048576,  # max size of a message batch
                'on_commit': self._on_commit,
                # 'offset.store.method': 'broker',
                # 'enable.auto.offset.store': True,
                'default.topic.config': {
                    'auto.offset.reset': 'earliest',
                },
                'debug': ','.join([cfg.get('debug', ''), 'cgrp,topic,fetch']),
            })

        if debug is False:
            del self._cfg['debug']

        self._cfg['enable.auto.commit'] = kwargs.get('auto_commit', True)

        self._id = cid  # current consumer id
        self._topic = topic  # topics currently subscribed to
        self._create_time = time.time()  # creation time

        self._block = kwargs.get('block', False)  # whether to block when fetching

        self._logger = logger
        self._start = True
        self._normal = normal

        self._start_offset = defaultdict(dict)  # first offset consumed
        self._end_offset = defaultdict(dict)  # last offset consumed
        self._total_offset = None  # offset range at startup
        self._ori_offset = None  # committed offsets at startup

        self._consumer = Consumer(**self._cfg)

        if self._normal:
            self._total_offset = self.total_offset(self._topic)
            self._logger.info("total offset >>> \n{}".format(
                KafkaConsumer._convert_to_show(self._total_offset)))
            self._ori_offset = self.current_offset()
            self._logger.info("current offset >>> \n{}".format(
                KafkaConsumer._convert_to_show(self._ori_offset)))
            self._consumer.subscribe(self._topic,
                                     on_assign=self._on_assign,
                                     on_revoke=self._on_revoke)

    # FIXME: unclear when this is called
    def _on_assign(self, c, ps):
        # print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!assign!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
        # print(c, ps)
        pass

    # FIXME: unclear when this is called
    def _on_revoke(self, c, ps):
        # print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!revoke!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
        # print(c, ps)
        pass

    def _on_commit(self, err, partitions):
        pass

    def reset_offset(self, offsets):
        """
        Currently offsets are reset per topic: all partitions of a topic are reset together
        :param offsets:  dict
        :return:
        """
        _range = dict()

        def get_value(topic, partition, offset, p_offset=_range):
            """
            Resolve and parse the offset value
            :param topic:
            :param partition:
            :param offset:
            :param p_offset:
            :return:
            """
            # FIXME: commit() does not officially support OFFSET_BEGINNING / OFFSET_END, use total_offset instead
            if offset in ('min', 'max'):
                if topic not in p_offset or partition not in p_offset[topic]:
                    _total = self.total_offset([topic])
                    p_offset.update(dict(p_offset, **_total))
                    if _total is None:
                        return None
                return p_offset[topic][partition][
                    0] if offset == 'min' else p_offset[topic][partition][1]
                # return confluent_kafka.OFFSET_BEGINNING if para == 'min' \
                #                                         else confluent_kafka.OFFSET_END
            if isinstance(offset, int) and offset >= 0:
                return offset

            self._logger.warning(
                'unknown reset value: {}, will not reset offset'.format(
                    offset))
            return None

        if isinstance(offsets, dict):
            assigns = [
                TopicPartition(_topic, 0, get_value(_topic, 0, _offset))
                for _topic, _offset in offsets.items()
                if get_value(_topic, 0, _offset) is not None
            ]
        elif isinstance(offsets, list):
            assigns = [
                TopicPartition(_item[0], _item[1], get_value(*_item))
                for _item in offsets if get_value(*_item) is not None
            ]
        else:
            self._logger.warning(
                'unknown type: {} for param[offsets], will not reset offset'.
                format(type(offsets).__name__))
            return False

        if assigns:
            [
                self._logger.warning(
                    'RESET offset to {0} for topic {1}({2})'.format(
                        _t.offset, _t.topic, _t.partition)) for _t in assigns
            ]
            return self.commit(assigns, if_format=True)

        return not offsets

    def get(self):
        """

        :return: 如果有结果返回dict {
           'topic': xx,
           'partition': xx,
           'offset': xx,
           'tm':
           'data': xx
        },
        没有结果返回None,出错返回-1
        """
        if self._block:
            result = self._consumer.poll()
        else:
            result = self._consumer.poll(timeout=0.00001)

        if not result:
            return None

        if result.error():
            if result.error().code() == KafkaError._PARTITION_EOF:
                self._logger.warning(
                    '{} [{}] reached end at offset {}\n'.format(
                        result.topic(), result.partition(), result.offset()))
            else:
                self._logger.error('encountered error:\n{}'.format(
                    json.dumps(
                        {
                            'name': result.error().name(),
                            'code': result.error().code(),
                            'description': result.error().str()
                        },
                        indent=1)))

            return None

        # Record the offset of the consumed message
        self._restore_offset_info(result)

        return KafkaMessage(
            topic=result.topic(),
            partition=result.partition(),
            offset=result.offset(),
            timestamp=result.timestamp()[1]
            if result.timestamp()[0] != confluent_kafka.TIMESTAMP_NOT_AVAILABLE
            else None,
            value=result.value())
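
    # Usage sketch for get() (hypothetical; assumes an instance `kc` that has
    # already subscribed, and a user-supplied handle() function):
    #
    #     while True:
    #         msg = kc.get()
    #         if msg is None:
    #             continue              # nothing to consume, or a transient error
    #         handle(msg.value)         # fields: topic, partition, offset, timestamp, value
    #         kc.commit()               # commit the last consumed offsets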

    @classmethod
    def _convert_to_show(cls, data, show=True):
        """
        将格式转换成展示的格式
        :param data:
        :return:
        """
        result = {
            i + '(' + str(k) + ')': str(l)
            for i, j in data.items() for k, l in j.items()
        }

        return json.dumps(result, indent=1) if show else result
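
    # Illustrative example of the flattening performed by _convert_to_show:
    #
    #     {'test': {0: (10, 55), 1: (0, 12)}}
    #       -> {'test(0)': '(10, 55)', 'test(1)': '(0, 12)'}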

    def _restore_offset_info(self, msg):
        """
        记录消费的信息
        :param msg:
        :return:
        """
        if msg.topic() not in self._start_offset or \
                msg.partition() not in self._start_offset[msg.topic()]:
            self._start_offset[msg.topic()][msg.partition()] = msg.offset()
        self._end_offset[msg.topic()][msg.partition()] = msg.offset()

    def commit(self, offsets=None, if_format=False):
        """
        手动提交, 提交最后一个消费的消息, 或者提交指定的offset
        :return:
        """
        if if_format:
            topics = offsets
        else:
            src = self._end_offset if offsets is None else offsets
            topics = [
                TopicPartition(i, k, m) for i, j in src.items()
                for k, m in j.items()
            ]

        for _ in range(3):
            try:
                self._consumer.commit(offsets=topics, asynchronous=False)
                return True
            except (confluent_kafka.KafkaException, ) as e:
                exp_code = e.args[0].code()
                if int(exp_code) == 27:  # REBALANCE_IN_PROGRESS
                    self._logger.warning(
                        'COMMIT: kafka server is now in rebalancing, will retry...'
                    )
                    time.sleep(1)
                    continue
                self._logger.error(
                    'COMMIT: commit offset failed with message: {0}({1}) >> {2}'
                    .format(e.args[0].name(), e.args[0].code(),
                            e.args[0].str()))
                return False

        self._logger.error('COMMIT: commit failed after 3 times retry')
        return False
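
    # Usage sketch for commit() (hypothetical; assumes an instance `kc`):
    #
    #     kc.commit()                                   # commit the last consumed offsets
    #     kc.commit({'test': {0: 100}})                 # commit offset 100 for test[0]
    #     kc.commit([TopicPartition('test', 0, 100)],   # same, with a pre-built TopicPartition
    #               if_format=True)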

    def current_offset(self):
        """
        获取topic目前消费的位置
        :return:
        """
        result = defaultdict(dict)

        if not self._total_offset:
            self._total_offset = self.total_offset(self._topic)
        _p_topics = [
            TopicPartition(_key, _sub_key, -1)
            for _key, _value in self._total_offset.items()
            for _sub_key in _value.keys()
        ]

        try:
            r = self._consumer.committed(_p_topics)
            for _d in r:
                result[_d.topic][_d.partition] = _d.offset
            return result
        except (confluent_kafka.KafkaException, ) as e:
            self._logger.error(
                'get current_offset failed with message: {0}({1}) >> {2}'.format(
                    e.args[0].name(), e.args[0].code(), e.args[0].str()))
            return None

    def total_offset(self, topics=None):
        """
        get smallest and biggest offset for specified topics
        :param topics: should be a list, exp: [topic,topic,topic]
        :return:
        """
        result = defaultdict(dict)

        for _topic in topics:
            try:
                for _n in range(100):
                    r = self._consumer.get_watermark_offsets(TopicPartition(
                        _topic, _n, -1),
                                                             timeout=30,
                                                             cached=False)
                    result[_topic][_n] = tuple(int(i) for i in r)
            except (confluent_kafka.KafkaException, ) as e:
                if e.args[0].name() == '_UNKNOWN_PARTITION':
                    # probed past the last partition of this topic; move on
                    continue
                raise

        return result
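
    # Usage sketch: estimating consumer lag per partition from current_offset()
    # and total_offset() (hypothetical; assumes an instance `kc` consuming
    # topic 'test', with a committed offset on every partition):
    #
    #     committed = kc.current_offset()          # {topic: {partition: committed_offset}}
    #     watermarks = kc.total_offset(['test'])   # {topic: {partition: (low, high)}}
    #     lag = {
    #         (t, p): watermarks[t][p][1] - committed[t][p]
    #         for t, parts in watermarks.items() for p in parts
    #     }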

    # FIXME: does not work
    def _commit_cb(self, err, reqs):
        self._logger.info('commit callback')
        self._logger.info(err)
        self._logger.info(reqs)

    def stop(self):
        if self._start:
            self._logger.debug('consumer stopped')
            self._consumer.close()
            self._start = False

            if self._normal:
                left, right = KafkaConsumer._convert_to_show(self._start_offset, show=False),\
                              KafkaConsumer._convert_to_show(self._end_offset, show=False)

                _out = {
                    i: ' - '.join((left[i], right[i]))
                    for i in right.keys()
                }

                self._logger.info(
                    'CONSUMER SUM UP:\n'
                    '-create time: {0}[{1}]\n'
                    '-consume offsets: \n{2}\n'.format(  # self._id,
                        self._create_time,
                        time.strftime('%Y-%m-%d %H:%M:%S',
                                      time.localtime(self._create_time)),
                        json.dumps(_out, indent=1),
                    ))

    def __del__(self):
        self.stop()