예제 #1
0
def test_basic_api():
    """ Basic API tests, these wont really do anything since there is no
        broker configured. """

    try:
        kc = Consumer()
    except TypeError as e:
        assert str(e) == "expected configuration dict"

    def dummy_commit_cb(err, partitions):
        pass

    kc = Consumer({
        'group.id': 'test',
        'socket.timeout.ms': '100',
        'session.timeout.ms': 1000,  # Avoid close() blocking too long
        'on_commit': dummy_commit_cb
    })

    kc.subscribe(["test"])
    kc.unsubscribe()

    def dummy_assign_revoke(consumer, partitions):
        pass

    kc.subscribe(["test"],
                 on_assign=dummy_assign_revoke,
                 on_revoke=dummy_assign_revoke)
    kc.unsubscribe()

    msg = kc.poll(timeout=0.001)
    if msg is None:
        print('OK: poll() timeout')
    elif msg.error():
        print('OK: consumer error: %s' % msg.error().str())
    else:
        print('OK: consumed message')

    if msg is not None:
        assert msg.timestamp() == (TIMESTAMP_NOT_AVAILABLE, -1)

    msglist = kc.consume(num_messages=10, timeout=0.001)
    assert len(msglist) == 0, "expected 0 messages, not %d" % len(msglist)

    with pytest.raises(ValueError) as ex:
        kc.consume(-100)
    assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value)

    with pytest.raises(ValueError) as ex:
        kc.consume(1000001)
    assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value)

    partitions = list(
        map(lambda part: TopicPartition("test", part), range(0, 100, 3)))
    kc.assign(partitions)

    with pytest.raises(KafkaException) as ex:
        kc.seek(TopicPartition("test", 0, 123))
    assert 'Erroneous state' in str(ex.value)

    # Verify assignment
    assignment = kc.assignment()
    assert partitions == assignment

    # Pause partitions
    kc.pause(partitions)

    # Resume partitions
    kc.resume(partitions)

    # Get cached watermarks, should all be invalid.
    lo, hi = kc.get_watermark_offsets(partitions[0], cached=True)
    assert lo == -1001 and hi == -1001
    assert lo == OFFSET_INVALID and hi == OFFSET_INVALID

    # Query broker for watermarks, should raise an exception.
    try:
        lo, hi = kc.get_watermark_offsets(partitions[0],
                                          timeout=0.5,
                                          cached=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._WAIT_COORD, KafkaError.LEADER_NOT_AVAILABLE),\
            str(e.args([0]))

    kc.unassign()

    kc.commit(asynchronous=True)

    try:
        kc.commit(asynchronous=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._NO_OFFSET)

    # Get current position, should all be invalid.
    kc.position(partitions)
    assert len([p for p in partitions
                if p.offset == OFFSET_INVALID]) == len(partitions)

    try:
        kc.committed(partitions, timeout=0.001)
    except KafkaException as e:
        assert e.args[0].code() == KafkaError._TIMED_OUT

    try:
        kc.list_topics(timeout=0.2)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._TRANSPORT)

    try:
        kc.list_topics(topic="hi", timeout=0.1)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._TRANSPORT)

    kc.close()
예제 #2
0
class SynchronizedConsumer:
    """
    This class implements the framework for a consumer that is intended to only
    consume messages that have already been consumed and committed by members
    of another consumer group.

    This works similarly to the Kafka built-in ``__consumer_offsets`` topic.
    The consumer group that is being "followed" (the one that must make
    progress for our consumer here to make progress, identified by the
    ``synchronize_commit_group`` constructor parameter/instance attribute) must
    report its offsets to a topic (identified by the ``commit_log_topic``
    constructor parameter/instance attribute). This consumer subscribes to both
    commit log topic, as well as the topic(s) that we are actually interested
    in consuming messages from. The messages received from the commit log topic
    control whether or not consumption from partitions belonging to the main
    topic is paused, resumed, or allowed to continue in its current state
    without changes.

    The furthest point in any partition that this consumer should ever consume
    to is the maximum offset that has been recorded to the commit log topic for
    that partition. If the offsets recorded to that topic move
    non-monotonically (due to an intentional offset rollback, for instance)
    this consumer *may* consume up to the highest watermark point. (The
    implementation here tries to pause consuming from the partition as soon as
    possible, but this makes no explicit guarantees about that behavior.)
    """

    initial_offset_reset_strategies = {
        "earliest": get_earliest_offset,
        "latest": get_latest_offset
    }

    def __init__(
        self,
        cluster_name,
        consumer_group,
        commit_log_topic,
        synchronize_commit_group,
        initial_offset_reset="latest",
        on_commit=None,
    ):
        self.cluster_name = cluster_name
        self.consumer_group = consumer_group
        self.commit_log_topic = commit_log_topic
        self.synchronize_commit_group = synchronize_commit_group
        self.initial_offset_reset = self.initial_offset_reset_strategies[
            initial_offset_reset]

        self.__partition_state_manager = SynchronizedPartitionStateManager(
            self.__on_partition_state_change)
        (
            self.__commit_log_consumer,
            self.__commit_log_consumer_stop_request,
        ) = self.__start_commit_log_consumer()

        self.__positions = {}

        def commit_callback(error, partitions):
            if on_commit is not None:
                return on_commit(error, partitions)

        consumer_configuration = kafka_config.get_kafka_consumer_cluster_options(
            cluster_name,
            override_params={
                "group.id": self.consumer_group,
                "enable.auto.commit": "false",
                "enable.auto.offset.store": "true",
                "enable.partition.eof": "false",
                "default.topic.config": {
                    "auto.offset.reset": "error"
                },
                "on_commit": commit_callback,
            },
        )

        self.__consumer = Consumer(consumer_configuration)

    def __start_commit_log_consumer(self, timeout=None):
        """
        Starts running the commit log consumer.
        """
        stop_request_event = threading.Event()
        start_event = threading.Event()
        result = execute(
            functools.partial(
                run_commit_log_consumer,
                cluster_name=self.cluster_name,
                consumer_group=f"{self.consumer_group}:sync:{uuid.uuid1().hex}",
                commit_log_topic=self.commit_log_topic,
                synchronize_commit_group=self.synchronize_commit_group,
                partition_state_manager=self.__partition_state_manager,
                start_event=start_event,
                stop_request_event=stop_request_event,
            ))
        start_event.wait(timeout)
        return result, stop_request_event

    def __check_commit_log_consumer_running(self):
        if not self.__commit_log_consumer.running():
            try:
                result = self.__commit_log_consumer.result(timeout=0)  # noqa
            except TimeoutError:
                pass  # not helpful

            raise Exception("Commit log consumer unexpectedly exit!")

    def __on_partition_state_change(self, topic, partition,
                                    previous_state_and_offsets,
                                    current_state_and_offsets):
        """
        Callback that is invoked when a partition state changes.
        """
        logger.debug(
            "State change for %r: %r to %r",
            (topic, partition),
            previous_state_and_offsets,
            current_state_and_offsets,
        )

        current_state, current_offsets = current_state_and_offsets
        if current_offsets.local is None:
            # It only makes sense to manipulate the consumer if we've got an
            # assignment. (This block should only be entered at startup if the
            # remote offsets are retrieved from the commit log before the local
            # consumer has received its assignment.)
            return

        # TODO: This will be called from the commit log consumer thread, so need
        # to verify that calling the ``consumer.{pause,resume}`` methods is
        # thread safe!
        if current_state in (
                SynchronizedPartitionState.UNKNOWN,
                SynchronizedPartitionState.SYNCHRONIZED,
                SynchronizedPartitionState.REMOTE_BEHIND,
        ):
            self.__consumer.pause(
                [TopicPartition(topic, partition, current_offsets.local)])
        elif current_state is SynchronizedPartitionState.LOCAL_BEHIND:
            self.__consumer.resume(
                [TopicPartition(topic, partition, current_offsets.local)])
        else:
            raise NotImplementedError(
                f"Unexpected partition state: {current_state}")

    def subscribe(self, topics, on_assign=None, on_revoke=None):
        """
        Subscribe to a topic.
        """
        self.__check_commit_log_consumer_running()

        def assignment_callback(consumer, assignment):
            # Since ``auto.offset.reset`` is set to ``error`` to force human
            # interaction on an offset reset, we have to explicitly specify the
            # starting offset if no offset has been committed for this topic during
            # the ``__consumer_offsets`` topic retention period.
            assignment = {(i.topic, i.partition): self.__positions.get(
                (i.topic, i.partition))
                          for i in assignment}

            for i in self.__consumer.committed([
                    TopicPartition(topic, partition)
                    for (topic, partition), offset in assignment.items()
                    if offset is None
            ]):
                k = (i.topic, i.partition)
                if i.offset > -1:
                    assignment[k] = i.offset
                else:
                    assignment[k] = self.initial_offset_reset(
                        consumer, i.topic, i.partition)

            self.__consumer.assign([
                TopicPartition(topic, partition, offset)
                for (topic, partition), offset in assignment.items()
            ])

            for (topic, partition), offset in assignment.items():
                # Setting the local offsets will either cause the partition to be
                # paused (if the remote offset is unknown or the local offset is
                # not trailing the remote offset) or resumed.
                self.__partition_state_manager.set_local_offset(
                    topic, partition, offset)
                self.__positions[(topic, partition)] = offset

            if on_assign is not None:
                on_assign(
                    self,
                    [
                        TopicPartition(topic, partition)
                        for topic, partition in assignment.keys()
                    ],
                )

        def revocation_callback(consumer, assignment):
            for item in assignment:
                # TODO: This should probably also be removed from the state manager.
                self.__positions.pop((item.topic, item.partition))

            if on_revoke is not None:
                on_revoke(self, assignment)

        self.__consumer.subscribe(topics,
                                  on_assign=assignment_callback,
                                  on_revoke=revocation_callback)

    def poll(self, timeout):
        self.__check_commit_log_consumer_running()

        message = self.__consumer.poll(timeout)
        if message is None:
            return

        if message.error() is not None:
            return message

        self.__partition_state_manager.validate_local_message(
            message.topic(), message.partition(), message.offset())
        self.__partition_state_manager.set_local_offset(
            message.topic(), message.partition(),
            message.offset() + 1)
        self.__positions[(message.topic(),
                          message.partition())] = message.offset() + 1

        return message

    def commit(self, *args, **kwargs):
        self.__check_commit_log_consumer_running()

        return self.__consumer.commit(*args, **kwargs)

    def close(self):
        self.__check_commit_log_consumer_running()

        self.__commit_log_consumer_stop_request.set()
        try:
            self.__consumer.close()
        finally:
            self.__commit_log_consumer.result()
예제 #3
0
파일: kafka.py 프로젝트: ruezetle/snuba
class KafkaConsumer(Consumer[TPayload]):
    """
    The behavior of this consumer differs slightly from the Confluent
    consumer during rebalancing operations. Whenever a partition is assigned
    to this consumer, offsets are *always* automatically reset to the
    committed offset for that partition (or if no offsets have been committed
    for that partition, the offset is reset in accordance with the
    ``auto.offset.reset`` configuration value.) This causes partitions that
    are maintained across a rebalance to have the same offset management
    behavior as a partition that is moved from one consumer to another. To
    prevent uncommitted messages from being consumed multiple times,
    ``commit`` should be called in the partition revocation callback.

    The behavior of ``auto.offset.reset`` also differs slightly from the
    Confluent consumer as well: offsets are only reset during initial
    assignment or subsequent rebalancing operations. Any other circumstances
    that would otherwise lead to preemptive offset reset (e.g. the consumer
    tries to read a message that is before the earliest offset, or the
    consumer attempts to read a message that is after the latest offset) will
    cause an exception to be thrown, rather than resetting the offset, as
    this could lead to chunks messages being replayed or skipped, depending
    on the circumstances. This also means that if the committed offset is no
    longer available (such as when reading older messages from the log and
    those messages expire, or reading newer messages from the log and the
    leader crashes and partition ownership fails over to an out-of-date
    replica), the consumer will fail-stop rather than reset to the value of
    ``auto.offset.reset``.
    """

    # Set of logical offsets that do not correspond to actual log positions.
    # These offsets should be considered an implementation detail of the Kafka
    # consumer and not used publically.
    # https://github.com/confluentinc/confluent-kafka-python/blob/443177e1c83d9b66ce30f5eb8775e062453a738b/tests/test_enums.py#L22-L25
    LOGICAL_OFFSETS = frozenset(
        [OFFSET_BEGINNING, OFFSET_END, OFFSET_STORED, OFFSET_INVALID])

    def __init__(
        self,
        configuration: Mapping[str, Any],
        codec: Codec[KafkaPayload, TPayload],
        *,
        commit_retry_policy: Optional[RetryPolicy] = None,
    ) -> None:
        if commit_retry_policy is None:
            commit_retry_policy = NoRetryPolicy()

        auto_offset_reset = configuration.get("auto.offset.reset", "largest")
        if auto_offset_reset in {"smallest", "earliest", "beginning"}:
            self.__resolve_partition_starting_offset = (
                self.__resolve_partition_offset_earliest)
        elif auto_offset_reset in {"largest", "latest", "end"}:
            self.__resolve_partition_starting_offset = (
                self.__resolve_partition_offset_latest)
        elif auto_offset_reset == "error":
            self.__resolve_partition_starting_offset = (
                self.__resolve_partition_offset_error)
        else:
            raise ValueError(
                "invalid value for 'auto.offset.reset' configuration")

        if (as_kafka_configuration_bool(
                configuration.get("enable.auto.commit", "true")) is not False):
            raise ValueError(
                "invalid value for 'enable.auto.commit' configuration")

        if (as_kafka_configuration_bool(
                configuration.get("enable.auto.offset.store", "true"))
                is not False):
            raise ValueError(
                "invalid value for 'enable.auto.offset.store' configuration")

        # NOTE: Offsets are explicitly managed as part of the assignment
        # callback, so preemptively resetting offsets is not enabled.
        self.__consumer = ConfluentConsumer({
            **configuration, "auto.offset.reset":
            "error"
        })

        self.__codec = codec

        self.__offsets: MutableMapping[Partition, int] = {}
        self.__staged_offsets: MutableMapping[Partition, int] = {}
        self.__paused: Set[Partition] = set()

        self.__commit_retry_policy = commit_retry_policy

        self.__state = KafkaConsumerState.CONSUMING

    def __resolve_partition_offset_earliest(
            self,
            partition: ConfluentTopicPartition) -> ConfluentTopicPartition:
        low, high = self.__consumer.get_watermark_offsets(partition)
        return ConfluentTopicPartition(partition.topic, partition.partition,
                                       low)

    def __resolve_partition_offset_latest(
            self,
            partition: ConfluentTopicPartition) -> ConfluentTopicPartition:
        low, high = self.__consumer.get_watermark_offsets(partition)
        return ConfluentTopicPartition(partition.topic, partition.partition,
                                       high)

    def __resolve_partition_offset_error(
            self,
            partition: ConfluentTopicPartition) -> ConfluentTopicPartition:
        raise ConsumerError("unable to resolve partition offsets")

    def subscribe(
        self,
        topics: Sequence[Topic],
        on_assign: Optional[Callable[[Mapping[Partition, int]], None]] = None,
        on_revoke: Optional[Callable[[Sequence[Partition]], None]] = None,
    ) -> None:
        """
        Subscribe to topics. This replaces a previous subscription.

        This method does not block. The subscription may not be fulfilled
        immediately: instead, the ``on_assign`` and ``on_revoke`` callbacks
        are called when the subscription state changes with the updated
        assignment for this consumer.

        If provided, the ``on_assign`` callback is called with a mapping of
        partitions to their offsets (at this point, the working offset and the
        committed offset are the same for each partition) on each subscription
        change. Similarly, the ``on_revoke`` callback (if provided) is called
        with a sequence of partitions that are being removed from this
        consumer's assignment. (This callback does not include the offsets,
        as the working offset and committed offset may differ, in some cases
        by substantial margin.)

        Raises an ``InvalidState`` exception if called on a closed consumer.
        """
        if self.__state is not KafkaConsumerState.CONSUMING:
            raise InvalidState(self.__state)

        def assignment_callback(
                consumer: ConfluentConsumer,
                partitions: Sequence[ConfluentTopicPartition]) -> None:
            self.__state = KafkaConsumerState.ASSIGNING

            try:
                assignment: MutableSequence[ConfluentTopicPartition] = []

                for partition in self.__consumer.committed(partitions):
                    if partition.offset >= 0:
                        assignment.append(partition)
                    elif partition.offset == OFFSET_INVALID:
                        assignment.append(
                            self.__resolve_partition_starting_offset(
                                partition))
                    else:
                        raise ValueError("received unexpected offset")

                offsets: MutableMapping[Partition, int] = {
                    Partition(Topic(i.topic), i.partition): i.offset
                    for i in assignment
                }
                self.__seek(offsets)

                # Ensure that all partitions are resumed on assignment to avoid
                # carrying over state from a previous assignment.
                self.__consumer.resume([
                    ConfluentTopicPartition(partition.topic.name,
                                            partition.index, offset)
                    for partition, offset in offsets.items()
                ])

                for partition in offsets:
                    self.__paused.discard(partition)
            except Exception:
                self.__state = KafkaConsumerState.ERROR
                raise

            try:
                if on_assign is not None:
                    on_assign(offsets)
            finally:
                self.__state = KafkaConsumerState.CONSUMING

        def revocation_callback(
                consumer: ConfluentConsumer,
                partitions: Sequence[ConfluentTopicPartition]) -> None:
            self.__state = KafkaConsumerState.REVOKING

            partitions = [
                Partition(Topic(i.topic), i.partition) for i in partitions
            ]

            try:
                if on_revoke is not None:
                    on_revoke(partitions)
            finally:
                for partition in partitions:
                    # Staged offsets are deleted during partition revocation to
                    # prevent later committing offsets for partitions that are
                    # no longer owned by this consumer.
                    if partition in self.__staged_offsets:
                        logger.warning(
                            "Dropping staged offset for revoked partition (%r)!",
                            partition,
                        )
                        del self.__staged_offsets[partition]

                    try:
                        self.__offsets.pop(partition)
                    except KeyError:
                        # If there was an error during assignment, this
                        # partition may have never been added to the offsets
                        # mapping.
                        logger.warning(
                            "failed to delete offset for unknown partition: %r",
                            partition,
                        )

                    self.__paused.discard(partition)

                self.__state = KafkaConsumerState.CONSUMING

        self.__consumer.subscribe(
            [topic.name for topic in topics],
            on_assign=assignment_callback,
            on_revoke=revocation_callback,
        )

    def unsubscribe(self) -> None:
        """
        Unsubscribe from topics.

        Raises an ``InvalidState`` exception if called on a closed consumer.
        """
        if self.__state is not KafkaConsumerState.CONSUMING:
            raise InvalidState(self.__state)

        self.__consumer.unsubscribe()

    def poll(self,
             timeout: Optional[float] = None) -> Optional[Message[TPayload]]:
        """
        Return the next message available to be consumed, if one is
        available. If no message is available, this method will block up to
        the ``timeout`` value before returning ``None``. A timeout of
        ``0.0`` represents "do not block", while a timeout of ``None``
        represents "block until a message is available (or forever)".

        Calling this method may also invoke subscription state change
        callbacks.

        This method may also raise an ``EndOfPartition`` error (a subtype of
        ``ConsumerError``) when the consumer has reached the end of a
        partition that it is subscribed to and no additional messages are
        available. The ``partition`` attribute of the raised exception
        specifies the end which partition has been reached. (Since this
        consumer is multiplexing a set of partitions, this exception does not
        mean that *all* of the partitions that the consumer is subscribed to
        do not have any messages, just that it has reached the end of one of
        them. This also does not mean that additional messages won't be
        available in future poll calls.) Not every backend implementation
        supports this feature or is configured to raise in this scenario.

        Raises an ``InvalidState`` exception if called on a closed consumer.

        Raises a ``TransportError`` for various other consumption-related
        errors.
        """
        if self.__state is not KafkaConsumerState.CONSUMING:
            raise InvalidState(self.__state)

        message: Optional[ConfluentMessage] = self.__consumer.poll(
            *[timeout] if timeout is not None else [])
        if message is None:
            return None

        error: Optional[KafkaError] = message.error()
        if error is not None:
            code = error.code()
            if code == KafkaError._PARTITION_EOF:
                raise EndOfPartition(
                    Partition(Topic(message.topic()), message.partition()),
                    message.offset(),
                )
            elif code == KafkaError._TRANSPORT:
                raise TransportError(str(error))
            else:
                raise ConsumerError(str(error))

        headers: Optional[Headers] = message.headers()
        result = Message(
            Partition(Topic(message.topic()), message.partition()),
            message.offset(),
            self.__codec.decode(
                KafkaPayload(
                    message.key(),
                    message.value(),
                    headers if headers is not None else [],
                )),
            datetime.utcfromtimestamp(message.timestamp()[1] / 1000.0),
        )

        self.__offsets[result.partition] = result.get_next_offset()

        return result

    def tell(self) -> Mapping[Partition, int]:
        """
        Return the read offsets for all assigned partitions.

        Raises an ``InvalidState`` if called on a closed consumer.
        """
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        return self.__offsets

    def __validate_offsets(self, offsets: Mapping[Partition, int]) -> None:
        invalid_offsets: Mapping[Partition, int] = {
            partition: offset
            for partition, offset in offsets.items() if offset < 0
        }

        if invalid_offsets:
            raise ConsumerError(f"invalid offsets: {invalid_offsets!r}")

    def __seek(self, offsets: Mapping[Partition, int]) -> None:
        self.__validate_offsets(offsets)

        if self.__state is KafkaConsumerState.ASSIGNING:
            # Calling ``seek`` on the Confluent consumer from an assignment
            # callback will throw an "Erroneous state" error. Instead,
            # partition offsets have to be initialized by calling ``assign``.
            self.__consumer.assign([
                ConfluentTopicPartition(partition.topic.name, partition.index,
                                        offset)
                for partition, offset in offsets.items()
            ])
        else:
            for partition, offset in offsets.items():
                self.__consumer.seek(
                    ConfluentTopicPartition(partition.topic.name,
                                            partition.index, offset))

        self.__offsets.update(offsets)

    def seek(self, offsets: Mapping[Partition, int]) -> None:
        """
        Change the read offsets for the provided partitions.

        Raises an ``InvalidState`` if called on a closed consumer.
        """
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        if offsets.keys() - self.__offsets.keys():
            raise ConsumerError("cannot seek on unassigned partitions")

        self.__seek(offsets)

    def pause(self, partitions: Sequence[Partition]) -> None:
        """
        Pause the consumption of messages for the provided partitions.

        Raises an ``InvalidState`` if called on a closed consumer.
        """
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        if set(partitions) - self.__offsets.keys():
            raise ConsumerError("cannot pause unassigned partitions")

        self.__consumer.pause([
            ConfluentTopicPartition(partition.topic.name, partition.index)
            for partition in partitions
        ])

        self.__paused.update(partitions)

        # XXX: Seeking to a specific partition offset and immediately pausing
        # that partition causes the seek to be ignored for some reason.
        self.seek({
            partition: offset
            for partition, offset in self.__offsets.items()
            if partition in partitions
        })

    def resume(self, partitions: Sequence[Partition]) -> None:
        """
        Resume the consumption of messages for the provided partitions.

        Raises an ``InvalidState`` if called on a closed consumer.
        """
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        if set(partitions) - self.__offsets.keys():
            raise ConsumerError("cannot resume unassigned partitions")

        self.__consumer.resume([
            ConfluentTopicPartition(partition.topic.name, partition.index)
            for partition in partitions
        ])

        for partition in partitions:
            self.__paused.discard(partition)

    def paused(self) -> Sequence[Partition]:
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        return [*self.__paused]

    def stage_offsets(self, offsets: Mapping[Partition, int]) -> None:
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        if offsets.keys() - self.__offsets.keys():
            raise ConsumerError(
                "cannot stage offsets for unassigned partitions")

        self.__validate_offsets(offsets)

        # TODO: Maybe log a warning if these offsets exceed the current
        # offsets, since that's probably a side effect of an incorrect usage
        # pattern?
        self.__staged_offsets.update(offsets)

    def __commit(self) -> Mapping[Partition, int]:
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        result: Optional[Sequence[ConfluentTopicPartition]]

        if self.__staged_offsets:
            result = self.__consumer.commit(
                offsets=[
                    ConfluentTopicPartition(partition.topic.name,
                                            partition.index, offset)
                    for partition, offset in self.__staged_offsets.items()
                ],
                asynchronous=False,
            )
        else:
            result = []

        assert result is not None  # synchronous commit should return result immediately

        self.__staged_offsets.clear()

        offsets: MutableMapping[Partition, int] = {}

        for value in result:
            # The Confluent Kafka Consumer will include logical offsets in the
            # sequence of ``Partition`` objects returned by ``commit``. These
            # are an implementation detail of the Kafka Consumer, so we don't
            # expose them here.
            # NOTE: These should no longer be seen now that we are forcing
            # offsets to be set as part of the assignment callback.
            if value.offset in self.LOGICAL_OFFSETS:
                continue

            assert value.offset >= 0, "expected non-negative offset"
            offsets[Partition(Topic(value.topic),
                              value.partition)] = value.offset

        return offsets

    def commit_offsets(self) -> Mapping[Partition, int]:
        """
        Commit staged offsets for all partitions that this consumer is
        assigned to. The return value of this method is a mapping of
        partitions with their committed offsets as values.

        Raises an ``InvalidState`` if called on a closed consumer.
        """
        return self.__commit_retry_policy.call(self.__commit)

    def close(self, timeout: Optional[float] = None) -> None:
        """
        Close the consumer. This stops consuming messages, *may* commit
        staged offsets (depending on the configuration), and ends its
        subscription.

        Raises a ``InvalidState`` if the consumer is unable to be closed
        before the timeout is reached.
        """
        try:
            self.__consumer.close()
        except RuntimeError:
            pass

        self.__state = KafkaConsumerState.CLOSED

    @property
    def closed(self) -> bool:
        return self.__state is KafkaConsumerState.CLOSED
예제 #4
0
파일: consumer.py 프로젝트: Kayle009/sentry
class SynchronizedConsumer(object):
    """
    This class implements the framework for a consumer that is intended to only
    consume messages that have already been consumed and committed by members
    of another consumer group.

    This works similarly to the Kafka built-in ``__consumer_offsets`` topic.
    The consumer group that is being "followed" (the one that must make
    progress for our consumer here to make progress, identified by the
    ``synchronize_commit_group`` constructor parameter/instance attribute) must
    report its offsets to a topic (identified by the ``commit_log_topic``
    constructor parameter/instance attribute). This consumer subscribes to both
    commit log topic, as well as the topic(s) that we are actually interested
    in consuming messages from. The messages received from the commit log topic
    control whether or not consumption from partitions belonging to the main
    topic is paused, resumed, or allowed to continue in its current state
    without changes.

    The furthest point in any partition that this consumer should ever consume
    to is the maximum offset that has been recorded to the commit log topic for
    that partition. If the offsets recorded to that topic move
    non-monotonically (due to an intentional offset rollback, for instance)
    this consumer *may* consume up to the highest watermark point. (The
    implementation here tries to pause consuming from the partition as soon as
    possible, but this makes no explicit guarantees about that behavior.)
    """
    initial_offset_reset_strategies = {
        'earliest': get_earliest_offset,
        'latest': get_latest_offset,
    }

    def __init__(self, bootstrap_servers, consumer_group, commit_log_topic,
                 synchronize_commit_group, initial_offset_reset='latest', on_commit=None):
        self.bootstrap_servers = bootstrap_servers
        self.consumer_group = consumer_group
        self.commit_log_topic = commit_log_topic
        self.synchronize_commit_group = synchronize_commit_group
        self.initial_offset_reset = self.initial_offset_reset_strategies[initial_offset_reset]

        self.__partition_state_manager = SynchronizedPartitionStateManager(
            self.__on_partition_state_change)
        self.__commit_log_consumer, self.__commit_log_consumer_stop_request = self.__start_commit_log_consumer()

        self.__positions = {}

        def commit_callback(error, partitions):
            if on_commit is not None:
                return on_commit(error, partitions)

        consumer_configuration = {
            'bootstrap.servers': self.bootstrap_servers,
            'group.id': self.consumer_group,
            'enable.auto.commit': 'false',
            'enable.auto.offset.store': 'true',
            'enable.partition.eof': 'false',
            'default.topic.config': {
                'auto.offset.reset': 'error',
            },
            'on_commit': commit_callback,
        }

        self.__consumer = Consumer(consumer_configuration)

    def __start_commit_log_consumer(self, timeout=None):
        """
        Starts running the commit log consumer.
        """
        stop_request_event = threading.Event()
        start_event = threading.Event()
        result = execute(
            functools.partial(
                run_commit_log_consumer,
                bootstrap_servers=self.bootstrap_servers,
                consumer_group='{}:sync:{}'.format(self.consumer_group, uuid.uuid1().hex),
                commit_log_topic=self.commit_log_topic,
                synchronize_commit_group=self.synchronize_commit_group,
                partition_state_manager=self.__partition_state_manager,
                start_event=start_event,
                stop_request_event=stop_request_event,
            ),
        )
        start_event.wait(timeout)
        return result, stop_request_event

    def __check_commit_log_consumer_running(self):
        if not self.__commit_log_consumer.running():
            try:
                result = self.__commit_log_consumer.result(timeout=0)  # noqa
            except TimeoutError:
                pass  # not helpful

            raise Exception('Commit log consumer unexpectedly exit!')

    def __on_partition_state_change(
            self, topic, partition, previous_state_and_offsets, current_state_and_offsets):
        """
        Callback that is invoked when a partition state changes.
        """
        logger.debug('State change for %r: %r to %r', (topic, partition),
                     previous_state_and_offsets, current_state_and_offsets)

        current_state, current_offsets = current_state_and_offsets
        if current_offsets.local is None:
            # It only makes sense to manipulate the consumer if we've got an
            # assignment. (This block should only be entered at startup if the
            # remote offsets are retrieved from the commit log before the local
            # consumer has received its assignment.)
            return

        # TODO: This will be called from the commit log consumer thread, so need
        # to verify that calling the ``consumer.{pause,resume}`` methods is
        # thread safe!
        if current_state in (SynchronizedPartitionState.UNKNOWN, SynchronizedPartitionState.SYNCHRONIZED,
                             SynchronizedPartitionState.REMOTE_BEHIND):
            self.__consumer.pause([TopicPartition(topic, partition, current_offsets.local)])
        elif current_state is SynchronizedPartitionState.LOCAL_BEHIND:
            self.__consumer.resume([TopicPartition(topic, partition, current_offsets.local)])
        else:
            raise NotImplementedError('Unexpected partition state: %s' % (current_state,))

    def subscribe(self, topics, on_assign=None, on_revoke=None):
        """
        Subscribe to a topic.
        """
        self.__check_commit_log_consumer_running()

        def assignment_callback(consumer, assignment):
            # Since ``auto.offset.reset`` is set to ``error`` to force human
            # interaction on an offset reset, we have to explicitly specify the
            # starting offset if no offset has been committed for this topic during
            # the ``__consumer_offsets`` topic retention period.
            assignment = {
                (i.topic, i.partition): self.__positions.get((i.topic, i.partition)) for i in assignment
            }

            for i in self.__consumer.committed([TopicPartition(topic, partition) for (
                    topic, partition), offset in assignment.items() if offset is None]):
                k = (i.topic, i.partition)
                if i.offset > -1:
                    assignment[k] = i.offset
                else:
                    assignment[k] = self.initial_offset_reset(consumer, i.topic, i.partition)

            self.__consumer.assign([TopicPartition(topic, partition, offset)
                                    for (topic, partition), offset in assignment.items()])

            for (topic, partition), offset in assignment.items():
                # Setting the local offsets will either cause the partition to be
                # paused (if the remote offset is unknown or the local offset is
                # not trailing the remote offset) or resumed.
                self.__partition_state_manager.set_local_offset(topic, partition, offset)
                self.__positions[(topic, partition)] = offset

            if on_assign is not None:
                on_assign(self, [TopicPartition(topic, partition)
                                 for topic, partition in assignment.keys()])

        def revocation_callback(consumer, assignment):
            for item in assignment:
                # TODO: This should probably also be removed from the state manager.
                self.__positions.pop((item.topic, item.partition))

            if on_revoke is not None:
                on_revoke(self, assignment)

        self.__consumer.subscribe(
            topics,
            on_assign=assignment_callback,
            on_revoke=revocation_callback)

    def poll(self, timeout):
        self.__check_commit_log_consumer_running()

        message = self.__consumer.poll(timeout)
        if message is None:
            return

        if message.error() is not None:
            return message

        self.__partition_state_manager.validate_local_message(
            message.topic(), message.partition(), message.offset())
        self.__partition_state_manager.set_local_offset(
            message.topic(), message.partition(), message.offset() + 1)
        self.__positions[(message.topic(), message.partition())] = message.offset() + 1

        return message

    def commit(self, *args, **kwargs):
        self.__check_commit_log_consumer_running()

        return self.__consumer.commit(*args, **kwargs)

    def close(self):
        self.__check_commit_log_consumer_running()

        self.__commit_log_consumer_stop_request.set()
        try:
            self.__consumer.close()
        finally:
            self.__commit_log_consumer.result()
def test_basic_api():
    """ Basic API tests, these wont really do anything since there is no
        broker configured. """

    try:
        kc = Consumer()
    except TypeError as e:
        assert str(e) == "expected configuration dict"

    def dummy_commit_cb(err, partitions):
        pass

    kc = Consumer({'group.id': 'test', 'socket.timeout.ms': '100',
                   'session.timeout.ms': 1000,  # Avoid close() blocking too long
                   'on_commit': dummy_commit_cb})

    kc.subscribe(["test"])
    kc.unsubscribe()

    def dummy_assign_revoke(consumer, partitions):
        pass

    kc.subscribe(["test"], on_assign=dummy_assign_revoke, on_revoke=dummy_assign_revoke)
    kc.unsubscribe()

    msg = kc.poll(timeout=0.001)
    if msg is None:
        print('OK: poll() timeout')
    elif msg.error():
        print('OK: consumer error: %s' % msg.error().str())
    else:
        print('OK: consumed message')

    if msg is not None:
        assert msg.timestamp() == (TIMESTAMP_NOT_AVAILABLE, -1)

    msglist = kc.consume(num_messages=10, timeout=0.001)
    assert len(msglist) == 0, "expected 0 messages, not %d" % len(msglist)

    with pytest.raises(ValueError) as ex:
        kc.consume(-100)
    assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value)

    with pytest.raises(ValueError) as ex:
        kc.consume(1000001)
    assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value)

    partitions = list(map(lambda part: TopicPartition("test", part), range(0, 100, 3)))
    kc.assign(partitions)

    with pytest.raises(KafkaException) as ex:
        kc.seek(TopicPartition("test", 0, 123))
    assert 'Erroneous state' in str(ex.value)

    # Verify assignment
    assignment = kc.assignment()
    assert partitions == assignment

    # Pause partitions
    kc.pause(partitions)

    # Resume partitions
    kc.resume(partitions)

    # Get cached watermarks, should all be invalid.
    lo, hi = kc.get_watermark_offsets(partitions[0], cached=True)
    assert lo == -1001 and hi == -1001
    assert lo == OFFSET_INVALID and hi == OFFSET_INVALID

    # Query broker for watermarks, should raise an exception.
    try:
        lo, hi = kc.get_watermark_offsets(partitions[0], timeout=0.5, cached=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._WAIT_COORD, KafkaError.LEADER_NOT_AVAILABLE),\
            str(e.args([0]))

    kc.unassign()

    kc.commit(asynchronous=True)

    try:
        kc.commit(asynchronous=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._NO_OFFSET)

    # Get current position, should all be invalid.
    kc.position(partitions)
    assert len([p for p in partitions if p.offset == OFFSET_INVALID]) == len(partitions)

    try:
        kc.committed(partitions, timeout=0.001)
    except KafkaException as e:
        assert e.args[0].code() == KafkaError._TIMED_OUT

    try:
        kc.list_topics(timeout=0.2)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._TRANSPORT)

    try:
        kc.list_topics(topic="hi", timeout=0.1)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._TRANSPORT)

    kc.close()