Example #1
    def commit(self, partitions=None):
        """XXX"""

        # short circuit if nothing happened. This check is kept outside
        # to prevent unnecessarily acquiring a lock for checking the state
        if self._count_since_commit == 0:
            return

        with (yield from self._commit_lock):
            # Do this check again, just in case the state has changed
            # during the lock acquiring timeout
            if self._count_since_commit == 0:
                return

            reqs = []
            if not partitions:  # commit all partitions
                partitions = self._offsets.keys()

            for partition in partitions:
                offset = self._offsets[partition]
                log.debug("Commit offset %d in SimpleConsumer: "
                          "group=%s, topic=%s, partition=%s" %
                          (offset, self._group, self._topic, partition))

                reqs.append(OffsetCommitRequest(self._topic, partition,
                                                offset, None))

            resps = yield from self._client.send_offset_commit_request(
                self._group, reqs)

            for resp in resps:
                check_error(resp)
            self._count_since_commit = 0
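
Example #1 above (and the near-identical Example #2 below) uses an asyncio lock with a second state check once the lock is held, in case another task committed while this one was waiting, written in the pre-3.5 `yield from` coroutine style. Below is a minimal, self-contained sketch of that double-checked pattern in modern async/await syntax; the `CommitState` class and its names are hypothetical stand-ins, not part of the consumer shown above.

import asyncio

class CommitState:
    """Hypothetical stand-in for the consumer's commit bookkeeping."""

    def __init__(self):
        self._count_since_commit = 0
        self._commit_lock = asyncio.Lock()

    def record(self, n=1):
        self._count_since_commit += n

    async def commit(self):
        # Cheap check before acquiring the lock.
        if self._count_since_commit == 0:
            return False
        async with self._commit_lock:
            # Re-check after acquiring: another task may have committed
            # while we were waiting on the lock.
            if self._count_since_commit == 0:
                return False
            # ... build and send the OffsetCommitRequests here ...
            self._count_since_commit = 0
            return True

async def demo():
    state = CommitState()
    state.record(3)
    print(await state.commit())  # True: offsets were committed
    print(await state.commit())  # False: nothing new since the last commit

asyncio.run(demo())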
Example #2
    def commit(self, partitions=None):
        """XXX"""

        # short circuit if nothing happened. This check is kept outside
        # to prevent unnecessarily acquiring a lock for checking the state
        if self._count_since_commit == 0:
            return

        with (yield from self._commit_lock):
            # Do this check again, just in case the state has changed
            # during the lock acquiring timeout
            if self._count_since_commit == 0:
                return

            reqs = []
            if not partitions:  # commit all partitions
                partitions = self._offsets.keys()

            for partition in partitions:
                offset = self._offsets[partition]
                log.debug("Commit offset %d in SimpleConsumer: "
                          "group=%s, topic=%s, partition=%s" %
                          (offset, self._group, self._topic, partition))

                reqs.append(
                    OffsetCommitRequest(self._topic, partition, offset, None))

            resps = yield from self._client.send_offset_commit_request(
                self._group, reqs)

            for resp in resps:
                check_error(resp)
            self._count_since_commit = 0
Example #3
    def get_partition_offsets(self, topic, partition, request_time_ms,
                              max_num_offsets):
        """
        Request available fetch offsets for a single topic/partition

        Arguments:
            topic (str)
            partition (int)
            request_time_ms (int): Used to ask for all messages before a
                certain time (ms). There are two special values. Specify -1 to receive the latest
                offset (i.e. the offset of the next coming message) and -2 to receive the earliest
                available offset. Note that because offsets are pulled in descending order, asking for
                the earliest offset will always return you a single element.
            max_num_offsets (int)

        Returns:
            offsets (list)
        """
        reqs = [
            OffsetRequest(topic, partition, request_time_ms, max_num_offsets)
        ]

        (resp, ) = self._client.send_offset_request(reqs)

        check_error(resp)

        # Just for sanity..
        # probably unnecessary
        assert resp.topic == topic
        assert resp.partition == partition

        return resp.offsets
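
The `request_time_ms` special values called out in the docstring above (-1 for the latest offset, -2 for the earliest) recur throughout these examples, e.g. in `reset_partition_offset` further down. A small sketch of that convention; the constant names and the helper are hypothetical, not part of kafka-python.

# Kafka offset-request "time" sentinels (see the docstring above).
LATEST = -1    # offset of the next message to be produced (log end offset)
EARLIEST = -2  # first offset still available in the log

def resolve_request_time(policy):
    """Map an auto_offset_reset-style policy string to request_time_ms."""
    if policy in ('latest', 'largest'):
        return LATEST
    if policy in ('earliest', 'smallest'):
        return EARLIEST
    raise ValueError('unknown offset reset policy: %r' % policy)

# resolve_request_time('largest') -> -1; resolve_request_time('smallest') -> -2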
Example #4
    def get_partition_offsets(self, topic, partition, request_time_ms, max_num_offsets):
        """Request available fetch offsets for a single topic/partition

        Keyword Arguments:
            topic (str): topic for offset request
            partition (int): partition for offset request
            request_time_ms (int): Used to ask for all messages before a
                certain time (ms). There are two special values.
                Specify -1 to receive the latest offset (i.e. the offset of the
                next coming message) and -2 to receive the earliest available
                offset. Note that because offsets are pulled in descending
                order, asking for the earliest offset will always return you a
                single element.
            max_num_offsets (int): Maximum offsets to include in the OffsetResponse

        Returns:
            a list of offsets in the OffsetResponse submitted for the provided
            topic / partition. See:
            https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-OffsetAPI
        """
        reqs = [OffsetRequest(topic, partition, request_time_ms, max_num_offsets)]

        (resp,) = self._client.send_offset_request(reqs)

        check_error(resp)

        # Just for sanity..
        # probably unnecessary
        assert resp.topic == topic
        assert resp.partition == partition

        return resp.offsets
Example #5
    def fetch_last_known_offsets(self, partitions=None):
        if self.group is None:
            raise ValueError('KafkaClient.group must not be None')

        if partitions is None:
            partitions = self.client.get_partition_ids_for_topic(self.topic)

        responses = self.client.send_offset_fetch_request(
            self.group,
            [OffsetFetchRequest(self.topic, p) for p in partitions],
            fail_on_error=False
        )

        for resp in responses:
            try:
                check_error(resp)
            # API spec says the server won't set an error here
            # but 0.8.1.1 does actually...
            except UnknownTopicOrPartitionError:
                pass

            # -1 offset signals no commit is currently stored
            if resp.offset == -1:
                self.offsets[resp.partition] = 0

            # Otherwise we committed the stored offset
            # and need to fetch the next one
            else:
                self.offsets[resp.partition] = resp.offset
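
`fetch_last_known_offsets` above treats a returned offset of -1 as "no commit stored" and starts that partition from 0. A self-contained sketch of that normalization; the namedtuple is a hypothetical stand-in for the response fields used here, not a kafka-python type.

from collections import namedtuple

StoredOffset = namedtuple('StoredOffset', ['partition', 'offset'])

def starting_offsets(responses):
    """Return {partition: fetch offset}, treating -1 (nothing committed) as 0."""
    return {r.partition: (0 if r.offset == -1 else r.offset)
            for r in responses}

# starting_offsets([StoredOffset(0, -1), StoredOffset(1, 42)]) -> {0: 0, 1: 42}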
Example #6
    def get_partition_offsets(self, topic, partition, request_time_ms, max_num_offsets):
        """
        Request available fetch offsets for a single topic/partition

        Arguments:
            topic (str)
            partition (int)
            request_time_ms (int): Used to ask for all messages before a
                certain time (ms). There are two special values. Specify -1 to receive the latest
                offset (i.e. the offset of the next coming message) and -2 to receive the earliest
                available offset. Note that because offsets are pulled in descending order, asking for
                the earliest offset will always return you a single element.
            max_num_offsets (int)

        Returns:
            offsets (list)
        """
        reqs = [OffsetRequest(topic, partition, request_time_ms, max_num_offsets)]

        (resp,) = self._client.send_offset_request(reqs)

        check_error(resp)

        # Just for sanity..
        # probably unnecessary
        assert resp.topic == topic
        assert resp.partition == partition

        return resp.offsets
Example #7
    def fetch_last_known_offsets(self, partitions=None):
        if self.group is None:
            raise ValueError('KafkaClient.group must not be None')

        if partitions is None:
            partitions = self.client.get_partition_ids_for_topic(self.topic)

        responses = self.client.send_offset_fetch_request(
            self.group,
            [OffsetFetchRequest(self.topic, p) for p in partitions],
            fail_on_error=False)

        for resp in responses:
            try:
                check_error(resp)
            # API spec says the server won't set an error here
            # but 0.8.1.1 does actually...
            except UnknownTopicOrPartitionError:
                pass

            # -1 offset signals no commit is currently stored
            if resp.offset == -1:
                self.offsets[resp.partition] = 0

            # Otherwise we committed the stored offset
            # and need to fetch the next one
            else:
                self.offsets[resp.partition] = resp.offset
Example #8
    def _get_commit_offsets(self):
        logger.info("Consumer fetching stored offsets")
        for topic_partition in self._topics:
            resps = []
            if self._config['offset_storage'] in ('zookeeper', 'dual'):
                resps += self._client.send_offset_fetch_request(
                    self._config['group_id'], [
                        OffsetFetchRequestPayload(topic_partition[0],
                                                  topic_partition[1])
                    ],
                    fail_on_error=False)
            if self._config['offset_storage'] in ('kafka', 'dual'):
                resps += self._client.send_offset_fetch_request_kafka(
                    self._config['group_id'], [
                        OffsetFetchRequestPayload(topic_partition[0],
                                                  topic_partition[1])
                    ],
                    fail_on_error=False)
            try:
                for r in resps:
                    check_error(r)
            # API spec says the server won't set an error here
            # but 0.8.1.1 does actually...
            except UnknownTopicOrPartitionError:
                pass

            # -1 offset signals no commit is currently stored
            max_offset = max(r.offset for r in resps)
            if max_offset == -1:
                self._offsets.commit[topic_partition] = None

            # Otherwise we committed the stored offset
            # and need to fetch the next one
            else:
                self._offsets.commit[topic_partition] = max_offset
Example #9
    def get_partition_offsets(self, topic, partition, request_time_ms,
                              max_num_offsets):
        """Request available fetch offsets for a single topic/partition

        Keyword Arguments:
            topic (str): topic for offset request
            partition (int): partition for offset request
            request_time_ms (int): Used to ask for all messages before a
                certain time (ms). There are two special values.
                Specify -1 to receive the latest offset (i.e. the offset of the
                next coming message) and -2 to receive the earliest available
                offset. Note that because offsets are pulled in descending
                order, asking for the earliest offset will always return you a
                single element.
            max_num_offsets (int): Maximum offsets to include in the OffsetResponse

        Returns:
            a list of offsets in the OffsetResponse submitted for the provided
            topic / partition. See:
            https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-OffsetAPI
        """
        reqs = [
            OffsetRequest(topic, partition, request_time_ms, max_num_offsets)
        ]

        (resp, ) = self._client.send_offset_request(reqs)

        check_error(resp)

        # Just for sanity..
        # probably unnecessary
        assert resp.topic == topic
        assert resp.partition == partition

        return resp.offsets
Example #10
    def _get_commit_offsets(self):
        logger.info("Consumer fetching stored offsets")
        for topic_partition in self._topics:
            resps = []
            if self._config['offset_storage'] in ('zookeeper', 'dual'):
                resps += self._client.send_offset_fetch_request(
                    self._config['group_id'],
                    [OffsetFetchRequestPayload(topic_partition[0], topic_partition[1])],
                    fail_on_error=False)
            if self._config['offset_storage'] in ('kafka', 'dual'):
                resps += self._client.send_offset_fetch_request_kafka(
                    self._config['group_id'],
                    [OffsetFetchRequestPayload(topic_partition[0], topic_partition[1])],
                    fail_on_error=False)
            try:
                for r in resps:
                    check_error(r)
            # API spec says the server won't set an error here
            # but 0.8.1.1 does actually...
            except UnknownTopicOrPartitionError:
                pass

            # -1 offset signals no commit is currently stored
            max_offset = max(r.offset for r in resps)
            if max_offset == -1:
                self._offsets.commit[topic_partition] = None

            # Otherwise we committed the stored offset
            # and need to fetch the next one
            else:
                self._offsets.commit[topic_partition] = max_offset
Example #11
    def commit(self):
        """Store consumed message offsets (marked via task_done())
        to kafka cluster for this consumer_group.

        Returns:
            True on success, or False if no offsets were found for commit

        Note:
            this functionality requires server version >=0.8.1.1
            https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-OffsetCommit/FetchAPI
        """
        if not self._config['group_id']:
            logger.warning('Cannot commit without a group_id!')
            raise KafkaConfigurationError('Attempted to commit offsets without a configured consumer group (group_id)')

        # API supports storing metadata with each commit
        # but for now it is unused
        metadata = b''

        offsets = self._offsets.task_done
        commits = []
        for topic_partition, task_done_offset in six.iteritems(offsets):

            # Skip if None
            if task_done_offset is None:
                continue

            # Commit offsets as the next offset to fetch
            # which is consistent with the Java Client
            # task_done is marked by messages consumed,
            # so add one to mark the next message for fetching
            commit_offset = (task_done_offset + 1)

            # Skip if no change from previous committed
            if commit_offset == self._offsets.commit[topic_partition]:
                continue

            commits.append(OffsetCommitRequest(topic_partition[0], topic_partition[1], commit_offset, metadata))

        if commits:
            logger.info('committing consumer offsets to group %s', self._config['group_id'])
            resps = self._client.send_offset_commit_request(kafka_bytestring(self._config['group_id']),
                                                            commits,
                                                            fail_on_error=False)

            for r in resps:
                check_error(r)
                topic_partition = (r.topic, r.partition)
                task_done = self._offsets.task_done[topic_partition]
                self._offsets.commit[topic_partition] = (task_done + 1)

            if self._config['auto_commit_enable']:
                self._reset_auto_commit()

            return True

        else:
            logger.info('No new offsets found to commit in group %s', self._config['group_id'])
            return False
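
The `commit()` above follows the Java client convention: what gets committed is the *next* offset to fetch (task_done + 1), and partitions whose committed offset has not moved are skipped. A minimal sketch of just that selection step using plain dicts; the function and argument names are hypothetical.

def offsets_to_commit(task_done, committed):
    """Return {(topic, partition): offset} for partitions that need a commit.

    task_done: last offset marked done per partition (None if nothing yet)
    committed: offset currently stored on the broker per partition
    """
    commits = {}
    for tp, done in task_done.items():
        if done is None:
            continue                    # nothing consumed on this partition
        next_offset = done + 1          # commit the next offset to fetch
        if next_offset == committed.get(tp):
            continue                    # unchanged since the previous commit
        commits[tp] = next_offset
    return commits

# offsets_to_commit({('t', 0): 9, ('t', 1): None}, {('t', 0): 5})
# -> {('t', 0): 10}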
Example #12
    def commit(self):
        """Store consumed message offsets (marked via task_done())
        to kafka cluster for this consumer_group.

        Returns:
            True on success, or False if no offsets were found for commit

        Note:
            this functionality requires server version >=0.8.1.1
            https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-OffsetCommit/FetchAPI
        """
        if not self._config['group_id']:
            logger.warning('Cannot commit without a group_id!')
            raise KafkaConfigurationError('Attempted to commit offsets without a configured consumer group (group_id)')

        # API supports storing metadata with each commit
        # but for now it is unused
        metadata = b''

        offsets = self._offsets.task_done
        commits = []
        for topic_partition, task_done_offset in six.iteritems(offsets):

            # Skip if None
            if task_done_offset is None:
                continue

            # Commit offsets as the next offset to fetch
            # which is consistent with the Java Client
            # task_done is marked by messages consumed,
            # so add one to mark the next message for fetching
            commit_offset = (task_done_offset + 1)

            # Skip if no change from previous committed
            if commit_offset == self._offsets.commit[topic_partition]:
                continue

            commits.append(OffsetCommitRequest(topic_partition[0], topic_partition[1], commit_offset, metadata))

        if commits:
            logger.info('committing consumer offsets to group %s', self._config['group_id'])
            resps = self._client.send_offset_commit_request(kafka_bytestring(self._config['group_id']),
                                                            commits,
                                                            fail_on_error=False)

            for r in resps:
                check_error(r)
                topic_partition = (r.topic, r.partition)
                task_done = self._offsets.task_done[topic_partition]
                self._offsets.commit[topic_partition] = (task_done + 1)

            if self._config['auto_commit_enable']:
                self._reset_auto_commit()

            return True

        else:
            logger.info('No new offsets found to commit in group %s', self._config['group_id'])
            return False
Example #13
def _check_commit_response_error(resp):
    try:
        check_error(resp)
    except BrokerResponseError as e:
        exception = OffsetCommitError(
            resp.topic,
            resp.partition,
            e.message
        )
        return exception
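
Every example on this page funnels broker responses through `check_error(resp)`, which raises a typed exception when the response carries a non-zero error code, so callers can catch exactly the failures they expect (UnknownTopicOrPartitionError, OffsetOutOfRangeError, and so on). kafka-python dispatches on an error-code table; the version below is only a hypothetical sketch of that behaviour, not the library's implementation.

class BrokerResponseError(Exception):
    """Base class for broker-reported errors (sketch)."""

class OffsetOutOfRangeError(BrokerResponseError):
    errno = 1

class UnknownTopicOrPartitionError(BrokerResponseError):
    errno = 3

_ERROR_BY_CODE = {cls.errno: cls
                  for cls in (OffsetOutOfRangeError,
                              UnknownTopicOrPartitionError)}

def check_error(response):
    """Raise the exception matching response.error, if it is non-zero."""
    code = getattr(response, 'error', 0)
    if code:
        raise _ERROR_BY_CODE.get(code, BrokerResponseError)(response)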
Example #14
def _check_fetch_response_error(resp):
    try:
        check_error(resp)
    except BrokerResponseError:
        # In case of error we set the offset to (-1,)
        return OffsetResponse(
            resp.topic,
            resp.partition,
            resp.error,
            (-1, ),
        )
    return resp
Example #15
def _check_fetch_response_error(resp):
    try:
        check_error(resp)
    except BrokerResponseError:
        # In case of error we set the offset to (-1,)
        return OffsetResponse(
            resp.topic,
            resp.partition,
            resp.error,
            (-1,),
        )
    return resp
Example #16
    def _update_group_offsets(self):
        logger.info("Consumer fetching stored offsets")
        for partition in self._client.get_partition_ids_for_topic(self._topic):
            (resp, ) = self._client.send_offset_fetch_request(
                self._group_id, [OffsetFetchRequest(self._topic, partition)],
                fail_on_error=False)
            try:
                check_error(resp)
            except UnknownTopicOrPartitionError:
                pass

            if resp.offset == -1:
                self._offsets.commit[partition] = None
            else:
                self._offsets.commit[partition] = resp.offset
Example #17
    def commit_partition_offsets(self, partition_offsets):
        """
        Commit explicit partition/offset pairs.
        """
        self.logger.debug("Committing partition offsets: %s", partition_offsets)

        commit_requests = [
            OffsetCommitRequest(self.consumer.topic, partition, offset, None)
            for partition, offset in partition_offsets.items()
        ]
        commit_responses = self.consumer.client.send_offset_commit_request(
            self.consumer.group,
            commit_requests,
        )
        for commit_response in commit_responses:
            check_error(commit_response)
Example #18
    def _update_group_offsets(self):
        logger.info("Consumer fetching stored offsets")
        for partition in self._client.get_partition_ids_for_topic(self._topic):
            (resp,) = self._client.send_offset_fetch_request(
                self._group_id,
                [OffsetFetchRequest(self._topic, partition)],
                fail_on_error=False)
            try:
                check_error(resp)
            except UnknownTopicOrPartitionError:
                pass

            if resp.offset == -1:
                self._offsets.commit[partition] = None
            else:
                self._offsets.commit[partition] = resp.offset
Example #19
    def _update_produced_offsets(self):
        """
        Arguments:
            request_time_ms (int): Used to ask for all messages before a
                certain time (ms). There are two special values. Specify -1 to receive the latest
                offset (i.e. the offset of the next coming message) and -2 to receive the earliest
                available offset. Note that because offsets are pulled in descending order, asking for
                the earliest offset will always return you a single element.
        """
        for partition in self._client.get_partition_ids_for_topic(self._topic):
            reqs = [OffsetRequest(self._topic, partition, -1, 1)]

            (resp,) = self._client.send_offset_request(reqs)

            check_error(resp)
            assert resp.topic == self._topic
            assert resp.partition == partition
            self._offsets.produced[partition] = resp.offsets[0]
Example #20
def pluck_topic_offset_or_zero_on_unknown(resp):
    try:
        check_error(resp)
    except UnknownTopicOrPartitionError:
        # If the server doesn't have any committed offsets by this group for
        # this topic, assume it's zero.
        pass
    # The API spec says the server won't set an error, but 0.8.1.1 does. The actual
    # check is if the offset is -1.
    if resp.offset == -1:
        return OffsetFetchResponse(
            resp.topic,
            resp.partition,
            0,
            resp.metadata,
            0,
        )
    return resp
Example #21
def pluck_topic_offset_or_zero_on_unknown(resp):
    try:
        check_error(resp)
    except UnknownTopicOrPartitionError:
        # If the server doesn't have any committed offsets by this group for
        # this topic, assume it's zero.
        pass
    # The API spec says the server won't set an error, but 0.8.1.1 does. The actual
    # check is if the offset is -1.
    if resp.offset == -1:
        return OffsetFetchResponse(
            resp.topic,
            resp.partition,
            0,
            resp.metadata,
            0,
        )
    return resp
Example #22
    def _update_produced_offsets(self):
        """
        Arguments:
            request_time_ms (int): Used to ask for all messages before a
                certain time (ms). There are two special values. Specify -1 to receive the latest
                offset (i.e. the offset of the next coming message) and -2 to receive the earliest
                available offset. Note that because offsets are pulled in descending order, asking for
                the earliest offset will always return you a single element.
        """
        for partition in self._client.get_partition_ids_for_topic(self._topic):
            reqs = [OffsetRequest(self._topic, partition, -1, 1)]

            (resp, ) = self._client.send_offset_request(reqs)

            check_error(resp)
            assert resp.topic == self._topic
            assert resp.partition == partition
            self._offsets.produced[partition] = resp.offsets[0]
Example #23
    def _get_commit_offsets(self):
        logger.info("Consumer fetching stored offsets")
        for topic_partition in self._topics:
            (resp, ) = self._client.send_offset_fetch_request(
                kafka_bytestring(self._config['group_id']),
                [OffsetFetchRequest(topic_partition[0], topic_partition[1])],
                fail_on_error=False)
            try:
                check_error(resp)
            # API spec says the server won't set an error here
            # but 0.8.1.1 does actually...
            except UnknownTopicOrPartitionError:
                pass

            # -1 offset signals no commit is currently stored
            if resp.offset == -1:
                self._offsets.commit[topic_partition] = None

            # Otherwise we committed the stored offset
            # and need to fetch the next one
            else:
                self._offsets.commit[topic_partition] = resp.offset
Example #24
    def _get_commit_offsets(self):
        logger.info("Consumer fetching stored offsets")
        for topic_partition in self._topics:
            (resp,) = self._client.send_offset_fetch_request(
                kafka_bytestring(self._config['group_id']),
                [OffsetFetchRequest(topic_partition[0], topic_partition[1])],
                fail_on_error=False)
            try:
                check_error(resp)
            # API spec says the server won't set an error here
            # but 0.8.1.1 does actually...
            except UnknownTopicOrPartitionError:
                pass

            # -1 offset signals no commit is currently stored
            if resp.offset == -1:
                self._offsets.commit[topic_partition] = None

            # Otherwise we committed the stored offset
            # and need to fetch the next one
            else:
                self._offsets.commit[topic_partition] = resp.offset
Example #25
    def reset_partition_offset(self, partition):
        LATEST = -1
        EARLIEST = -2
        if self.auto_offset_reset == 'largest':
            reqs = [OffsetRequest(self.topic, partition, LATEST, 1)]
        elif self.auto_offset_reset == 'smallest':
            reqs = [OffsetRequest(self.topic, partition, EARLIEST, 1)]
        else:
            # Let's raise a reasonable exception type if the user calls
            # outside of an exception context
            if sys.exc_info() == (None, None, None):
                raise OffsetOutOfRangeError('Cannot reset partition offsets without a '
                                            'valid auto_offset_reset setting '
                                            '(largest|smallest)')
            # Otherwise we should re-raise the upstream exception
            # b/c it typically includes additional data about
            # the request that triggered it, and we do not want to drop that
            raise

        # send_offset_request
        (resp, ) = self.client.send_offset_request(reqs)
        check_error(resp)
        self.offsets[partition] = resp.offsets[0]
        self.fetch_offsets[partition] = resp.offsets[0]
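
`reset_partition_offset` above uses `sys.exc_info()` to tell whether it was called from inside an except block: if so, a bare `raise` propagates the original exception (typically the OffsetOutOfRangeError that triggered the reset, with its request details); if not, it raises a fresh error about the missing `auto_offset_reset` setting. A self-contained sketch of that idiom; the function and its names are hypothetical.

import sys

def resolve_reset_policy(policy):
    """Pick a reset direction, or re-raise the exception being handled."""
    if policy == 'largest':
        return 'latest'
    if policy == 'smallest':
        return 'earliest'
    if sys.exc_info() == (None, None, None):
        # Not inside an except block: nothing to propagate, raise our own.
        raise ValueError('no valid policy and no active exception')
    # Inside an except block: bare raise re-raises the original exception
    # with its traceback intact.
    raise

try:
    raise KeyError('offset out of range')      # stand-in for a broker error
except KeyError:
    try:
        resolve_reset_policy(policy=None)
    except KeyError as exc:
        print('re-raised the original exception:', exc)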
Example #26
def _check_commit_response_error(resp):
    try:
        check_error(resp)
    except BrokerResponseError as e:
        exception = OffsetCommitError(resp.topic, resp.partition, e.message)
        return exception
Example #27
    def _fetch(self):
        # Create fetch request payloads for all the partitions
        partitions = dict(
            (p, self.buffer_size) for p in self.fetch_offsets.keys())
        while partitions:
            requests = []
            for partition, buffer_size in six.iteritems(partitions):
                requests.append(
                    FetchRequestPayload(self.topic, partition,
                                        self.fetch_offsets[partition],
                                        buffer_size))
            # Send request
            responses = self.client.send_fetch_request(
                requests,
                max_wait_time=int(self.fetch_max_wait_time),
                min_bytes=self.fetch_min_bytes,
                fail_on_error=False)

            retry_partitions = {}
            for resp in responses:

                try:
                    check_error(resp)
                except UnknownTopicOrPartitionError:
                    log.error('UnknownTopicOrPartitionError for %s:%d',
                              resp.topic, resp.partition)
                    self.client.reset_topic_metadata(resp.topic)
                    raise
                except NotLeaderForPartitionError:
                    log.error('NotLeaderForPartitionError for %s:%d',
                              resp.topic, resp.partition)
                    self.client.reset_topic_metadata(resp.topic)
                    continue
                except OffsetOutOfRangeError:
                    log.warning(
                        'OffsetOutOfRangeError for %s:%d. '
                        'Resetting partition offset...', resp.topic,
                        resp.partition)
                    self.reset_partition_offset(resp.partition)
                    # Retry this partition
                    retry_partitions[resp.partition] = partitions[
                        resp.partition]
                    continue
                except FailedPayloadsError as e:
                    log.warning('FailedPayloadsError for %s:%d',
                                e.payload.topic, e.payload.partition)
                    # Retry this partition
                    retry_partitions[e.payload.partition] = partitions[
                        e.payload.partition]
                    continue

                partition = resp.partition
                buffer_size = partitions[partition]

                # Check for partial message
                if resp.messages and isinstance(resp.messages[-1].message,
                                                PartialMessage):

                    # If buffer is at max and all we got was a partial message
                    # raise ConsumerFetchSizeTooSmall
                    if (self.max_buffer_size is not None
                            and buffer_size == self.max_buffer_size
                            and len(resp.messages) == 1):

                        log.error('Max fetch size %d too small',
                                  self.max_buffer_size)
                        raise ConsumerFetchSizeTooSmall()

                    if self.max_buffer_size is None:
                        buffer_size *= 2
                    else:
                        buffer_size = min(buffer_size * 2,
                                          self.max_buffer_size)
                    log.warning(
                        'Fetch size too small, increase to %d (2x) '
                        'and retry', buffer_size)
                    retry_partitions[partition] = buffer_size
                    resp.messages.pop()

                for message in resp.messages:
                    if message.offset < self.fetch_offsets[partition]:
                        log.debug(
                            'Skipping message %s because its offset is less than the consumer offset',
                            message)
                        continue
                    # Put the message in our queue
                    self.queue.put((partition, message))
                    self.fetch_offsets[partition] = message.offset + 1
            partitions = retry_partitions
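
When a fetch returns only a partial message, the loop above doubles the buffer size for that partition (capped at `max_buffer_size`) and retries, raising ConsumerFetchSizeTooSmall once the cap is reached. A tiny sketch of just that growth rule; the names are hypothetical.

class FetchSizeTooSmall(Exception):
    """Raised when the buffer is already at its cap but still too small."""

def next_buffer_size(current, max_size):
    """Double the fetch buffer size, respecting an optional upper bound."""
    if max_size is None:
        return current * 2
    if current >= max_size:
        raise FetchSizeTooSmall('max fetch size %d too small' % max_size)
    return min(current * 2, max_size)

# next_buffer_size(4096, 65536) -> 8192
# next_buffer_size(65536, 65536) -> raises FetchSizeTooSmall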
Example #28
    def _raise_on_response_error(self, resp):
        try:
            check_error(resp)
        except (UnknownTopicOrPartitionError, NotLeaderForPartitionError):
            self.reset_topic_metadata(resp.topic)
            raise
Example #29
    def fetch_messages(self):
        """Sends FetchRequests for all topic/partitions set for consumption

        Returns:
            Generator that yields KafkaMessage structs
            after deserializing with the configured `deserializer_class`

        Note:
            Refreshes metadata on errors, and resets fetch offset on
            OffsetOutOfRange, per the configured `auto_offset_reset` policy

        See Also:
            Key KafkaConsumer configuration parameters:
            * `fetch_message_max_bytes`
            * `fetch_max_wait_ms`
            * `fetch_min_bytes`
            * `deserializer_class`
            * `auto_offset_reset`

        """

        max_bytes = self._config['fetch_message_max_bytes']
        max_wait_time = self._config['fetch_wait_max_ms']
        min_bytes = self._config['fetch_min_bytes']

        if not self._topics:
            raise KafkaConfigurationError('No topics or partitions configured')

        if not self._offsets.fetch:
            raise KafkaConfigurationError(
                'No fetch offsets found when calling fetch_messages')

        fetches = [
            FetchRequest(topic, partition,
                         self._offsets.fetch[(topic, partition)], max_bytes)
            for (topic, partition) in self._topics
        ]

        # send_fetch_request will batch topic/partition requests by leader
        responses = self._client.send_fetch_request(
            fetches,
            max_wait_time=max_wait_time,
            min_bytes=min_bytes,
            fail_on_error=False)

        for resp in responses:

            if isinstance(resp, FailedPayloadsError):
                logger.warning('FailedPayloadsError attempting to fetch data')
                self._refresh_metadata_on_error()
                continue

            topic = kafka_bytestring(resp.topic)
            partition = resp.partition
            try:
                check_error(resp)
            except OffsetOutOfRangeError:
                logger.warning(
                    'OffsetOutOfRange: topic %s, partition %d, '
                    'offset %d (Highwatermark: %d)', topic, partition,
                    self._offsets.fetch[(topic, partition)],
                    resp.highwaterMark)
                # Reset offset
                self._offsets.fetch[(topic, partition)] = (
                    self._reset_partition_offset((topic, partition)))
                continue

            except NotLeaderForPartitionError:
                logger.warning(
                    "NotLeaderForPartitionError for %s - %d. "
                    "Metadata may be out of date", topic, partition)
                self._refresh_metadata_on_error()
                continue

            except RequestTimedOutError:
                logger.warning("RequestTimedOutError for %s - %d", topic,
                               partition)
                continue

            # Track server highwater mark
            self._offsets.highwater[(topic, partition)] = resp.highwaterMark

            # Yield each message
            # Kafka-python could raise an exception during iteration
            # we are not catching -- user will need to address
            for (offset, message) in resp.messages:
                # deserializer_class could raise an exception here
                val = self._config['deserializer_class'](message.value)
                msg = KafkaMessage(topic, partition, offset, message.key, val)

                # in some cases the server will return earlier messages
                # than we requested. skip them per kafka spec
                if offset < self._offsets.fetch[(topic, partition)]:
                    logger.debug(
                        'message offset less than fetched offset '
                        'skipping: %s', msg)
                    continue
                # Only increment fetch offset
                # if we safely got the message and deserialized
                self._offsets.fetch[(topic, partition)] = offset + 1

                # Then yield to user
                yield msg
Example #30
    def load_metadata_for_topics(self, *topics):
        """
        Fetch broker and topic-partition metadata from the server,
        and update internal data:
        broker list, topic/partition list, and topic/partition -> broker map

        This method should be called after receiving any error

        @param: *topics (optional)
        If a list of topics is provided, the metadata refresh will be limited
        to the specified topics only.

        Exceptions:
        ----------
        If the broker is configured to not auto-create topics,
        expect UnknownTopicOrPartitionError for topics that don't exist

        If the broker is configured to auto-create topics,
        expect LeaderNotAvailableError for new topics
        until partitions have been initialized.

        Exceptions *will not* be raised in a full refresh (i.e. no topic list)
        In this case, error codes will be logged as errors

        Partition-level errors will also not be raised here
        (a single partition w/o a leader, for example)
        """
        resp = yield from self.send_metadata_request(topics)

        log.debug("Broker metadata: %s", resp.brokers)
        log.debug("Topic metadata: %s", resp.topics)

        self._brokers = dict([(broker.nodeId, broker)
                              for broker in resp.brokers])

        for topic_metadata in resp.topics:
            topic = topic_metadata.topic
            partitions = topic_metadata.partitions

            self.reset_topic_metadata(topic)

            # Errors expected for new topics
            try:
                check_error(topic_metadata)
            except (UnknownTopicOrPartitionError,
                    LeaderNotAvailableError) as e:

                # Raise if the topic was passed in explicitly
                if topic in topics:
                    raise

                # Otherwise, just log a warning
                log.error("Error loading topic metadata for %s: %s",
                          topic, type(e))
                continue

            self._topic_partitions[topic] = {}
            for partition_metadata in partitions:
                partition = partition_metadata.partition
                leader = partition_metadata.leader

                self._topic_partitions[topic][partition] = partition_metadata

                # Populate topics_to_brokers dict
                topic_part = TopicAndPartition(topic, partition)

                # Check for partition errors
                try:
                    check_error(partition_metadata)

                # If No Leader, topics_to_brokers topic_partition -> None
                except LeaderNotAvailableError:
                    log.error('No leader for topic %s partition %d',
                              topic, partition)
                    self._topics_to_brokers[topic_part] = None
                    continue
                # If one of the replicas is unavailable -- ignore
                # this error code is provided for admin purposes only
                # we never talk to replicas, only the leader
                except ReplicaNotAvailableError:
                    log.warning('Some (non-leader) replicas not available '
                                'for topic %s partition %d', topic, partition)
                # If Known Broker, topic_partition -> BrokerMetadata
                if leader in self._brokers:
                    self._topics_to_brokers[topic_part] = self._brokers[leader]

                # If Unknown Broker, fake BrokerMetadata so we don't lose the id
                # (not sure how this could happen.
                # server could be in bad state)
                else:
                    self._topics_to_brokers[topic_part] = BrokerMetadata(
                        leader, None, None
                    )
Example #31
    def fetch_messages(self):
        """Sends FetchRequests for all topic/partitions set for consumption

        Returns:
            Generator that yields KafkaMessage structs
            after deserializing with the configured `deserializer_class`

        Note:
            Refreshes metadata on errors, and resets fetch offset on
            OffsetOutOfRange, per the configured `auto_offset_reset` policy

        See Also:
            Key KafkaConsumer configuration parameters:
            * `fetch_message_max_bytes`
            * `fetch_max_wait_ms`
            * `fetch_min_bytes`
            * `deserializer_class`
            * `auto_offset_reset`

        """

        max_bytes = self._config['fetch_message_max_bytes']
        max_wait_time = self._config['fetch_wait_max_ms']
        min_bytes = self._config['fetch_min_bytes']

        if not self._topics:
            raise KafkaConfigurationError('No topics or partitions configured')

        if not self._offsets.fetch:
            raise KafkaConfigurationError(
                'No fetch offsets found when calling fetch_messages'
            )

        fetches = [FetchRequest(topic, partition,
                                self._offsets.fetch[(topic, partition)],
                                max_bytes)
                   for (topic, partition) in self._topics]

        # send_fetch_request will batch topic/partition requests by leader
        responses = self._client.send_fetch_request(
            fetches,
            max_wait_time=max_wait_time,
            min_bytes=min_bytes,
            fail_on_error=False
        )

        for resp in responses:

            if isinstance(resp, FailedPayloadsError):
                logger.warning('FailedPayloadsError attempting to fetch data')
                self._refresh_metadata_on_error()
                continue

            topic = kafka_bytestring(resp.topic)
            partition = resp.partition
            try:
                check_error(resp)
            except OffsetOutOfRangeError:
                logger.warning('OffsetOutOfRange: topic %s, partition %d, '
                               'offset %d (Highwatermark: %d)',
                               topic, partition,
                               self._offsets.fetch[(topic, partition)],
                               resp.highwaterMark)
                # Reset offset
                self._offsets.fetch[(topic, partition)] = (
                    self._reset_partition_offset((topic, partition))
                )
                continue

            except NotLeaderForPartitionError:
                logger.warning("NotLeaderForPartitionError for %s - %d. "
                               "Metadata may be out of date",
                               topic, partition)
                self._refresh_metadata_on_error()
                continue

            except RequestTimedOutError:
                logger.warning("RequestTimedOutError for %s - %d",
                               topic, partition)
                continue

            # Track server highwater mark
            self._offsets.highwater[(topic, partition)] = resp.highwaterMark

            # Yield each message
            # Kafka-python could raise an exception during iteration
            # we are not catching -- user will need to address
            for (offset, message) in resp.messages:
                # deserializer_class could raise an exception here
                val = self._config['deserializer_class'](message.value)
                msg = KafkaMessage(topic, partition, offset, message.key, val)

                # in some cases the server will return earlier messages
                # than we requested. skip them per kafka spec
                if offset < self._offsets.fetch[(topic, partition)]:
                    logger.debug('message offset less than fetched offset '
                                 'skipping: %s', msg)
                    continue
                # Only increment fetch offset
                # if we safely got the message and deserialized
                self._offsets.fetch[(topic, partition)] = offset + 1

                # Then yield to user
                yield msg
Example #32
    def fetch_messages(self):
        """
        Sends FetchRequests for all topic/partitions set for consumption
        Returns a generator that yields KafkaMessage structs
        after deserializing with the configured `deserializer_class`

        Refreshes metadata on errors, and resets fetch offset on
        OffsetOutOfRange, per the configured `auto_offset_reset` policy

        Key configuration parameters:

        * `fetch_message_max_bytes`
        * `fetch_max_wait_ms`
        * `fetch_min_bytes`
        * `deserializer_class`
        * `auto_offset_reset`
        """

        max_bytes = self._config['fetch_message_max_bytes']
        max_wait_time = self._config['fetch_wait_max_ms']
        min_bytes = self._config['fetch_min_bytes']

        # Get current fetch offsets
        offsets = self._offsets.fetch
        if not offsets:
            if not self._topics:
                raise KafkaConfigurationError(
                    'No topics or partitions configured')
            raise KafkaConfigurationError(
                'No fetch offsets found when calling fetch_messages')

        fetches = []
        for topic_partition, offset in six.iteritems(offsets):
            fetches.append(
                FetchRequest(topic_partition[0], topic_partition[1], offset,
                             max_bytes))

        # client.send_fetch_request will collect topic/partition requests by leader
        # and send each group as a single FetchRequest to the correct broker
        try:
            responses = self._client.send_fetch_request(
                fetches,
                max_wait_time=max_wait_time,
                min_bytes=min_bytes,
                fail_on_error=False)
        except FailedPayloadsError:
            logger.warning(
                'FailedPayloadsError attempting to fetch data from kafka')
            self._refresh_metadata_on_error()
            return

        for resp in responses:
            topic_partition = (resp.topic, resp.partition)
            try:
                check_error(resp)
            except OffsetOutOfRangeError:
                logger.warning(
                    'OffsetOutOfRange: topic %s, partition %d, offset %d '
                    '(Highwatermark: %d)', resp.topic, resp.partition,
                    offsets[topic_partition], resp.highwaterMark)
                # Reset offset
                self._offsets.fetch[
                    topic_partition] = self._reset_partition_offset(
                        topic_partition)
                continue

            except NotLeaderForPartitionError:
                logger.warning(
                    "NotLeaderForPartitionError for %s - %d. "
                    "Metadata may be out of date", resp.topic, resp.partition)
                self._refresh_metadata_on_error()
                continue

            except RequestTimedOutError:
                logger.warning("RequestTimedOutError for %s - %d", resp.topic,
                               resp.partition)
                continue

            # Track server highwater mark
            self._offsets.highwater[topic_partition] = resp.highwaterMark

            # Yield each message
            # Kafka-python could raise an exception during iteration
            # we are not catching -- user will need to address
            for (offset, message) in resp.messages:
                # deserializer_class could raise an exception here
                msg = KafkaMessage(
                    resp.topic, resp.partition, offset, message.key,
                    self._config['deserializer_class'](message.value))

                # Only increment fetch offset if we safely got the message and deserialized
                self._offsets.fetch[topic_partition] = offset + 1

                # Then yield to user
                yield msg
Example #33
    def _fetch(self):
        # Create fetch request payloads for all the partitions
        partitions = dict((p, self.buffer_size)
                      for p in self.fetch_offsets.keys())
        while partitions:
            requests = []
            for partition, buffer_size in six.iteritems(partitions):
                requests.append(FetchRequest(self.topic, partition,
                                             self.fetch_offsets[partition],
                                             buffer_size))
            # Send request
            responses = self.client.send_fetch_request(
                requests,
                max_wait_time=int(self.fetch_max_wait_time),
                min_bytes=self.fetch_min_bytes,
                fail_on_error=False
            )

            retry_partitions = {}
            for resp in responses:

                try:
                    check_error(resp)
                except (UnknownTopicOrPartitionError, NotLeaderForPartitionError):
                    self.client.reset_topic_metadata(resp.topic)
                    raise
                except OffsetOutOfRangeError:
                    log.warning("OffsetOutOfRangeError for %s - %d. "
                                "Resetting partition offset...",
                                resp.topic, resp.partition)
                    self.reset_partition_offset(resp.partition)
                    # Retry this partition
                    retry_partitions[resp.partition] = partitions[resp.partition]
                    continue

                partition = resp.partition
                buffer_size = partitions[partition]
                try:
                    for message in resp.messages:
                        if message.offset < self.fetch_offsets[partition]:
                            log.debug('Skipping message %s because its offset is less than the consumer offset',
                                      message)
                            continue
                        # Put the message in our queue
                        self.queue.put((partition, message))
                        self.fetch_offsets[partition] = message.offset + 1
                except ConsumerFetchSizeTooSmall:
                    if (self.max_buffer_size is not None and
                            buffer_size == self.max_buffer_size):
                        log.error("Max fetch size %d too small",
                                  self.max_buffer_size)
                        raise
                    if self.max_buffer_size is None:
                        buffer_size *= 2
                    else:
                        buffer_size = min(buffer_size * 2,
                                          self.max_buffer_size)
                    log.warn("Fetch size too small, increase to %d (2x) "
                             "and retry", buffer_size)
                    retry_partitions[partition] = buffer_size
                except ConsumerNoMoreData as e:
                    log.debug("Iteration was ended by %r", e)
                except StopIteration:
                    # Stop iterating through this partition
                    log.debug("Done iterating over partition %s" % partition)
            partitions = retry_partitions
Example #34
    def fetch_messages(self):
        """Sends FetchRequests for all topic/partitions set for consumption

        Returns:
            Generator that yields KafkaMessage structs
            after deserializing with the configured `deserializer_class`

        Note:
            Refreshes metadata on errors, and resets fetch offset on
            OffsetOutOfRange, per the configured `auto_offset_reset` policy

        See Also:
            Key KafkaConsumer configuration parameters:
            * `fetch_message_max_bytes`
            * `fetch_max_wait_ms`
            * `fetch_min_bytes`
            * `deserializer_class`
            * `auto_offset_reset`

        """

        max_bytes = self._config['fetch_message_max_bytes']
        max_wait_time = self._config['fetch_wait_max_ms']
        min_bytes = self._config['fetch_min_bytes']

        # Get current fetch offsets
        offsets = self._offsets.fetch
        if not offsets:
            if not self._topics:
                raise KafkaConfigurationError('No topics or partitions configured')
            raise KafkaConfigurationError('No fetch offsets found when calling fetch_messages')

        fetches = []
        for topic_partition, offset in six.iteritems(offsets):
            fetches.append(FetchRequest(topic_partition[0], topic_partition[1], offset, max_bytes))

        # client.send_fetch_request will collect topic/partition requests by leader
        # and send each group as a single FetchRequest to the correct broker
        try:
            responses = self._client.send_fetch_request(fetches,
                                                        max_wait_time=max_wait_time,
                                                        min_bytes=min_bytes,
                                                        fail_on_error=False)
        except FailedPayloadsError:
            logger.warning('FailedPayloadsError attempting to fetch data from kafka')
            self._refresh_metadata_on_error()
            return

        for resp in responses:
            topic_partition = (resp.topic, resp.partition)
            try:
                check_error(resp)
            except OffsetOutOfRangeError:
                logger.warning('OffsetOutOfRange: topic %s, partition %d, offset %d '
                               '(Highwatermark: %d)',
                               resp.topic, resp.partition,
                               offsets[topic_partition], resp.highwaterMark)
                # Reset offset
                self._offsets.fetch[topic_partition] = self._reset_partition_offset(topic_partition)
                continue

            except NotLeaderForPartitionError:
                logger.warning("NotLeaderForPartitionError for %s - %d. "
                               "Metadata may be out of date",
                               resp.topic, resp.partition)
                self._refresh_metadata_on_error()
                continue

            except RequestTimedOutError:
                logger.warning("RequestTimedOutError for %s - %d",
                               resp.topic, resp.partition)
                continue

            # Track server highwater mark
            self._offsets.highwater[topic_partition] = resp.highwaterMark

            # Yield each message
            # Kafka-python could raise an exception during iteration
            # we are not catching -- user will need to address
            for (offset, message) in resp.messages:
                # deserializer_class could raise an exception here
                msg = KafkaMessage(resp.topic,
                                   resp.partition,
                                   offset, message.key,
                                   self._config['deserializer_class'](message.value))

                if offset < self._offsets.fetch[topic_partition]:
                    logger.debug('Skipping message %s because its offset is less than the consumer offset',
                                 msg)
                    continue
                # Only increment fetch offset if we safely got the message and deserialized
                self._offsets.fetch[topic_partition] = offset + 1

                # Then yield to user
                yield msg
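
Several of the fetch loops above (Examples #27, #29, #33 and #34 among them) guard against the broker returning messages with offsets below the requested fetch offset, which can happen with compressed message sets; such messages are skipped, and the fetch offset only advances past messages actually accepted. A self-contained sketch of that guard; the namedtuple is a hypothetical stand-in for a fetched message.

from collections import namedtuple

Message = namedtuple('Message', ['offset', 'value'])

def accept_messages(messages, fetch_offset):
    """Yield (message, next_fetch_offset), skipping already-consumed offsets."""
    for msg in messages:
        if msg.offset < fetch_offset:
            continue                      # earlier than requested: skip it
        fetch_offset = msg.offset + 1     # advance past the accepted message
        yield msg, fetch_offset

# With fetch_offset=5, messages at offsets 3 and 4 are skipped; after the
# message at offset 5 is accepted, the next fetch position becomes 6.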