示例#1
0
    async def _verify_remaining(self, now: float,
                                stats: RecoveryStatsMapping) -> None:
        consumer = self.app.consumer
        active_events_received_at = self._active_events_received_at
        recovery_started_at = self._recovery_started_at
        if recovery_started_at is None:
            return  # we already log about this in _publish_stats
        secs_since_started = now - recovery_started_at

        last_flush_at = self._last_flush_at
        if last_flush_at is None:
            if secs_since_started >= self.flush_timeout_secs:
                self.log.warning(
                    "Recovery has not flushed buffers since "
                    "recovery startted (started %s). "
                    "Current total buffer size: %r",
                    humanize_seconds_ago(secs_since_started),
                    self._current_total_buffer_size(),
                )
        else:
            secs_since_last_flush = now - last_flush_at
            if secs_since_last_flush >= self.flush_timeout_secs:
                self.log.warning(
                    "Recovery has not flushed buffers in the last %r "
                    "seconds (last flush was %s). "
                    "Current total buffer size: %r",
                    self.flush_timeout_secs,
                    humanize_seconds_ago(secs_since_last_flush),
                    self._current_total_buffer_size(),
                )

        for tp in stats:
            await self.sleep(0)
            if self.should_stop:
                break
            if not self.in_recovery:
                break
            consumer.verify_recovery_event_path(now, tp)
            secs_since_started = now - recovery_started_at

            last_event_received = active_events_received_at.get(tp)
            if last_event_received is None:
                if secs_since_started >= self.event_timeout_secs:
                    self.log.warning(
                        "No event received for active tp %r since recovery "
                        "start (started %s)",
                        tp,
                        humanize_seconds_ago(secs_since_started),
                    )
                continue

            secs_since_received = now - last_event_received
            if secs_since_received >= self.event_timeout_secs:
                self.log.warning(
                    "No event received for active tp %r in the last %r "
                    "seconds (last event received %s)",
                    tp,
                    self.event_timeout_secs,
                    humanize_seconds_ago(secs_since_received),
                )
示例#2
0
    def _verify_aiokafka_event_path(self, now: float, tp: TP) -> bool:
        """Verify that :pypi:`aiokafka` event path is working.

        Returns :const:`True` if any error was logged.
        """
        parent = cast(Consumer, self.consumer)
        consumer = self._ensure_consumer()
        secs_since_started = now - self.time_started
        aiotp = parent._new_topicpartition(tp.topic, tp.partition)

        request_at = consumer.records_last_request.get(aiotp)
        if request_at is None:
            if secs_since_started >= self.tp_fetch_request_timeout_secs:
                # NO FETCH REQUEST SENT AT ALL SINCE WORKER START
                self.log.error(
                    SLOW_PROCESSING_NO_FETCH_SINCE_START,
                    tp,
                    humanize_seconds_ago(secs_since_started),
                )
            return True

        response_at = consumer.records_last_response.get(aiotp)
        if response_at is None:
            if secs_since_started >= self.tp_fetch_response_timeout_secs:
                # NO FETCH RESPONSE RECEIVED AT ALL SINCE WORKER START
                self.log.error(
                    SLOW_PROCESSING_NO_RESPONSE_SINCE_START,
                    tp,
                    humanize_seconds_ago(secs_since_started),
                )
            return True

        secs_since_request = now - request_at
        if secs_since_request >= self.tp_fetch_request_timeout_secs:
            # NO REQUEST SENT BY AIOKAFKA IN THE LAST n SECONDS
            self.log.error(
                SLOW_PROCESSING_NO_RECENT_FETCH,
                tp,
                humanize_seconds_ago(secs_since_request),
            )
            return True

        secs_since_response = now - response_at
        if secs_since_response >= self.tp_fetch_response_timeout_secs:
            # NO RESPONSE RECEIVED FROM KAKFA IN THE LAST n SECONDS
            self.log.error(
                SLOW_PROCESSING_NO_RECENT_RESPONSE,
                tp,
                humanize_seconds_ago(secs_since_response),
            )
            return True
        return False
示例#3
0
    def verify_event_path(self, now: float, tp: TP) -> None:
        # long function ahead, but not difficult to test
        # as it always returns as soon as some condition is met.

        parent = cast(Consumer, self.consumer)
        app = parent.app
        consumer = self._ensure_consumer()
        monitor = app.monitor
        acks_enabled_for = app.topics.acks_enabled_for
        aiotp = parent._new_topicpartition(tp.topic, tp.partition)
        secs_since_started = now - self.time_started

        request_at = consumer.records_last_request.get(aiotp)
        if request_at is None:
            if secs_since_started >= self.tp_fetch_request_timeout_secs:
                # NO FETCH REQUEST SENT AT ALL SINCE WORKER START
                self.log.error(
                    SLOW_PROCESSING_NO_FETCH_SINCE_START,
                    tp,
                    humanize_seconds_ago(secs_since_started),
                )
            return None

        response_at = consumer.records_last_response.get(aiotp)
        if response_at is None:
            if secs_since_started >= self.tp_fetch_response_timeout_secs:
                # NO FETCH RESPONSE RECEIVED AT ALL SINCE WORKER START
                self.log.error(
                    SLOW_PROCESSING_NO_RESPONSE_SINCE_START,
                    tp,
                    humanize_seconds_ago(secs_since_started),
                )
            return None

        secs_since_request = now - request_at
        if secs_since_request >= self.tp_fetch_request_timeout_secs:
            # NO REQUEST SENT BY AIOKAFKA IN THE LAST n SECONDS
            self.log.error(
                SLOW_PROCESSING_NO_RECENT_FETCH,
                tp,
                humanize_seconds_ago(secs_since_request),
            )
            return None

        secs_since_response = now - response_at
        if secs_since_response >= self.tp_fetch_response_timeout_secs:
            # NO RESPONSE RECEIVED FROM KAKFA IN THE LAST n SECONDS
            self.log.error(
                SLOW_PROCESSING_NO_RECENT_RESPONSE,
                tp,
                humanize_seconds_ago(secs_since_response),
            )
            return None

        if monitor is not None:  # need for .stream_inbound_time
            highwater = self.highwater(tp)
            committed_offset = parent._committed_offset.get(tp)
            has_acks = acks_enabled_for(tp.topic)
            if highwater is None:
                if secs_since_started >= self.tp_stream_timeout_secs:
                    # AIOKAFKA HAS NOT UPDATED HIGHWATER SINCE STARTING
                    self.log.error(
                        SLOW_PROCESSING_NO_HIGHWATER_SINCE_START,
                        tp,
                        humanize_seconds_ago(secs_since_started),
                    )
                return None

            if has_acks and committed_offset is not None:
                if highwater > committed_offset:
                    inbound = monitor.stream_inbound_time.get(tp)
                    if inbound is None:
                        if secs_since_started >= self.tp_stream_timeout_secs:
                            # AIOKAFKA IS FETCHING BUT STREAM IS NOT
                            # PROCESSING EVENTS (no events at all since
                            # start).
                            self._log_slow_processing_stream(
                                SLOW_PROCESSING_STREAM_IDLE_SINCE_START,
                                tp,
                                humanize_seconds_ago(secs_since_started),
                            )
                        return None

                    secs_since_stream = now - inbound
                    if secs_since_stream >= self.tp_stream_timeout_secs:
                        # AIOKAFKA IS FETCHING, AND STREAM WAS WORKING
                        # BEFORE BUT NOW HAS STOPPED PROCESSING
                        # (or processing of an event in the stream takes
                        #  longer than tp_stream_timeout_secs).
                        self._log_slow_processing_stream(
                            SLOW_PROCESSING_STREAM_IDLE,
                            tp,
                            humanize_seconds_ago(secs_since_stream),
                        )
                        return None

                    last_commit = self.tp_last_committed_at.get(tp)
                    if last_commit is None:
                        if secs_since_started >= self.tp_commit_timeout_secs:
                            # AIOKAFKA IS FETCHING AND STREAM IS PROCESSING
                            # BUT WE HAVE NOT COMMITTED ANYTHING SINCE WORKER
                            # START.
                            self._log_slow_processing_commit(
                                SLOW_PROCESSING_NO_COMMIT_SINCE_START,
                                tp,
                                humanize_seconds_ago(secs_since_started),
                            )
                            return None
                    else:
                        secs_since_commit = now - last_commit
                        if secs_since_commit >= self.tp_commit_timeout_secs:
                            # AIOKAFKA IS FETCHING AND STREAM IS PROCESSING
                            # BUT WE HAVE NOT COMITTED ANYTHING IN A WHILE
                            # (commit offset is not advancing).
                            self._log_slow_processing_commit(
                                SLOW_PROCESSING_NO_RECENT_COMMIT,
                                tp,
                                humanize_seconds_ago(secs_since_commit),
                            )
                            return None
示例#4
0
    def verify_event_path(self, now: float, tp: TP) -> None:
        # long function ahead, but not difficult to test
        # as it always returns as soon as some condition is met.
        if self._verify_aiokafka_event_path(now, tp):
            # already logged error.
            return None
        parent = cast(Consumer, self.consumer)
        app = parent.app
        monitor = app.monitor
        acks_enabled_for = app.topics.acks_enabled_for
        secs_since_started = now - self.time_started

        if monitor is not None:  # need for .stream_inbound_time
            highwater = self.highwater(tp)
            committed_offset = parent._committed_offset.get(tp)
            has_acks = acks_enabled_for(tp.topic)
            if highwater is None:
                if secs_since_started >= self.tp_stream_timeout_secs:
                    # AIOKAFKA HAS NOT UPDATED HIGHWATER SINCE STARTING
                    self.log.error(
                        SLOW_PROCESSING_NO_HIGHWATER_SINCE_START,
                        tp,
                        humanize_seconds_ago(secs_since_started),
                    )
                return None

            if has_acks and committed_offset is not None:
                if highwater > committed_offset:
                    inbound = monitor.stream_inbound_time.get(tp)
                    if inbound is None:
                        if secs_since_started >= self.tp_stream_timeout_secs:
                            # AIOKAFKA IS FETCHING BUT STREAM IS NOT
                            # PROCESSING EVENTS (no events at all since
                            # start).
                            self._log_slow_processing_stream(
                                SLOW_PROCESSING_STREAM_IDLE_SINCE_START,
                                tp,
                                humanize_seconds_ago(secs_since_started),
                            )
                        return None

                    secs_since_stream = now - inbound
                    if secs_since_stream >= self.tp_stream_timeout_secs:
                        # AIOKAFKA IS FETCHING, AND STREAM WAS WORKING
                        # BEFORE BUT NOW HAS STOPPED PROCESSING
                        # (or processing of an event in the stream takes
                        #  longer than tp_stream_timeout_secs).
                        self._log_slow_processing_stream(
                            SLOW_PROCESSING_STREAM_IDLE,
                            tp,
                            humanize_seconds_ago(secs_since_stream),
                        )
                        return None

                    last_commit = self.tp_last_committed_at.get(tp)
                    if last_commit is None:
                        if secs_since_started >= self.tp_commit_timeout_secs:
                            # AIOKAFKA IS FETCHING AND STREAM IS PROCESSING
                            # BUT WE HAVE NOT COMMITTED ANYTHING SINCE WORKER
                            # START.
                            self._log_slow_processing_commit(
                                SLOW_PROCESSING_NO_COMMIT_SINCE_START,
                                tp,
                                humanize_seconds_ago(secs_since_started),
                            )
                            return None
                    else:
                        secs_since_commit = now - last_commit
                        if secs_since_commit >= self.tp_commit_timeout_secs:
                            # AIOKAFKA IS FETCHING AND STREAM IS PROCESSING
                            # BUT WE HAVE NOT COMITTED ANYTHING IN A WHILE
                            # (commit offset is not advancing).
                            self._log_slow_processing_commit(
                                SLOW_PROCESSING_NO_RECENT_COMMIT,
                                tp,
                                humanize_seconds_ago(secs_since_commit),
                            )
                            return None
示例#5
0
    def verify_event_path(self, now: float, tp: TP) -> None:
        # long function ahead, but not difficult to test
        # as it always returns as soon as some condition is met.

        parent = cast(Consumer, self.consumer)
        app = parent.app
        consumer = self._ensure_consumer()
        monitor = app.monitor
        acks_enabled_for = app.topics.acks_enabled_for
        aiotp = parent._new_topicpartition(tp.topic, tp.partition)
        secs_since_started = now - self.time_started

        request_at = consumer.records_last_request.get(aiotp)
        if request_at is None:
            if secs_since_started >= self.tp_fetch_request_timeout_secs:
                # NO FETCH REQUEST SENT AT ALL SINCE WORKER START
                self.log.error(
                    'Aiokafka has not sent fetch request for %r since start ',
                    '(started %s)',
                    tp,
                    humanize_seconds_ago(secs_since_started),
                )
            return None

        response_at = consumer.records_last_response.get(aiotp)
        if response_at is None:
            if secs_since_started >= self.tp_fetch_response_timeout_secs:
                # NO FETCH RESPONSE RECEIVED AT ALL SINCE WORKER START
                self.log.error(
                    'Aiokafka has not received fetch response for %r since '
                    'start (started %s)',
                    tp,
                    humanize_seconds_ago(secs_since_started),
                )
            return None

        secs_since_request = now - request_at
        if secs_since_request >= self.tp_fetch_request_timeout_secs:
            # NO REQUEST SENT BY AIOKAFKA IN THE LAST n SECONDS
            self.log.error(
                'Aiokafka stopped fetching from %r (last done %s)',
                tp,
                humanize_seconds_ago(secs_since_request),
            )
            return None

        secs_since_response = now - response_at
        if secs_since_response >= self.tp_fetch_response_timeout_secs:
            # NO RESPONSE RECEIVED FROM KAKFA IN THE LAST n SECONDS
            self.log.error(
                'Broker stopped responding to fetch requests for %r '
                '(last responded %s)',
                tp,
                humanize_seconds_ago(secs_since_response),
            )
            return None

        if monitor is not None:  # need for .stream_inbound_time
            highwater = self.highwater(tp)
            committed_offset = parent._committed_offset.get(tp)
            has_acks = acks_enabled_for(tp.topic)
            if highwater is None:
                if secs_since_started >= self.tp_stream_timeout_secs:
                    # AIOKAFKA HAS NOT UPDATED HIGHWATER SINCE STARTING
                    self.log.error(
                        'Highwater not yet available for %r ',
                        '(started %s)',
                        tp,
                        humanize_seconds_ago(secs_since_started),
                    )
                return None

            if has_acks and committed_offset is not None:
                if highwater > committed_offset:
                    inbound = monitor.stream_inbound_time.get(tp)
                    if inbound is None:
                        if secs_since_started >= self.tp_stream_timeout_secs:
                            # AIOKAFKA IS FETCHING BUT STREAM IS NOT
                            # PROCESSING EVENTS (no events at all since
                            # start).
                            self._log_slow_processing_stream(
                                'Stream has not started processing %r ',
                                '(started %s)',
                                tp,
                                humanize_seconds_ago(secs_since_started),
                            )
                        return None

                    secs_since_stream = now - inbound
                    if secs_since_stream >= self.tp_stream_timeout_secs:
                        # AIOKAFKA IS FETCHING, AND STREAM WAS WORKING
                        # BEFORE BUT NOW HAS STOPPED PROCESSING
                        # (or processing of an event in the stream takes
                        #  longer than tp_stream_timeout_secs).
                        self._log_slow_processing_stream(
                            'Stream stopped processing, or is slow for %r '
                            '(last inbound %s)',
                            tp,
                            humanize_seconds_ago(secs_since_stream),
                        )
                        return None

                    last_commit = self.tp_last_committed_at.get(tp)
                    if last_commit is None:
                        if secs_since_started >= self.tp_commit_timeout_secs:
                            # AIOKAFKA IS FETCHING AND STREAM IS PROCESSING
                            # BUT WE HAVE NOT COMMITTED ANYTHING SINCE WORKER
                            # START.
                            self._log_slow_processing_commit(
                                'Has not committed %r at all since worker '
                                'start (started %s)',
                                tp,
                                humanize_seconds_ago(secs_since_started),
                            )
                            return None
                    else:
                        secs_since_commit = now - last_commit
                        if secs_since_commit >= self.tp_commit_timeout_secs:
                            # AIOKAFKA IS FETCHING AND STREAM IS PROCESSING
                            # BUT WE HAVE NOT COMITTED ANYTHING IN A WHILE
                            # (commit offset is not advancing).
                            self._log_slow_processing_commit(
                                'Has not committed %r (last commit %s).',
                                'broker_commit_livelock_soft_timeout',
                                tp,
                                humanize_seconds_ago(secs_since_commit),
                            )
                            return None