async def _verify_remaining(self, now: float, stats: RecoveryStatsMapping) -> None: consumer = self.app.consumer active_events_received_at = self._active_events_received_at recovery_started_at = self._recovery_started_at if recovery_started_at is None: return # we already log about this in _publish_stats secs_since_started = now - recovery_started_at last_flush_at = self._last_flush_at if last_flush_at is None: if secs_since_started >= self.flush_timeout_secs: self.log.warning( "Recovery has not flushed buffers since " "recovery startted (started %s). " "Current total buffer size: %r", humanize_seconds_ago(secs_since_started), self._current_total_buffer_size(), ) else: secs_since_last_flush = now - last_flush_at if secs_since_last_flush >= self.flush_timeout_secs: self.log.warning( "Recovery has not flushed buffers in the last %r " "seconds (last flush was %s). " "Current total buffer size: %r", self.flush_timeout_secs, humanize_seconds_ago(secs_since_last_flush), self._current_total_buffer_size(), ) for tp in stats: await self.sleep(0) if self.should_stop: break if not self.in_recovery: break consumer.verify_recovery_event_path(now, tp) secs_since_started = now - recovery_started_at last_event_received = active_events_received_at.get(tp) if last_event_received is None: if secs_since_started >= self.event_timeout_secs: self.log.warning( "No event received for active tp %r since recovery " "start (started %s)", tp, humanize_seconds_ago(secs_since_started), ) continue secs_since_received = now - last_event_received if secs_since_received >= self.event_timeout_secs: self.log.warning( "No event received for active tp %r in the last %r " "seconds (last event received %s)", tp, self.event_timeout_secs, humanize_seconds_ago(secs_since_received), )
def _verify_aiokafka_event_path(self, now: float, tp: TP) -> bool: """Verify that :pypi:`aiokafka` event path is working. Returns :const:`True` if any error was logged. """ parent = cast(Consumer, self.consumer) consumer = self._ensure_consumer() secs_since_started = now - self.time_started aiotp = parent._new_topicpartition(tp.topic, tp.partition) request_at = consumer.records_last_request.get(aiotp) if request_at is None: if secs_since_started >= self.tp_fetch_request_timeout_secs: # NO FETCH REQUEST SENT AT ALL SINCE WORKER START self.log.error( SLOW_PROCESSING_NO_FETCH_SINCE_START, tp, humanize_seconds_ago(secs_since_started), ) return True response_at = consumer.records_last_response.get(aiotp) if response_at is None: if secs_since_started >= self.tp_fetch_response_timeout_secs: # NO FETCH RESPONSE RECEIVED AT ALL SINCE WORKER START self.log.error( SLOW_PROCESSING_NO_RESPONSE_SINCE_START, tp, humanize_seconds_ago(secs_since_started), ) return True secs_since_request = now - request_at if secs_since_request >= self.tp_fetch_request_timeout_secs: # NO REQUEST SENT BY AIOKAFKA IN THE LAST n SECONDS self.log.error( SLOW_PROCESSING_NO_RECENT_FETCH, tp, humanize_seconds_ago(secs_since_request), ) return True secs_since_response = now - response_at if secs_since_response >= self.tp_fetch_response_timeout_secs: # NO RESPONSE RECEIVED FROM KAKFA IN THE LAST n SECONDS self.log.error( SLOW_PROCESSING_NO_RECENT_RESPONSE, tp, humanize_seconds_ago(secs_since_response), ) return True return False
def verify_event_path(self, now: float, tp: TP) -> None: # long function ahead, but not difficult to test # as it always returns as soon as some condition is met. parent = cast(Consumer, self.consumer) app = parent.app consumer = self._ensure_consumer() monitor = app.monitor acks_enabled_for = app.topics.acks_enabled_for aiotp = parent._new_topicpartition(tp.topic, tp.partition) secs_since_started = now - self.time_started request_at = consumer.records_last_request.get(aiotp) if request_at is None: if secs_since_started >= self.tp_fetch_request_timeout_secs: # NO FETCH REQUEST SENT AT ALL SINCE WORKER START self.log.error( SLOW_PROCESSING_NO_FETCH_SINCE_START, tp, humanize_seconds_ago(secs_since_started), ) return None response_at = consumer.records_last_response.get(aiotp) if response_at is None: if secs_since_started >= self.tp_fetch_response_timeout_secs: # NO FETCH RESPONSE RECEIVED AT ALL SINCE WORKER START self.log.error( SLOW_PROCESSING_NO_RESPONSE_SINCE_START, tp, humanize_seconds_ago(secs_since_started), ) return None secs_since_request = now - request_at if secs_since_request >= self.tp_fetch_request_timeout_secs: # NO REQUEST SENT BY AIOKAFKA IN THE LAST n SECONDS self.log.error( SLOW_PROCESSING_NO_RECENT_FETCH, tp, humanize_seconds_ago(secs_since_request), ) return None secs_since_response = now - response_at if secs_since_response >= self.tp_fetch_response_timeout_secs: # NO RESPONSE RECEIVED FROM KAKFA IN THE LAST n SECONDS self.log.error( SLOW_PROCESSING_NO_RECENT_RESPONSE, tp, humanize_seconds_ago(secs_since_response), ) return None if monitor is not None: # need for .stream_inbound_time highwater = self.highwater(tp) committed_offset = parent._committed_offset.get(tp) has_acks = acks_enabled_for(tp.topic) if highwater is None: if secs_since_started >= self.tp_stream_timeout_secs: # AIOKAFKA HAS NOT UPDATED HIGHWATER SINCE STARTING self.log.error( SLOW_PROCESSING_NO_HIGHWATER_SINCE_START, tp, humanize_seconds_ago(secs_since_started), ) return None if has_acks and committed_offset is not None: if highwater > committed_offset: inbound = monitor.stream_inbound_time.get(tp) if inbound is None: if secs_since_started >= self.tp_stream_timeout_secs: # AIOKAFKA IS FETCHING BUT STREAM IS NOT # PROCESSING EVENTS (no events at all since # start). self._log_slow_processing_stream( SLOW_PROCESSING_STREAM_IDLE_SINCE_START, tp, humanize_seconds_ago(secs_since_started), ) return None secs_since_stream = now - inbound if secs_since_stream >= self.tp_stream_timeout_secs: # AIOKAFKA IS FETCHING, AND STREAM WAS WORKING # BEFORE BUT NOW HAS STOPPED PROCESSING # (or processing of an event in the stream takes # longer than tp_stream_timeout_secs). self._log_slow_processing_stream( SLOW_PROCESSING_STREAM_IDLE, tp, humanize_seconds_ago(secs_since_stream), ) return None last_commit = self.tp_last_committed_at.get(tp) if last_commit is None: if secs_since_started >= self.tp_commit_timeout_secs: # AIOKAFKA IS FETCHING AND STREAM IS PROCESSING # BUT WE HAVE NOT COMMITTED ANYTHING SINCE WORKER # START. self._log_slow_processing_commit( SLOW_PROCESSING_NO_COMMIT_SINCE_START, tp, humanize_seconds_ago(secs_since_started), ) return None else: secs_since_commit = now - last_commit if secs_since_commit >= self.tp_commit_timeout_secs: # AIOKAFKA IS FETCHING AND STREAM IS PROCESSING # BUT WE HAVE NOT COMITTED ANYTHING IN A WHILE # (commit offset is not advancing). self._log_slow_processing_commit( SLOW_PROCESSING_NO_RECENT_COMMIT, tp, humanize_seconds_ago(secs_since_commit), ) return None
def verify_event_path(self, now: float, tp: TP) -> None: # long function ahead, but not difficult to test # as it always returns as soon as some condition is met. if self._verify_aiokafka_event_path(now, tp): # already logged error. return None parent = cast(Consumer, self.consumer) app = parent.app monitor = app.monitor acks_enabled_for = app.topics.acks_enabled_for secs_since_started = now - self.time_started if monitor is not None: # need for .stream_inbound_time highwater = self.highwater(tp) committed_offset = parent._committed_offset.get(tp) has_acks = acks_enabled_for(tp.topic) if highwater is None: if secs_since_started >= self.tp_stream_timeout_secs: # AIOKAFKA HAS NOT UPDATED HIGHWATER SINCE STARTING self.log.error( SLOW_PROCESSING_NO_HIGHWATER_SINCE_START, tp, humanize_seconds_ago(secs_since_started), ) return None if has_acks and committed_offset is not None: if highwater > committed_offset: inbound = monitor.stream_inbound_time.get(tp) if inbound is None: if secs_since_started >= self.tp_stream_timeout_secs: # AIOKAFKA IS FETCHING BUT STREAM IS NOT # PROCESSING EVENTS (no events at all since # start). self._log_slow_processing_stream( SLOW_PROCESSING_STREAM_IDLE_SINCE_START, tp, humanize_seconds_ago(secs_since_started), ) return None secs_since_stream = now - inbound if secs_since_stream >= self.tp_stream_timeout_secs: # AIOKAFKA IS FETCHING, AND STREAM WAS WORKING # BEFORE BUT NOW HAS STOPPED PROCESSING # (or processing of an event in the stream takes # longer than tp_stream_timeout_secs). self._log_slow_processing_stream( SLOW_PROCESSING_STREAM_IDLE, tp, humanize_seconds_ago(secs_since_stream), ) return None last_commit = self.tp_last_committed_at.get(tp) if last_commit is None: if secs_since_started >= self.tp_commit_timeout_secs: # AIOKAFKA IS FETCHING AND STREAM IS PROCESSING # BUT WE HAVE NOT COMMITTED ANYTHING SINCE WORKER # START. self._log_slow_processing_commit( SLOW_PROCESSING_NO_COMMIT_SINCE_START, tp, humanize_seconds_ago(secs_since_started), ) return None else: secs_since_commit = now - last_commit if secs_since_commit >= self.tp_commit_timeout_secs: # AIOKAFKA IS FETCHING AND STREAM IS PROCESSING # BUT WE HAVE NOT COMITTED ANYTHING IN A WHILE # (commit offset is not advancing). self._log_slow_processing_commit( SLOW_PROCESSING_NO_RECENT_COMMIT, tp, humanize_seconds_ago(secs_since_commit), ) return None
def verify_event_path(self, now: float, tp: TP) -> None: # long function ahead, but not difficult to test # as it always returns as soon as some condition is met. parent = cast(Consumer, self.consumer) app = parent.app consumer = self._ensure_consumer() monitor = app.monitor acks_enabled_for = app.topics.acks_enabled_for aiotp = parent._new_topicpartition(tp.topic, tp.partition) secs_since_started = now - self.time_started request_at = consumer.records_last_request.get(aiotp) if request_at is None: if secs_since_started >= self.tp_fetch_request_timeout_secs: # NO FETCH REQUEST SENT AT ALL SINCE WORKER START self.log.error( 'Aiokafka has not sent fetch request for %r since start ', '(started %s)', tp, humanize_seconds_ago(secs_since_started), ) return None response_at = consumer.records_last_response.get(aiotp) if response_at is None: if secs_since_started >= self.tp_fetch_response_timeout_secs: # NO FETCH RESPONSE RECEIVED AT ALL SINCE WORKER START self.log.error( 'Aiokafka has not received fetch response for %r since ' 'start (started %s)', tp, humanize_seconds_ago(secs_since_started), ) return None secs_since_request = now - request_at if secs_since_request >= self.tp_fetch_request_timeout_secs: # NO REQUEST SENT BY AIOKAFKA IN THE LAST n SECONDS self.log.error( 'Aiokafka stopped fetching from %r (last done %s)', tp, humanize_seconds_ago(secs_since_request), ) return None secs_since_response = now - response_at if secs_since_response >= self.tp_fetch_response_timeout_secs: # NO RESPONSE RECEIVED FROM KAKFA IN THE LAST n SECONDS self.log.error( 'Broker stopped responding to fetch requests for %r ' '(last responded %s)', tp, humanize_seconds_ago(secs_since_response), ) return None if monitor is not None: # need for .stream_inbound_time highwater = self.highwater(tp) committed_offset = parent._committed_offset.get(tp) has_acks = acks_enabled_for(tp.topic) if highwater is None: if secs_since_started >= self.tp_stream_timeout_secs: # AIOKAFKA HAS NOT UPDATED HIGHWATER SINCE STARTING self.log.error( 'Highwater not yet available for %r ', '(started %s)', tp, humanize_seconds_ago(secs_since_started), ) return None if has_acks and committed_offset is not None: if highwater > committed_offset: inbound = monitor.stream_inbound_time.get(tp) if inbound is None: if secs_since_started >= self.tp_stream_timeout_secs: # AIOKAFKA IS FETCHING BUT STREAM IS NOT # PROCESSING EVENTS (no events at all since # start). self._log_slow_processing_stream( 'Stream has not started processing %r ', '(started %s)', tp, humanize_seconds_ago(secs_since_started), ) return None secs_since_stream = now - inbound if secs_since_stream >= self.tp_stream_timeout_secs: # AIOKAFKA IS FETCHING, AND STREAM WAS WORKING # BEFORE BUT NOW HAS STOPPED PROCESSING # (or processing of an event in the stream takes # longer than tp_stream_timeout_secs). self._log_slow_processing_stream( 'Stream stopped processing, or is slow for %r ' '(last inbound %s)', tp, humanize_seconds_ago(secs_since_stream), ) return None last_commit = self.tp_last_committed_at.get(tp) if last_commit is None: if secs_since_started >= self.tp_commit_timeout_secs: # AIOKAFKA IS FETCHING AND STREAM IS PROCESSING # BUT WE HAVE NOT COMMITTED ANYTHING SINCE WORKER # START. self._log_slow_processing_commit( 'Has not committed %r at all since worker ' 'start (started %s)', tp, humanize_seconds_ago(secs_since_started), ) return None else: secs_since_commit = now - last_commit if secs_since_commit >= self.tp_commit_timeout_secs: # AIOKAFKA IS FETCHING AND STREAM IS PROCESSING # BUT WE HAVE NOT COMITTED ANYTHING IN A WHILE # (commit offset is not advancing). self._log_slow_processing_commit( 'Has not committed %r (last commit %s).', 'broker_commit_livelock_soft_timeout', tp, humanize_seconds_ago(secs_since_commit), ) return None