def update_metadata(self, metadata):
    # In the common case where we ask for a single topic and get back an
    # error, we should fail the future
    if len(metadata.topics) == 1 and metadata.topics[0][0] != 0:
        error_code, topic, _ = metadata.topics[0]
        error = Errors.for_code(error_code)(topic)
        return self.failed_update(error)

    if not metadata.brokers:
        log.warning("No broker metadata found in MetadataResponse")

    for node_id, host, port in metadata.brokers:
        self._brokers.update({
            node_id: BrokerMetadata(node_id, host, port)
        })

    # Drop any UnknownTopic, InvalidTopic, and TopicAuthorizationFailed
    # but retain LeaderNotAvailable because it means topic is initializing
    self._partitions.clear()
    self._broker_partitions.clear()

    for error_code, topic, partitions in metadata.topics:
        error_type = Errors.for_code(error_code)
        if error_type is Errors.NoError:
            self._partitions[topic] = {}
            for _, partition, leader, _, _ in partitions:
                self._partitions[topic][partition] = leader
                if leader != -1:
                    self._broker_partitions[leader].add(
                        TopicPartition(topic, partition))
        elif error_type is Errors.LeaderNotAvailableError:
            log.error("Topic %s is not available during auto-create"
                      " initialization", topic)
        elif error_type is Errors.UnknownTopicOrPartitionError:
            log.error("Topic %s not found in cluster metadata", topic)
        elif error_type is Errors.TopicAuthorizationFailedError:
            log.error("Topic %s is not authorized for this client", topic)
        elif error_type is Errors.InvalidTopicError:
            log.error("'%s' is not a valid topic name", topic)
        else:
            log.error("Error fetching metadata for topic %s: %s",
                      topic, error_type)

    if self._future:
        self._future.success(self)
    self._future = None
    self._need_update = False
    self._version += 1
    now = time.time() * 1000
    self._last_refresh_ms = now
    self._last_successful_refresh_ms = now
    log.debug("Updated cluster metadata version %d to %s",
              self._version, self)

    for listener in self._listeners:
        listener(self)
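# Illustrative sketch, not the library's actual implementation: every handler
# in this section resolves broker error codes through
# ``Errors.for_code(error_code)``. A minimal registry mapping protocol error
# codes to exception classes could look roughly like this; the class and
# function names below are hypothetical stand-ins.
class _KafkaError(Exception):
    errno = -1          # unknown / unmapped server error
    retriable = False

class _NoError(_KafkaError):
    errno = 0

class _UnknownTopicOrPartitionError(_KafkaError):
    errno = 3
    retriable = True

_CODE_TO_ERROR = {cls.errno: cls
                  for cls in (_NoError, _UnknownTopicOrPartitionError)}

def _for_code(error_code):
    # Unknown codes fall back to the generic error class.
    return _CODE_TO_ERROR.get(error_code, _KafkaError)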
def _handle_heartbeat_response(self, future, response):
    #self.sensors.heartbeat_latency.record(response.requestLatencyMs())
    error_type = Errors.for_code(response.error_code)
    if error_type is Errors.NoError:
        log.debug("Received successful heartbeat response.")
        future.success(None)
    elif error_type in (Errors.GroupCoordinatorNotAvailableError,
                        Errors.NotCoordinatorForGroupError):
        log.info("Heartbeat failed: coordinator is either not started or"
                 " not valid; will refresh metadata and retry")
        self.coordinator_dead()
        future.failure(error_type())
    elif error_type is Errors.RebalanceInProgressError:
        log.info("Heartbeat failed: group is rebalancing; re-joining group")
        self.rejoin_needed = True
        future.failure(error_type())
    elif error_type is Errors.IllegalGenerationError:
        log.info("Heartbeat failed: local generation id is not current;"
                 " re-joining group")
        self.rejoin_needed = True
        future.failure(error_type())
    elif error_type is Errors.UnknownMemberIdError:
        log.info("Heartbeat failed: local member_id was not recognized;"
                 " resetting and re-joining group")
        self.member_id = JoinGroupRequest.UNKNOWN_MEMBER_ID
        self.rejoin_needed = True
        future.failure(error_type())
    elif error_type is Errors.GroupAuthorizationFailedError:
        error = error_type(self.group_id)
        log.error("Heartbeat failed: authorization error: %s", error)
        future.failure(error)
    else:
        error = error_type()
        log.error("Heartbeat failed: Unhandled error: %s", error)
        future.failure(error)
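# Hedged sketch, not the library's actual Future implementation: the
# ``future`` objects driven by the handlers in this section are assumed to be
# one-shot result holders with ``success``/``failure`` plus callback chaining
# (``chain`` is used by the join-group handler further below). The class and
# method bodies here are illustrative stand-ins.
class _MiniFuture:
    def __init__(self):
        self.is_done = False
        self.value = None
        self.exception = None
        self._callbacks = []
        self._errbacks = []

    def success(self, value):
        # Resolve once with a value and fire any registered callbacks.
        assert not self.is_done, 'future already resolved'
        self.is_done, self.value = True, value
        for fn in self._callbacks:
            fn(value)
        return self

    def failure(self, exception):
        # Resolve once with an error and fire any registered errbacks.
        assert not self.is_done, 'future already resolved'
        self.is_done, self.exception = True, exception
        for fn in self._errbacks:
            fn(exception)
        return self

    def add_callback(self, fn):
        if self.is_done and self.exception is None:
            fn(self.value)
        else:
            self._callbacks.append(fn)
        return self

    def add_errback(self, fn):
        if self.is_done and self.exception is not None:
            fn(self.exception)
        else:
            self._errbacks.append(fn)
        return self

    def chain(self, other):
        # Propagate this future's eventual outcome into another future.
        self.add_callback(other.success)
        self.add_errback(other.failure)
        return self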
def _handle_offset_fetch_response(self, future, response):
    offsets = {}
    for topic, partitions in response.topics:
        for partition, offset, metadata, error_code in partitions:
            tp = TopicPartition(topic, partition)
            error_type = Errors.for_code(error_code)
            if error_type is not Errors.NoError:
                error = error_type()
                log.debug("Error fetching offset for %s: %s", tp, error)
                if error_type is Errors.GroupLoadInProgressError:
                    # just retry
                    future.failure(error)
                elif error_type is Errors.NotCoordinatorForGroupError:
                    # re-discover the coordinator and retry
                    self.coordinator_dead()
                    future.failure(error)
                elif error_type in (Errors.UnknownMemberIdError,
                                    Errors.IllegalGenerationError):
                    # need to re-join group
                    self._subscription.mark_for_reassignment()
                    future.failure(error)
                elif error_type is Errors.UnknownTopicOrPartitionError:
                    log.warning("OffsetFetchRequest -- unknown topic %s",
                                topic)
                    continue
                else:
                    log.error("Unknown error fetching offsets for %s: %s",
                              tp, error)
                    future.failure(error)
                return
            elif offset >= 0:
                # record the position with the offset
                # (-1 indicates no committed offset to fetch)
                offsets[tp] = OffsetAndMetadata(offset, metadata)
            else:
                log.debug("No committed offset for partition %s", tp)
    future.success(offsets)
def _handle_sync_group_response(self, future, response):
    error_type = Errors.for_code(response.error_code)
    if error_type is Errors.NoError:
        log.debug("Received successful sync group response for group %s: %s",
                  self.group_id, response)
        #self.sensors.syncLatency.record(response.requestLatencyMs())
        future.success(response.member_assignment)
        return

    # Always rejoin on error
    self.rejoin_needed = True
    if error_type is Errors.GroupAuthorizationFailedError:
        future.failure(error_type(self.group_id))
    elif error_type is Errors.RebalanceInProgressError:
        log.info("SyncGroup for group %s failed due to coordinator"
                 " rebalance, rejoining the group", self.group_id)
        future.failure(error_type(self.group_id))
    elif error_type in (Errors.UnknownMemberIdError,
                        Errors.IllegalGenerationError):
        error = error_type()
        log.info("SyncGroup for group %s failed due to %s,"
                 " rejoining the group", self.group_id, error)
        self.member_id = JoinGroupRequest.UNKNOWN_MEMBER_ID
        future.failure(error)
    elif error_type in (Errors.GroupCoordinatorNotAvailableError,
                        Errors.NotCoordinatorForGroupError):
        error = error_type()
        log.info("SyncGroup for group %s failed due to %s, will find new"
                 " coordinator and rejoin", self.group_id, error)
        self.coordinator_dead()
        future.failure(error)
    else:
        error = error_type()
        log.error("Unexpected error from SyncGroup: %s", error)
        future.failure(error)
def _handle_offset_response(self, partition, future, response):
    """Callback for the response of the list offset call above.

    Arguments:
        partition (TopicPartition): The partition that was fetched
        future (Future): the future to update based on response
        response (OffsetResponse): response from the server

    Raises:
        AssertionError: if response does not match partition
    """
    topic, partition_info = response.topics[0]
    assert len(response.topics) == 1 and len(partition_info) == 1, (
        'OffsetResponse should only be for a single topic-partition')

    part, error_code, offsets = partition_info[0]
    assert topic == partition.topic and part == partition.partition, (
        'OffsetResponse partition does not match OffsetRequest partition')

    error_type = Errors.for_code(error_code)
    if error_type is Errors.NoError:
        assert len(offsets) == 1, 'Expected OffsetResponse with one offset'
        offset = offsets[0]
        log.debug("Fetched offset %d for partition %s", offset, partition)
        future.success(offset)
    elif error_type in (Errors.NotLeaderForPartitionError,
                        Errors.UnknownTopicOrPartitionError):
        log.warning("Attempt to fetch offsets for partition %s failed due"
                    " to obsolete leadership information, retrying.",
                    partition)
        future.failure(error_type(partition))
    else:
        log.error("Attempt to fetch offsets for partition %s failed due to:"
                  " %s", partition, error_type)
        future.failure(error_type(partition))
def _proc_offset_request(self, partition, timestamp):
    """Fetch a single offset before the given timestamp for the partition.

    Arguments:
        partition (TopicPartition): partition that needs fetching offset
        timestamp (int): timestamp for fetching offset

    Returns:
        Future: resolves to the corresponding offset
    """
    node_id = self._client.cluster.leader_for_partition(partition)
    if node_id is None:
        log.debug("Partition %s is unknown for fetching offset,"
                  " wait for metadata refresh", partition)
        raise Errors.StaleMetadata(partition)
    elif node_id == -1:
        log.debug(
            "Leader for partition %s unavailable for fetching offset,"
            " wait for metadata refresh", partition)
        raise Errors.LeaderNotAvailableError(partition)

    request = OffsetRequest(
        -1, [(partition.topic, [(partition.partition, timestamp, 1)])]
    )

    if not (yield from self._client.ready(node_id)):
        raise Errors.NodeNotReadyError(node_id)

    response = yield from self._client.send(node_id, request)

    topic, partition_info = response.topics[0]
    assert len(response.topics) == 1 and len(partition_info) == 1, (
        'OffsetResponse should only be for a single topic-partition')

    part, error_code, offsets = partition_info[0]
    assert topic == partition.topic and part == partition.partition, (
        'OffsetResponse partition does not match OffsetRequest partition')

    error_type = Errors.for_code(error_code)
    if error_type is Errors.NoError:
        if not offsets:
            return -1
        assert len(offsets) == 1, 'Expected OffsetResponse with one offset'
        offset = offsets[0]
        log.debug("Fetched offset %d for partition %s", offset, partition)
        return offset
    elif error_type in (Errors.NotLeaderForPartitionError,
                        Errors.UnknownTopicOrPartitionError):
        log.warning("Attempt to fetch offsets for partition %s failed due"
                    " to obsolete leadership information, retrying.",
                    partition)
        raise error_type(partition)
    else:
        log.error(
            "Attempt to fetch offsets for partition %s failed due to:"
            " %s", partition, error_type)
        raise error_type(partition)
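# Usage sketch for the nested OffsetRequest payload built above: the body is
# (replica_id, [(topic, [(partition, timestamp, max_num_offsets)])]), where a
# replica_id of -1 marks a regular client and timestamps -1/-2 conventionally
# request the latest/earliest offset. The helper and topic name below are
# hypothetical.
from collections import namedtuple

_TP = namedtuple('_TP', ['topic', 'partition'])

def _offset_request_payload(tp, timestamp, max_num_offsets=1):
    return (-1, [(tp.topic, [(tp.partition, timestamp, max_num_offsets)])])

# e.g. latest offset for partition 0 of a hypothetical topic:
_example_payload = _offset_request_payload(_TP('example-topic', 0), -1)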
def _handle_join_group_response(self, future, response):
    error_type = Errors.for_code(response.error_code)
    if error_type is Errors.NoError:
        self.member_id = response.member_id
        self.generation = response.generation_id
        self.rejoin_needed = False
        self.protocol = response.group_protocol
        log.info("Joined group '%s' (generation %s) with member_id %s",
                 self.group_id, self.generation, self.member_id)
        #self.sensors.join_latency.record(response.requestLatencyMs())
        if response.leader_id == response.member_id:
            log.info("Elected group leader -- performing partition"
                     " assignments using %s", self.protocol)
            self._on_join_leader(response).chain(future)
        else:
            self._on_join_follower().chain(future)
    elif error_type is Errors.GroupLoadInProgressError:
        log.debug("Attempt to join group %s rejected since coordinator is"
                  " loading the group.", self.group_id)
        # backoff and retry
        future.failure(error_type(response))
    elif error_type is Errors.UnknownMemberIdError:
        # reset the member id and retry immediately
        error = error_type(self.member_id)
        self.member_id = JoinGroupRequest.UNKNOWN_MEMBER_ID
        log.info("Attempt to join group %s failed due to unknown member id,"
                 " resetting and retrying.", self.group_id)
        future.failure(error)
    elif error_type in (Errors.GroupCoordinatorNotAvailableError,
                        Errors.NotCoordinatorForGroupError):
        # re-discover the coordinator and retry with backoff
        self.coordinator_dead()
        log.info("Attempt to join group %s failed due to obsolete "
                 "coordinator information, retrying.", self.group_id)
        future.failure(error_type())
    elif error_type in (Errors.InconsistentGroupProtocolError,
                        Errors.InvalidSessionTimeoutError,
                        Errors.InvalidGroupIdError):
        # log the error and re-throw the exception
        error = error_type(response)
        log.error("Attempt to join group %s failed due to: %s",
                  self.group_id, error)
        future.failure(error)
    elif error_type is Errors.GroupAuthorizationFailedError:
        future.failure(error_type(self.group_id))
    else:
        # unexpected error, throw the exception
        error = error_type()
        log.error("Unexpected error in join group response: %s", error)
        future.failure(error)
def _handle_produce_response(self, batches, response):
    """Handle a produce response."""
    # if we have a response, parse it
    log.debug('Parsing produce response: %r', response)
    if response:
        batches_by_partition = dict(
            [(batch.topic_partition, batch) for batch in batches])
        for topic, partitions in response.topics:
            for partition, error_code, offset in partitions:
                tp = TopicPartition(topic, partition)
                error = Errors.for_code(error_code)
                batch = batches_by_partition[tp]
                self._complete_batch(batch, error, offset)
    else:
        # this is the acks = 0 case, just complete all requests
        for batch in batches:
            self._complete_batch(batch, None, -1)
def _send_req(self, node_id, request):
    """Send request to Kafka node and mark coordinator as `dead`
    in error case.
    """
    try:
        resp = yield from self._client.send(node_id, request)
    except Errors.KafkaError as err:
        log.error(
            'Error sending %s to node %s [%s] -- marking coordinator dead',
            request.__class__.__name__, node_id, err)
        self.coordinator_dead()
        raise err
    else:
        if not hasattr(resp, 'error_code'):
            return resp
        error_type = Errors.for_code(resp.error_code)
        if error_type is Errors.NoError:
            return resp
        else:
            raise error_type()
def add_group_coordinator(self, group, response):
    """Update with metadata for a group coordinator

    Arguments:
        group (str): name of group from GroupCoordinatorRequest
        response (GroupCoordinatorResponse): broker response

    Returns:
        bool: True if metadata is updated, False on error
    """
    log.debug("Updating coordinator for %s: %s", group, response)
    error_type = Errors.for_code(response.error_code)
    if error_type is not Errors.NoError:
        log.error("GroupCoordinatorResponse error: %s", error_type)
        self._groups[group] = -1
        return False

    node_id = response.coordinator_id
    coordinator = BrokerMetadata(
        response.coordinator_id,
        response.host,
        response.port)

    # Assume that group coordinators are just brokers
    # (this is true now, but could diverge in future)
    if node_id not in self._brokers:
        self._brokers[node_id] = coordinator

    # If this happens, either brokers have moved without
    # changing IDs, or our assumption above is wrong
    elif coordinator != self._brokers[node_id]:
        log.error("GroupCoordinator metadata conflicts with existing"
                  " broker metadata. Coordinator: %s, Broker: %s",
                  coordinator, self._brokers[node_id])
        self._groups[group] = node_id
        return False

    log.info("Group coordinator for %s is %s", group, coordinator)
    self._groups[group] = node_id
    return True
def _handle_group_coordinator_response(self, future, response):
    log.debug("Group metadata response %s", response)
    if not self.coordinator_unknown():
        # We already found the coordinator, so ignore the request
        log.debug("Coordinator already known -- ignoring metadata response")
        future.success(self.coordinator_id)
        return

    error_type = Errors.for_code(response.error_code)
    if error_type is Errors.NoError:
        ok = self._client.cluster.add_group_coordinator(self.group_id,
                                                        response)
        if not ok:
            # This could happen if coordinator metadata is different
            # than broker metadata
            future.failure(Errors.IllegalStateError())
            return

        self.coordinator_id = response.coordinator_id
        self._client.ready(self.coordinator_id)

        # start sending heartbeats only if we have a valid generation
        if self.generation > 0:
            self.heartbeat_task.reset()
        future.success(self.coordinator_id)
    elif error_type is Errors.GroupCoordinatorNotAvailableError:
        log.debug("Group Coordinator Not Available; retry")
        future.failure(error_type())
    elif error_type is Errors.GroupAuthorizationFailedError:
        error = error_type(self.group_id)
        log.error("Group Coordinator Request failed: %s", error)
        future.failure(error)
    else:
        error = error_type()
        log.error("Unrecognized failure in Group Coordinator Request: %s",
                  error)
        future.failure(error)
def commit_offsets(self, offsets):
    """Commit specific offsets asynchronously.

    Arguments:
        offsets (dict {TopicPartition: OffsetAndMetadata}): what to commit

    Raises error on failure
    """
    self._subscription.needs_fetch_committed_offsets = True
    if not offsets:
        log.debug('No offsets to commit')
        return True

    if (yield from self.coordinator_unknown()):
        raise Errors.GroupCoordinatorNotAvailableError()
    node_id = self.coordinator_id

    # create the offset commit request
    offset_data = collections.defaultdict(list)
    for tp, offset in offsets.items():
        offset_data[tp.topic].append(
            (tp.partition, offset.offset, offset.metadata))

    request = OffsetCommitRequest(
        self.group_id,
        self.generation,
        self.member_id,
        OffsetCommitRequest.DEFAULT_RETENTION_TIME,
        [(topic, tp_offsets) for topic, tp_offsets in offset_data.items()]
    )

    log.debug(
        "Sending offset-commit request with %s to %s", offsets, node_id)

    response = yield from self._send_req(node_id, request)

    unauthorized_topics = set()
    for topic, partitions in response.topics:
        for partition, error_code in partitions:
            tp = TopicPartition(topic, partition)
            offset = offsets[tp]

            error_type = Errors.for_code(error_code)
            if error_type is Errors.NoError:
                log.debug(
                    "Committed offset %s for partition %s", offset, tp)
                if self._subscription.is_assigned(tp):
                    partition = self._subscription.assignment[tp]
                    partition.committed = offset.offset
            elif error_type is Errors.GroupAuthorizationFailedError:
                log.error("OffsetCommit failed for group %s - %s",
                          self.group_id, error_type.__name__)
                raise error_type()
            elif error_type is Errors.TopicAuthorizationFailedError:
                unauthorized_topics.add(topic)
            elif error_type in (Errors.OffsetMetadataTooLargeError,
                                Errors.InvalidCommitOffsetSizeError):
                # raise the error to the user
                log.info(
                    "OffsetCommit failed for group %s on partition %s"
                    " due to %s, will retry", self.group_id, tp,
                    error_type.__name__)
                raise error_type()
            elif error_type is Errors.GroupLoadInProgressError:
                # just retry
                log.info(
                    "OffsetCommit failed for group %s because group is"
                    " initializing (%s), will retry", self.group_id,
                    error_type.__name__)
                raise error_type()
            elif error_type in (Errors.GroupCoordinatorNotAvailableError,
                                Errors.NotCoordinatorForGroupError,
                                Errors.RequestTimedOutError):
                log.info(
                    "OffsetCommit failed for group %s due to a"
                    " coordinator error (%s), will find new coordinator"
                    " and retry", self.group_id, error_type.__name__)
                self.coordinator_dead()
                raise error_type()
            elif error_type in (Errors.UnknownMemberIdError,
                                Errors.IllegalGenerationError,
                                Errors.RebalanceInProgressError):
                # need to re-join group
                error = error_type(self.group_id)
                log.error(
                    "OffsetCommit failed for group %s due to group"
                    " error (%s), will rejoin", self.group_id, error)
                self._subscription.mark_for_reassignment()
                raise error
            else:
                log.error(
                    "OffsetCommit failed for group %s on partition %s"
                    " with offset %s: %s", self.group_id, tp, offset,
                    error_type.__name__)
                raise error_type()

    if unauthorized_topics:
        log.error("OffsetCommit failed for unauthorized topics %s",
                  unauthorized_topics)
        raise Errors.TopicAuthorizationFailedError(unauthorized_topics)
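# Worked sketch of the grouping step in commit_offsets above: a flat
# {TopicPartition: OffsetAndMetadata} mapping is regrouped per topic into the
# [(topic, [(partition, offset, metadata), ...]), ...] layout the commit
# request expects. The namedtuples and topic name below are hypothetical
# stand-ins for the real types.
import collections

_TP = collections.namedtuple('_TP', ['topic', 'partition'])
_OffsetMeta = collections.namedtuple('_OffsetMeta', ['offset', 'metadata'])

def _group_offsets_by_topic(offsets):
    grouped = collections.defaultdict(list)
    for tp, om in offsets.items():
        grouped[tp.topic].append((tp.partition, om.offset, om.metadata))
    return list(grouped.items())

# Two partitions of the same topic collapse into a single topic entry:
_payload = _group_offsets_by_topic({
    _TP('events', 0): _OffsetMeta(42, ''),
    _TP('events', 1): _OffsetMeta(7, ''),
})
# -> [('events', [(0, 42, ''), (1, 7, '')])]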
def _proc_fetch_request(self, node_id, request):
    needs_wakeup = False
    try:
        response = yield from self._client.send(node_id, request)
    except Errors.KafkaError as err:
        log.error("Failed fetch messages from %s: %s", node_id, err)
        return False
    finally:
        self._in_flight.remove(node_id)

    fetch_offsets = {}
    for topic, partitions in request.topics:
        for partition, offset, _ in partitions:
            fetch_offsets[TopicPartition(topic, partition)] = offset

    for topic, partitions in response.topics:
        for partition, error_code, highwater, messages in partitions:
            tp = TopicPartition(topic, partition)
            error_type = Errors.for_code(error_code)
            if not self._subscriptions.is_fetchable(tp):
                # this can happen when a rebalance happened
                log.debug("Ignoring fetched records for partition %s"
                          " since it is no longer fetchable", tp)
            elif error_type is Errors.NoError:
                self._subscriptions.assignment[tp].highwater = highwater

                # we are interested in this fetch only if the beginning
                # offset matches the current consumed position
                fetch_offset = fetch_offsets[tp]
                partial = None
                if messages and \
                        isinstance(messages[-1][-1], PartialMessage):
                    partial = messages.pop()

                if messages:
                    log.debug(
                        "Adding fetched record for partition %s with"
                        " offset %d to buffered record list",
                        tp, fetch_offset)
                    try:
                        messages = collections.deque(
                            self._unpack_message_set(tp, messages))
                    except Errors.InvalidMessageError as err:
                        self._set_error(tp, err)
                        continue

                    self._records[tp] = FetchResult(
                        tp, messages=messages,
                        subscriptions=self._subscriptions,
                        backoff=self._prefetch_backoff,
                        loop=self._loop)
                    # We added at least 1 successful record
                    needs_wakeup = True
                elif partial:
                    # we did not read a single message from a non-empty
                    # buffer because that message's size is larger than
                    # fetch size, in this case record this exception
                    err = RecordTooLargeError(
                        "There are some messages at [Partition=Offset]: "
                        "%s=%s whose size is larger than the fetch size %s"
                        " and hence cannot be ever returned. "
                        "Increase the fetch size, or decrease the maximum "
                        "message size the broker will allow.",
                        tp, fetch_offset, self._max_partition_fetch_bytes)
                    self._set_error(tp, err)
                    needs_wakeup = True
                    self._subscriptions.assignment[tp].position += 1
            elif error_type in (Errors.NotLeaderForPartitionError,
                                Errors.UnknownTopicOrPartitionError):
                self._client.force_metadata_update()
            elif error_type is Errors.OffsetOutOfRangeError:
                fetch_offset = fetch_offsets[tp]
                if self._subscriptions.has_default_offset_reset_policy():
                    self._subscriptions.need_offset_reset(tp)
                else:
                    err = Errors.OffsetOutOfRangeError({tp: fetch_offset})
                    self._set_error(tp, err)
                    needs_wakeup = True
                log.info(
                    "Fetch offset %s is out of range, resetting offset",
                    fetch_offset)
            elif error_type is Errors.TopicAuthorizationFailedError:
                log.warn("Not authorized to read from topic %s.", tp.topic)
                err = Errors.TopicAuthorizationFailedError(tp.topic)
                self._set_error(tp, err)
                needs_wakeup = True
            else:
                log.warn('Unexpected error while fetching data: %s',
                         error_type.__name__)
    return needs_wakeup
def _handle_fetch_response(self, request, response):
    """The callback for fetch completion"""
    #total_bytes = 0
    #total_count = 0

    fetch_offsets = {}
    for topic, partitions in request.topics:
        for partition, offset, _ in partitions:
            fetch_offsets[TopicPartition(topic, partition)] = offset

    for topic, partitions in response.topics:
        for partition, error_code, highwater, messages in partitions:
            tp = TopicPartition(topic, partition)
            error_type = Errors.for_code(error_code)
            if not self._subscriptions.is_fetchable(tp):
                # this can happen when a rebalance happened or a partition
                # consumption paused while fetch is still in-flight
                log.debug("Ignoring fetched records for partition %s"
                          " since it is no longer fetchable", tp)
            elif error_type is Errors.NoError:
                fetch_offset = fetch_offsets[tp]

                # we are interested in this fetch only if the beginning
                # offset matches the current consumed position
                position = self._subscriptions.assignment[tp].position
                if position is None or position != fetch_offset:
                    log.debug("Discarding fetch response for partition %s"
                              " since its offset %d does not match the"
                              " expected offset %d", tp, fetch_offset,
                              position)
                    continue

                partial = None
                if messages and isinstance(messages[-1][-1], PartialMessage):
                    partial = messages.pop()

                if messages:
                    log.debug("Adding fetched record for partition %s with"
                              " offset %d to buffered record list", tp,
                              position)
                    self._records.append((fetch_offset, tp, messages))
                    #last_offset, _, _ = messages[-1]
                    #self.sensors.records_fetch_lag.record(highwater - last_offset)
                elif partial:
                    # we did not read a single message from a non-empty
                    # buffer because that message's size is larger than
                    # fetch size, in this case record this exception
                    self._record_too_large_partitions[tp] = fetch_offset

                # TODO: bytes metrics
                #self.sensors.record_topic_fetch_metrics(tp.topic, num_bytes, parsed.size());
                #totalBytes += num_bytes;
                #totalCount += parsed.size();
            elif error_type in (Errors.NotLeaderForPartitionError,
                                Errors.UnknownTopicOrPartitionError):
                self._client.cluster.request_update()
            elif error_type is Errors.OffsetOutOfRangeError:
                fetch_offset = fetch_offsets[tp]
                if self._subscriptions.has_default_offset_reset_policy():
                    self._subscriptions.need_offset_reset(tp)
                else:
                    self._offset_out_of_range_partitions[tp] = fetch_offset
                log.info("Fetch offset %s is out of range, resetting offset",
                         fetch_offset)
            elif error_type is Errors.TopicAuthorizationFailedError:
                log.warn("Not authorized to read from topic %s.", tp.topic)
                self._unauthorized_topics.add(tp.topic)
            elif error_type is Errors.UnknownError:
                log.warn("Unknown error fetching data for topic-partition %s",
                         tp)
            else:
                raise error_type('Unexpected error while fetching data')

    # TODO: metrics
def _handle_leave_group_response(self, response):
    error_type = Errors.for_code(response.error_code)
    if error_type is Errors.NoError:
        log.info("LeaveGroup request succeeded")
    else:
        log.error("LeaveGroup request failed: %s", error_type())
def update_metadata(self, metadata):
    """Update cluster state given a MetadataResponse.

    Arguments:
        metadata (MetadataResponse): broker response to a metadata request

    Returns: None
    """
    # In the common case where we ask for a single topic and get back an
    # error, we should fail the future
    if len(metadata.topics) == 1 and metadata.topics[0][0] != 0:
        error_code, topic, _ = metadata.topics[0]
        error = Errors.for_code(error_code)(topic)
        return self.failed_update(error)

    if not metadata.brokers:
        log.warning("No broker metadata found in MetadataResponse")

    for node_id, host, port in metadata.brokers:
        self._brokers.update(
            {node_id: BrokerMetadata(node_id, host, port)})

    _new_partitions = {}
    _new_broker_partitions = collections.defaultdict(set)
    _new_unauthorized_topics = set()

    for error_code, topic, partitions in metadata.topics:
        error_type = Errors.for_code(error_code)
        if error_type is Errors.NoError:
            _new_partitions[topic] = {}
            for p_error, partition, leader, replicas, isr in partitions:
                _new_partitions[topic][partition] = PartitionMetadata(
                    topic=topic, partition=partition, leader=leader,
                    replicas=replicas, isr=isr, error=p_error)
                if leader != -1:
                    _new_broker_partitions[leader].add(
                        TopicPartition(topic, partition))
        elif error_type is Errors.LeaderNotAvailableError:
            log.warning(
                "Topic %s is not available during auto-create"
                " initialization", topic)
        elif error_type is Errors.UnknownTopicOrPartitionError:
            log.error("Topic %s not found in cluster metadata", topic)
        elif error_type is Errors.TopicAuthorizationFailedError:
            log.error("Topic %s is not authorized for this client", topic)
            _new_unauthorized_topics.add(topic)
        elif error_type is Errors.InvalidTopicError:
            log.error("'%s' is not a valid topic name", topic)
        else:
            log.error("Error fetching metadata for topic %s: %s",
                      topic, error_type)

    with self._lock:
        self._partitions = _new_partitions
        self._broker_partitions = _new_broker_partitions
        self.unauthorized_topics = _new_unauthorized_topics

    f = None
    if self._future:
        f = self._future
    self._future = None
    self._need_update = False

    now = time.time() * 1000
    self._last_refresh_ms = now
    self._last_successful_refresh_ms = now

    if f:
        f.success(self)
    log.debug("Updated cluster metadata to %s", self)

    for listener in self._listeners:
        listener(self)
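# Hedged sketch of the listener hook fired at the end of update_metadata:
# listeners are assumed to be plain callables invoked with the cluster object
# after every successful refresh. The stub class and registration method name
# below are hypothetical.
class _ClusterStub:
    def __init__(self):
        self._listeners = set()

    def add_listener(self, listener):
        # Register a callable to be notified after each metadata update.
        self._listeners.add(listener)

    def _notify_listeners(self):
        for listener in self._listeners:
            listener(self)

def _on_metadata_update(cluster):
    print('cluster metadata refreshed:', cluster)

_cluster = _ClusterStub()
_cluster.add_listener(_on_metadata_update)
_cluster._notify_listeners()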
def _send_produce_req(self, node_id, batches):
    """Create produce request to node

    If the producer is configured with `retries` > 0 and the produce
    response contains "failed" partitions, the produce request for those
    partitions is resent to the broker up to `retries` times, waiting
    `retry_timeout_ms` between attempts.

    Arguments:
        node_id (int): kafka broker identifier
        batches (dict): dictionary of {TopicPartition: MessageBatch}
    """
    self._in_flight.add(node_id)
    t0 = self._loop.time()
    while True:
        topics = collections.defaultdict(list)
        for tp, batch in batches.items():
            topics[tp.topic].append((tp.partition, batch.data()))

        request = ProduceRequest(
            required_acks=self._acks,
            timeout=self._request_timeout_ms,
            topics=list(topics.items()))

        try:
            response = yield from self.client.send(node_id, request)
        except KafkaError as err:
            for batch in batches.values():
                if not err.retriable or batch.expired():
                    batch.done(exception=err)
            log.warning(
                "Got error produce response: %s", err)
            if not err.retriable:
                break
        else:
            if response is None:
                # noacks, just "done" batches
                for batch in batches.values():
                    batch.done()
                break

            for topic, partitions in response.topics:
                for partition, error_code, offset in partitions:
                    tp = TopicPartition(topic, partition)
                    error = Errors.for_code(error_code)
                    batch = batches.pop(tp, None)
                    if batch is None:
                        continue

                    if error is Errors.NoError:
                        batch.done(offset)
                    elif not getattr(error, 'retriable', False) or \
                            batch.expired():
                        batch.done(exception=error())
                    else:
                        # Ok, we can retry this batch
                        batches[tp] = batch
                        log.warning(
                            "Got error produce response on topic-partition"
                            " %s, retrying. Error: %s", tp, error)

        if batches:
            yield from asyncio.sleep(
                self._retry_backoff, loop=self._loop)
        else:
            break

    # if batches for node is processed in less than a linger seconds
    # then waiting for the remaining time
    sleep_time = self._linger_time - (self._loop.time() - t0)
    if sleep_time > 0:
        yield from asyncio.sleep(sleep_time, loop=self._loop)

    self._in_flight.remove(node_id)
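# Worked sketch of the linger accounting at the end of _send_produce_req: if
# the produce round-trip for a node finishes sooner than the configured
# linger interval, the sender sleeps for the remainder so new batches have a
# chance to fill up. The coroutine below uses modern async syntax and
# hypothetical values purely for illustration.
import asyncio

async def _sleep_linger_remainder(loop, t0, linger_time):
    # t0 is the loop time captured before the send; only sleep if the
    # elapsed time is still below linger_time.
    sleep_time = linger_time - (loop.time() - t0)
    if sleep_time > 0:
        await asyncio.sleep(sleep_time)

async def _demo():
    loop = asyncio.get_running_loop()
    t0 = loop.time()
    # ... produce request/response exchange would happen here ...
    await _sleep_linger_remainder(loop, t0, linger_time=0.1)

asyncio.run(_demo())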
def update_metadata(self, metadata):
    # In the common case where we ask for a single topic and get back an
    # error, we should fail the future
    if len(metadata.topics) == 1 and metadata.topics[0][0] != 0:
        error_code, topic, _ = metadata.topics[0]
        error = Errors.for_code(error_code)(topic)
        return self.failed_update(error)

    if not metadata.brokers:
        log.warning("No broker metadata found in MetadataResponse")

    for node_id, host, port in metadata.brokers:
        self._brokers.update({
            node_id: BrokerMetadata(node_id, host, port)
        })

    _new_partitions = {}
    _new_broker_partitions = collections.defaultdict(set)
    _new_unauthorized_topics = set()

    for error_code, topic, partitions in metadata.topics:
        error_type = Errors.for_code(error_code)
        if error_type is Errors.NoError:
            _new_partitions[topic] = {}
            for p_error, partition, leader, replicas, isr in partitions:
                _new_partitions[topic][partition] = PartitionMetadata(
                    topic=topic, partition=partition, leader=leader,
                    replicas=replicas, isr=isr, error=p_error)
                if leader != -1:
                    _new_broker_partitions[leader].add(
                        TopicPartition(topic, partition))
        elif error_type is Errors.LeaderNotAvailableError:
            log.warning("Topic %s is not available during auto-create"
                        " initialization", topic)
        elif error_type is Errors.UnknownTopicOrPartitionError:
            log.error("Topic %s not found in cluster metadata", topic)
        elif error_type is Errors.TopicAuthorizationFailedError:
            log.error("Topic %s is not authorized for this client", topic)
            _new_unauthorized_topics.add(topic)
        elif error_type is Errors.InvalidTopicError:
            log.error("'%s' is not a valid topic name", topic)
        else:
            log.error("Error fetching metadata for topic %s: %s",
                      topic, error_type)

    with self._lock:
        self._partitions = _new_partitions
        self._broker_partitions = _new_broker_partitions
        self.unauthorized_topics = _new_unauthorized_topics

    f = None
    if self._future:
        f = self._future
    self._future = None
    self._need_update = False

    now = time.time() * 1000
    self._last_refresh_ms = now
    self._last_successful_refresh_ms = now

    if f:
        f.success(self)
    log.debug("Updated cluster metadata to %s", self)

    for listener in self._listeners:
        listener(self)
def _handle_fetch_response(self, request, response):
    """The callback for fetch completion"""
    #total_bytes = 0
    #total_count = 0

    fetch_offsets = {}
    for topic, partitions in request.topics:
        for partition, offset, _ in partitions:
            fetch_offsets[TopicPartition(topic, partition)] = offset

    for topic, partitions in response.topics:
        for partition, error_code, highwater, messages in partitions:
            tp = TopicPartition(topic, partition)
            error_type = Errors.for_code(error_code)
            if not self._subscriptions.is_fetchable(tp):
                # this can happen when a rebalance happened or a partition
                # consumption paused while fetch is still in-flight
                log.debug(
                    "Ignoring fetched records for partition %s"
                    " since it is no longer fetchable", tp)
            elif error_type is Errors.NoError:
                self._subscriptions.assignment[tp].highwater = highwater

                # we are interested in this fetch only if the beginning
                # offset matches the current consumed position
                fetch_offset = fetch_offsets[tp]
                position = self._subscriptions.assignment[tp].position
                if position is None or position != fetch_offset:
                    log.debug(
                        "Discarding fetch response for partition %s"
                        " since its offset %d does not match the"
                        " expected offset %d", tp, fetch_offset, position)
                    continue

                partial = None
                if messages and isinstance(messages[-1][-1], PartialMessage):
                    partial = messages.pop()

                if messages:
                    log.debug(
                        "Adding fetched record for partition %s with"
                        " offset %d to buffered record list", tp, position)
                    self._records.append((fetch_offset, tp, messages))
                    #last_offset, _, _ = messages[-1]
                    #self.sensors.records_fetch_lag.record(highwater - last_offset)
                elif partial:
                    # we did not read a single message from a non-empty
                    # buffer because that message's size is larger than
                    # fetch size, in this case record this exception
                    self._record_too_large_partitions[tp] = fetch_offset

                # TODO: bytes metrics
                #self.sensors.record_topic_fetch_metrics(tp.topic, num_bytes, parsed.size());
                #totalBytes += num_bytes;
                #totalCount += parsed.size();
            elif error_type in (Errors.NotLeaderForPartitionError,
                                Errors.UnknownTopicOrPartitionError):
                self._client.cluster.request_update()
            elif error_type is Errors.OffsetOutOfRangeError:
                fetch_offset = fetch_offsets[tp]
                if self._subscriptions.has_default_offset_reset_policy():
                    self._subscriptions.need_offset_reset(tp)
                else:
                    self._offset_out_of_range_partitions[tp] = fetch_offset
                log.info(
                    "Fetch offset %s is out of range, resetting offset",
                    fetch_offset)
            elif error_type is Errors.TopicAuthorizationFailedError:
                log.warn("Not authorized to read from topic %s.", tp.topic)
                self._unauthorized_topics.add(tp.topic)
            elif error_type is Errors.UnknownError:
                log.warn(
                    "Unknown error fetching data for topic-partition %s", tp)
            else:
                raise error_type('Unexpected error while fetching data')

    # TODO: metrics
def _handle_offset_commit_response(self, offsets, future, response):
    #self.sensors.commit_latency.record(response.requestLatencyMs())
    unauthorized_topics = set()

    for topic, partitions in response.topics:
        for partition, error_code in partitions:
            tp = TopicPartition(topic, partition)
            offset = offsets[tp]

            error_type = Errors.for_code(error_code)
            if error_type is Errors.NoError:
                log.debug("Committed offset %s for partition %s", offset, tp)
                if self._subscription.is_assigned(tp):
                    self._subscription.assignment[tp].committed = offset.offset
            elif error_type is Errors.GroupAuthorizationFailedError:
                log.error("OffsetCommit failed for group %s - %s",
                          self.group_id, error_type.__name__)
                future.failure(error_type(self.group_id))
                return
            elif error_type is Errors.TopicAuthorizationFailedError:
                unauthorized_topics.add(topic)
            elif error_type in (Errors.OffsetMetadataTooLargeError,
                                Errors.InvalidCommitOffsetSizeError):
                # raise the error to the user
                log.info("OffsetCommit failed for group %s on partition %s"
                         " due to %s, will retry", self.group_id, tp,
                         error_type.__name__)
                future.failure(error_type())
                return
            elif error_type is Errors.GroupLoadInProgressError:
                # just retry
                log.info("OffsetCommit failed for group %s because group is"
                         " initializing (%s), will retry", self.group_id,
                         error_type.__name__)
                future.failure(error_type(self.group_id))
                return
            elif error_type in (Errors.GroupCoordinatorNotAvailableError,
                                Errors.NotCoordinatorForGroupError,
                                Errors.RequestTimedOutError):
                log.info("OffsetCommit failed for group %s due to a"
                         " coordinator error (%s), will find new coordinator"
                         " and retry", self.group_id, error_type.__name__)
                self.coordinator_dead()
                future.failure(error_type(self.group_id))
                return
            elif error_type in (Errors.UnknownMemberIdError,
                                Errors.IllegalGenerationError,
                                Errors.RebalanceInProgressError):
                # need to re-join group
                error = error_type(self.group_id)
                log.error("OffsetCommit failed for group %s due to group"
                          " error (%s), will rejoin", self.group_id, error)
                self._subscription.mark_for_reassignment()
                # Errors.CommitFailedError("Commit cannot be completed due to group rebalance"))
                future.failure(error)
                return
            else:
                log.error("OffsetCommit failed for group %s on partition %s"
                          " with offset %s: %s", self.group_id, tp, offset,
                          error_type.__name__)
                future.failure(error_type())
                return

    if unauthorized_topics:
        log.error("OffsetCommit failed for unauthorized topics %s",
                  unauthorized_topics)
        future.failure(
            Errors.TopicAuthorizationFailedError(unauthorized_topics))
    else:
        future.success(True)
def commit_offsets(self, offsets):
    """Commit specific offsets asynchronously.

    Arguments:
        offsets (dict {TopicPartition: OffsetAndMetadata}): what to commit

    Raises error on failure
    """
    self._subscription.needs_fetch_committed_offsets = True
    if not offsets:
        log.debug('No offsets to commit')
        return True

    if (yield from self.coordinator_unknown()):
        raise Errors.GroupCoordinatorNotAvailableError()
    node_id = self.coordinator_id

    # create the offset commit request
    offset_data = collections.defaultdict(list)
    for tp, offset in offsets.items():
        offset_data[tp.topic].append(
            (tp.partition, offset.offset, offset.metadata))

    request = OffsetCommitRequest(
        self.group_id,
        self.generation,
        self.member_id,
        OffsetCommitRequest.DEFAULT_RETENTION_TIME,
        [(topic, tp_offsets) for topic, tp_offsets in offset_data.items()])

    log.debug("Sending offset-commit request with %s for group %s to %s",
              offsets, self.group_id, node_id)

    response = yield from self._send_req(node_id, request)

    unauthorized_topics = set()
    for topic, partitions in response.topics:
        for partition, error_code in partitions:
            tp = TopicPartition(topic, partition)
            offset = offsets[tp]

            error_type = Errors.for_code(error_code)
            if error_type is Errors.NoError:
                log.debug("Committed offset %s for partition %s", offset, tp)
                if self._subscription.is_assigned(tp):
                    partition = self._subscription.assignment[tp]
                    partition.committed = offset.offset
            elif error_type is Errors.GroupAuthorizationFailedError:
                log.error("OffsetCommit failed for group %s - %s",
                          self.group_id, error_type.__name__)
                raise error_type()
            elif error_type is Errors.TopicAuthorizationFailedError:
                unauthorized_topics.add(topic)
            elif error_type in (Errors.OffsetMetadataTooLargeError,
                                Errors.InvalidCommitOffsetSizeError):
                # raise the error to the user
                log.info(
                    "OffsetCommit failed for group %s on partition %s"
                    " due to %s, will retry", self.group_id, tp,
                    error_type.__name__)
                raise error_type()
            elif error_type is Errors.GroupLoadInProgressError:
                # just retry
                log.info(
                    "OffsetCommit failed for group %s because group is"
                    " initializing (%s), will retry", self.group_id,
                    error_type.__name__)
                raise error_type()
            elif error_type in (Errors.GroupCoordinatorNotAvailableError,
                                Errors.NotCoordinatorForGroupError,
                                Errors.RequestTimedOutError):
                log.info(
                    "OffsetCommit failed for group %s due to a"
                    " coordinator error (%s), will find new coordinator"
                    " and retry", self.group_id, error_type.__name__)
                self.coordinator_dead()
                raise error_type()
            elif error_type in (Errors.UnknownMemberIdError,
                                Errors.IllegalGenerationError,
                                Errors.RebalanceInProgressError):
                # need to re-join group
                error = error_type(self.group_id)
                log.error(
                    "OffsetCommit failed for group %s due to group"
                    " error (%s), will rejoin", self.group_id, error)
                self._subscription.mark_for_reassignment()
                raise error
            else:
                log.error(
                    "OffsetCommit failed for group %s on partition %s"
                    " with offset %s: %s", self.group_id, tp, offset,
                    error_type.__name__)
                raise error_type()

    if unauthorized_topics:
        log.error("OffsetCommit failed for unauthorized topics %s",
                  unauthorized_topics)
        raise Errors.TopicAuthorizationFailedError(unauthorized_topics)
def _proc_fetch_request(self, node_id, request):
    needs_wakeup = False
    try:
        response = yield from self._client.send(node_id, request)
    except Errors.KafkaError as err:
        log.error("Failed fetch messages from %s: %s", node_id, err)
        return False
    finally:
        self._in_flight.remove(node_id)

    fetch_offsets = {}
    for topic, partitions in request.topics:
        for partition, offset, _ in partitions:
            fetch_offsets[TopicPartition(topic, partition)] = offset

    for topic, partitions in response.topics:
        for partition, error_code, highwater, messages in partitions:
            tp = TopicPartition(topic, partition)
            error_type = Errors.for_code(error_code)
            if not self._subscriptions.is_fetchable(tp):
                # this can happen when a rebalance happened
                log.debug("Ignoring fetched records for partition %s"
                          " since it is no longer fetchable", tp)
            elif error_type is Errors.NoError:
                self._subscriptions.assignment[tp].highwater = highwater

                # we are interested in this fetch only if the beginning
                # offset matches the current consumed position
                fetch_offset = fetch_offsets[tp]
                partial = None
                if messages and \
                        isinstance(messages[-1][-1], PartialMessage):
                    partial = messages.pop()

                if messages:
                    log.debug(
                        "Adding fetched record for partition %s with"
                        " offset %d to buffered record list",
                        tp, fetch_offset)
                    try:
                        messages = collections.deque(
                            self._unpack_message_set(tp, messages))
                    except Errors.InvalidMessageError as err:
                        self._set_error(tp, err)
                        continue

                    self._records[tp] = FetchResult(
                        tp, messages=messages,
                        subscriptions=self._subscriptions,
                        backoff=self._prefetch_backoff,
                        loop=self._loop)
                    # We added at least 1 successful record
                    needs_wakeup = True
                elif partial:
                    # we did not read a single message from a non-empty
                    # buffer because that message's size is larger than
                    # fetch size, in this case record this exception
                    err = RecordTooLargeError(
                        "There are some messages at [Partition=Offset]: "
                        "%s=%s whose size is larger than the fetch size %s"
                        " and hence cannot be ever returned. "
                        "Increase the fetch size, or decrease the maximum "
                        "message size the broker will allow.",
                        tp, fetch_offset, self._max_partition_fetch_bytes)
                    self._set_error(tp, err)
                    needs_wakeup = True
                    self._subscriptions.assignment[tp].position += 1
            elif error_type in (Errors.NotLeaderForPartitionError,
                                Errors.UnknownTopicOrPartitionError):
                self._client.force_metadata_update()
            elif error_type is Errors.OffsetOutOfRangeError:
                fetch_offset = fetch_offsets[tp]
                if self._subscriptions.has_default_offset_reset_policy():
                    self._subscriptions.need_offset_reset(tp)
                else:
                    err = Errors.OffsetOutOfRangeError({tp: fetch_offset})
                    self._set_error(tp, err)
                    needs_wakeup = True
                log.info(
                    "Fetch offset %s is out of range, resetting offset",
                    fetch_offset)
            elif error_type is Errors.TopicAuthorizationFailedError:
                log.warn("Not authorized to read from topic %s.", tp.topic)
                err = Errors.TopicAuthorizationFailedError(tp.topic)
                self._set_error(tp, err)
                needs_wakeup = True
            else:
                log.warn('Unexpected error while fetching data: %s',
                         error_type.__name__)
    return needs_wakeup