def _are_subscriptions_identical(self): """ Returns: true, if both potential consumers of partitions and potential partitions that consumers can consume are the same """ if not has_identical_list_elements( list(six.itervalues( self.partition_to_all_potential_consumers))): return False return has_identical_list_elements( list(six.itervalues(self.consumer_to_all_potential_partitions)))
def consumer_thread(i): assert i not in consumers assert i not in stop stop[i] = threading.Event() consumers[i] = KafkaConsumer(topic, bootstrap_servers=connect_str, group_id=group_id, heartbeat_interval_ms=500) while not stop[i].is_set(): for tp, records in six.itervalues(consumers[i].poll(100)): messages[i][tp].extend(records) consumers[i].close() consumers[i] = None stop[i] = None
def assign(cls, cluster, member_metadata): # get all topics and check every memeber has the same all_topics = None for metadata in six.itervalues(member_metadata): if all_topics is None: all_topics = set(metadata.subscription) elif all_topics != set(metadata.subscription): diff = all_topics.symmetric_difference(metadata.subscription) raise UnmergeableTopcis( 'Topic(s) %s do not appear in all members', ', '.join(diff)) # get all partition numbers and check every topic has the same all_partitions = None for topic in all_topics: partitions = cluster.partitions_for_topic(topic) if partitions is None: raise UnmergeableTopcis('No partition metadata for topic %s', topic) if all_partitions is None: all_partitions = set(partitions) elif all_partitions != set(partitions): diff = all_partitions.symmetric_difference(partitions) raise UnmergeableTopcis( 'Partition(s) %s do not appear in all topics', ', '.join(str(p) for p in diff)) all_partitions = sorted(all_partitions) assignment = collections.defaultdict( lambda: collections.defaultdict(list)) # round robin assignation of the partition numbers member_iter = itertools.cycle(sorted(member_metadata.keys())) for partition in all_partitions: member_id = next(member_iter) for topic in all_topics: assignment[member_id][topic].append(partition) protocol_assignment = {} for member_id in member_metadata: protocol_assignment[member_id] = ConsumerProtocolMemberAssignment( cls.version, sorted(assignment[member_id].items()), b'') return protocol_assignment
def assign(cls, cluster, member_metadata): all_topics = set() for metadata in six.itervalues(member_metadata): all_topics.update(metadata.subscription) all_topic_partitions = [] for topic in all_topics: partitions = cluster.partitions_for_topic(topic) if partitions is None: log.warning('No partition metadata for topic %s', topic) continue for partition in partitions: all_topic_partitions.append(TopicPartition(topic, partition)) all_topic_partitions.sort() # construct {member_id: {topic: [partition, ...]}} assignment = collections.defaultdict( lambda: collections.defaultdict(list)) member_iter = itertools.cycle(sorted(member_metadata.keys())) for partition in all_topic_partitions: member_id = next(member_iter) # Because we constructed all_topic_partitions from the set of # member subscribed topics, we should be safe assuming that # each topic in all_topic_partitions is in at least one member # subscription; otherwise this could yield an infinite loop while partition.topic not in member_metadata[ member_id].subscription: member_id = next(member_iter) assignment[member_id][partition.topic].append(partition.partition) protocol_assignment = {} for member_id in member_metadata: protocol_assignment[member_id] = ConsumerProtocolMemberAssignment( cls.version, sorted(assignment[member_id].items()), b'') return protocol_assignment
def assign(cls, cluster, member_metadata): all_topics = set() for metadata in six.itervalues(member_metadata): all_topics.update(metadata.subscription) all_topic_partitions = [] for topic in all_topics: partitions = cluster.partitions_for_topic(topic) if partitions is None: log.warning('No partition metadata for topic %s', topic) continue for partition in partitions: all_topic_partitions.append(TopicPartition(topic, partition)) all_topic_partitions.sort() # construct {member_id: {topic: [partition, ...]}} assignment = collections.defaultdict(lambda: collections.defaultdict(list)) member_iter = itertools.cycle(sorted(member_metadata.keys())) for partition in all_topic_partitions: member_id = next(member_iter) # Because we constructed all_topic_partitions from the set of # member subscribed topics, we should be safe assuming that # each topic in all_topic_partitions is in at least one member # subscription; otherwise this could yield an infinite loop while partition.topic not in member_metadata[member_id].subscription: member_id = next(member_iter) assignment[member_id][partition.topic].append(partition.partition) protocol_assignment = {} for member_id in member_metadata: protocol_assignment[member_id] = ConsumerProtocolMemberAssignment( cls.version, sorted(assignment[member_id].items()), b'') return protocol_assignment
def _poll(self, timeout, sleep=True): # select on reads across all connected sockets, blocking up to timeout assert self.in_flight_request_count() > 0 or self._connecting or sleep responses = [] processed = set() start_select = time.time() ready = self._selector.select(timeout) end_select = time.time() if self._sensors: self._sensors.select_time.record((end_select - start_select) * 1000000000) for key, events in ready: if key.fileobj is self._wake_r: self._clear_wake_fd() continue elif not (events & selectors.EVENT_READ): continue conn = key.data processed.add(conn) if not conn.in_flight_requests: # if we got an EVENT_READ but there were no in-flight requests, one of # two things has happened: # # 1. The remote end closed the connection (because it died, or because # a firewall timed out, or whatever) # 2. The protocol is out of sync. # # either way, we can no longer safely use this connection # # Do a 1-byte read to check protocol didnt get out of sync, and then close the conn try: unexpected_data = key.fileobj.recv(1) if unexpected_data: # anything other than a 0-byte read means protocol issues log.warning('Protocol out of sync on %r, closing', conn) except socket.error: pass conn.close() continue # Accumulate as many responses as the connection has pending while conn.in_flight_requests: response = conn.recv() # Note: conn.recv runs callbacks / errbacks # Incomplete responses are buffered internally # while conn.in_flight_requests retains the request if not response: break responses.append(response) # Check for additional pending SSL bytes if self.config['security_protocol'] in ('SSL', 'SASL_SSL'): # TODO: optimize for conn in self._conns.values(): if conn not in processed and conn.connected() and conn._sock.pending(): response = conn.recv() if response: responses.append(response) for conn in six.itervalues(self._conns): if conn.requests_timed_out(): log.warning('%s timed out after %s ms. Closing connection.', conn, conn.config['request_timeout_ms']) conn.close(error=Errors.RequestTimedOutError( 'Request timed out after %s ms' % conn.config['request_timeout_ms'])) if self._sensors: self._sensors.io_time.record((time.time() - end_select) * 1000000000) return responses
def _poll(self, timeout): """Returns list of (response, future) tuples""" processed = set() start_select = time.time() ready = self._selector.select(timeout) end_select = time.time() if self._sensors: self._sensors.select_time.record( (end_select - start_select) * 1000000000) for key, events in ready: if key.fileobj is self._wake_r: self._clear_wake_fd() continue elif not (events & selectors.EVENT_READ): continue conn = key.data processed.add(conn) if not conn.in_flight_requests: # if we got an EVENT_READ but there were no in-flight requests, one of # two things has happened: # # 1. The remote end closed the connection (because it died, or because # a firewall timed out, or whatever) # 2. The protocol is out of sync. # # either way, we can no longer safely use this connection # # Do a 1-byte read to check protocol didnt get out of sync, and then close the conn try: unexpected_data = key.fileobj.recv(1) if unexpected_data: # anything other than a 0-byte read means protocol issues log.warning('Protocol out of sync on %r, closing', conn) except socket.error: pass conn.close( Errors.ConnectionError( 'Socket EVENT_READ without in-flight-requests')) continue self._idle_expiry_manager.update(conn.node_id) self._pending_completion.extend(conn.recv()) # Check for additional pending SSL bytes if self.config['security_protocol'] in ('SSL', 'SASL_SSL'): # TODO: optimize for conn in self._conns.values(): if conn not in processed and conn.connected( ) and conn._sock.pending(): self._pending_completion.extend(conn.recv()) for conn in six.itervalues(self._conns): if conn.requests_timed_out(): log.warning('%s timed out after %s ms. Closing connection.', conn, conn.config['request_timeout_ms']) conn.close(error=Errors.RequestTimedOutError( 'Request timed out after %s ms' % conn.config['request_timeout_ms'])) if self._sensors: self._sensors.io_time.record( (time.time() - end_select) * 1000000000) self._maybe_close_oldest_connection()
def _poll(self, timeout, sleep=True): # select on reads across all connected sockets, blocking up to timeout assert self.in_flight_request_count() > 0 or self._connecting or sleep responses = [] processed = set() start_select = time.time() ready = self._selector.select(timeout) end_select = time.time() if self._sensors: self._sensors.select_time.record((end_select - start_select) * 1000000000) for key, events in ready: if key.fileobj is self._wake_r: self._clear_wake_fd() continue elif not (events & selectors.EVENT_READ): continue conn = key.data processed.add(conn) if not conn.in_flight_requests: # if we got an EVENT_READ but there were no in-flight requests, one of # two things has happened: # # 1. The remote end closed the connection (because it died, or because # a firewall timed out, or whatever) # 2. The protocol is out of sync. # # either way, we can no longer safely use this connection # # Do a 1-byte read to check protocol didnt get out of sync, and then close the conn try: unexpected_data = key.fileobj.recv(1) if unexpected_data: # anything other than a 0-byte read means protocol issues log.warning('Protocol out of sync on %r, closing', conn) except socket.error: pass conn.close(Errors.ConnectionError('Socket EVENT_READ without in-flight-requests')) continue self._idle_expiry_manager.update(conn.node_id) # Accumulate as many responses as the connection has pending while conn.in_flight_requests: response = conn.recv() # Note: conn.recv runs callbacks / errbacks # Incomplete responses are buffered internally # while conn.in_flight_requests retains the request if not response: break responses.append(response) # Check for additional pending SSL bytes if self.config['security_protocol'] in ('SSL', 'SASL_SSL'): # TODO: optimize for conn in self._conns.values(): if conn not in processed and conn.connected() and conn._sock.pending(): response = conn.recv() if response: responses.append(response) for conn in six.itervalues(self._conns): if conn.requests_timed_out(): log.warning('%s timed out after %s ms. Closing connection.', conn, conn.config['request_timeout_ms']) conn.close(error=Errors.RequestTimedOutError( 'Request timed out after %s ms' % conn.config['request_timeout_ms'])) if self._sensors: self._sensors.io_time.record((time.time() - end_select) * 1000000000) self._maybe_close_oldest_connection() return responses
def run_once(self): """Run a single iteration of sending.""" while self._topics_to_add: self._client.add_topic(self._topics_to_add.pop()) # get the list of partitions with data ready to send result = self._accumulator.ready(self._metadata) ready_nodes, next_ready_check_delay, unknown_leaders_exist = result # if there are any partitions whose leaders are not known yet, force # metadata update if unknown_leaders_exist: log.debug('Unknown leaders exist, requesting metadata update') self._metadata.request_update() # remove any nodes we aren't ready to send to not_ready_timeout = 999999999 for node in list(ready_nodes): if not self._client.ready(node): log.debug('Node %s not ready; delaying produce of accumulated batch', node) ready_nodes.remove(node) not_ready_timeout = min(not_ready_timeout, self._client.connection_delay(node)) # create produce requests batches_by_node = self._accumulator.drain( self._metadata, ready_nodes, self.config['max_request_size']) if self.config['guarantee_message_order']: # Mute all the partitions drained for batch_list in six.itervalues(batches_by_node): for batch in batch_list: self._accumulator.muted.add(batch.topic_partition) expired_batches = self._accumulator.abort_expired_batches( self.config['request_timeout_ms'], self._metadata) for expired_batch in expired_batches: self._sensors.record_errors(expired_batch.topic_partition.topic, expired_batch.record_count) self._sensors.update_produce_request_metrics(batches_by_node) requests = self._create_produce_requests(batches_by_node) # If we have any nodes that are ready to send + have sendable data, # poll with 0 timeout so this can immediately loop and try sending more # data. Otherwise, the timeout is determined by nodes that have # partitions with data that isn't yet sendable (e.g. lingering, backing # off). Note that this specifically does not include nodes with # sendable data that aren't ready to send since they would cause busy # looping. poll_timeout_ms = min(next_ready_check_delay * 1000, not_ready_timeout) if ready_nodes: log.debug("Nodes with data ready to send: %s", ready_nodes) # trace log.debug("Created %d produce requests: %s", len(requests), requests) # trace poll_timeout_ms = 0 for node_id, request in six.iteritems(requests): batches = batches_by_node[node_id] log.debug('Sending Produce Request: %r', request) (self._client.send(node_id, request) .add_callback( self._handle_produce_response, node_id, time.time(), batches) .add_errback( self._failed_produce, batches, node_id)) # if some partitions are already ready to be sent, the select time # would be 0; otherwise if some partition already has some data # accumulated but not ready yet, the select time will be the time # difference between now and its linger expiry time; otherwise the # select time will be the time difference between now and the # metadata expiry time self._client.poll(poll_timeout_ms, sleep=True)
def _send_broker_aware_request(self, payloads, encoder_fn, decoder_fn): """ Group a list of request payloads by topic+partition and send them to the leader broker for that partition using the supplied encode/decode functions Arguments: payloads: list of object-like entities with a topic (str) and partition (int) attribute; payloads with duplicate topic-partitions are not supported. encode_fn: a method to encode the list of payloads to a request body, must accept client_id, correlation_id, and payloads as keyword arguments decode_fn: a method to decode a response body into response objects. The response objects must be object-like and have topic and partition attributes Returns: List of response objects in the same order as the supplied payloads """ # encoders / decoders do not maintain ordering currently # so we need to keep this so we can rebuild order before returning original_ordering = [(p.topic, p.partition) for p in payloads] # Connection errors generally mean stale metadata # although sometimes it means incorrect api request # Unfortunately there is no good way to tell the difference # so we'll just reset metadata on all errors to be safe refresh_metadata = False # For each broker, send the list of request payloads # and collect the responses and errors payloads_by_broker = self._payloads_by_broker(payloads) responses = {} def failed_payloads(payloads): for payload in payloads: topic_partition = (str(payload.topic), payload.partition) responses[topic_partition] = FailedPayloadsError(payload) futures_by_connection = {} selector = selectors.DefaultSelector() for broker, broker_payloads in six.iteritems(payloads_by_broker): if broker is None: failed_payloads(broker_payloads) continue host, port, afi = get_ip_port_afi(broker.host) try: conn = self._get_conn(host, broker.port, afi, broker.nodeId) except ConnectionError: refresh_metadata = True failed_payloads(broker_payloads) continue request = encoder_fn(payloads=broker_payloads) # decoder_fn=None signal that the server is expected to not # send a response. This probably only applies to # ProduceRequest w/ acks = 0 expect_response = (decoder_fn is not None) if expect_response: selector.register(conn._sock, selectors.EVENT_READ, conn) future = conn.send(request, expect_response=expect_response) if future.failed(): log.error("Request failed: %s", future.exception) selector.unregister(conn._sock) refresh_metadata = True failed_payloads(broker_payloads) continue if not expect_response: for payload in broker_payloads: topic_partition = (str(payload.topic), payload.partition) responses[topic_partition] = None continue futures_by_connection[conn] = (future, broker) timeout = self.timeout while futures_by_connection: start_time = time.time() ready = selector.select(timeout) for key, _ in ready: conn = key.data future, _ = futures_by_connection[conn] while not future.is_done: conn.recv() _, broker = futures_by_connection.pop(conn) if future.failed(): log.error("Request failed: %s", future.exception) refresh_metadata = True failed_payloads(payloads_by_broker[broker]) else: for payload_response in decoder_fn(future.value): topic_partition = (str(payload_response.topic), payload_response.partition) responses[topic_partition] = payload_response timeout -= time.time() - start_time if timeout < 0: log.error("%s requests timed out.", len(futures_by_connection)) for _, broker in six.itervalues(futures_by_connection): failed_payloads(payloads_by_broker[broker]) refresh_metadata = True break if refresh_metadata: self.reset_all_metadata() selector.close() # Return responses in the same order as provided return [responses[tp] for tp in original_ordering]
def _poll(self, timeout): # This needs to be locked, but since it is only called from within the # locked section of poll(), there is no additional lock acquisition here processed = set() # Send pending requests first, before polling for responses self._register_send_sockets() start_select = time.time() ready = self._selector.select(timeout) end_select = time.time() if self._sensors: self._sensors.select_time.record( (end_select - start_select) * 1000000000) for key, events in ready: if key.fileobj is self._wake_r: self._clear_wake_fd() continue # Send pending requests if socket is ready to write if events & selectors.EVENT_WRITE: conn = key.data if conn.connecting(): conn.connect() else: if conn.send_pending_requests_v2(): # If send is complete, we dont need to track write readiness # for this socket anymore if key.events ^ selectors.EVENT_WRITE: self._selector.modify( key.fileobj, key.events ^ selectors.EVENT_WRITE, key.data) else: self._selector.unregister(key.fileobj) if not (events & selectors.EVENT_READ): continue conn = key.data processed.add(conn) if not conn.in_flight_requests: # if we got an EVENT_READ but there were no in-flight requests, one of # two things has happened: # # 1. The remote end closed the connection (because it died, or because # a firewall timed out, or whatever) # 2. The protocol is out of sync. # # either way, we can no longer safely use this connection # # Do a 1-byte read to check protocol didnt get out of sync, and then close the conn try: unexpected_data = key.fileobj.recv(1) if unexpected_data: # anything other than a 0-byte read means protocol issues log.warning('Protocol out of sync on %r, closing', conn) except socket.error: pass conn.close( Errors.KafkaConnectionError( 'Socket EVENT_READ without in-flight-requests')) continue self._idle_expiry_manager.update(conn.node_id) self._pending_completion.extend(conn.recv()) # Check for additional pending SSL bytes if self.config['security_protocol'] in ('SSL', 'SASL_SSL'): # TODO: optimize for conn in self._conns.values(): if conn not in processed and conn.connected( ) and conn._sock.pending(): self._pending_completion.extend(conn.recv()) for conn in six.itervalues(self._conns): if conn.requests_timed_out(): log.warning('%s timed out after %s ms. Closing connection.', conn, conn.config['request_timeout_ms']) conn.close(error=Errors.RequestTimedOutError( 'Request timed out after %s ms' % conn.config['request_timeout_ms'])) if self._sensors: self._sensors.io_time.record( (time.time() - end_select) * 1000000000) self._maybe_close_oldest_connection()
def _poll(self, timeout): """Returns list of (response, future) tuples""" processed = set() start_select = time.time() ready = self._selector.select(timeout) end_select = time.time() if self._sensors: self._sensors.select_time.record((end_select - start_select) * 1000000000) for key, events in ready: if key.fileobj is self._wake_r: self._clear_wake_fd() continue elif not (events & selectors.EVENT_READ): continue conn = key.data processed.add(conn) if not conn.in_flight_requests: # if we got an EVENT_READ but there were no in-flight requests, one of # two things has happened: # # 1. The remote end closed the connection (because it died, or because # a firewall timed out, or whatever) # 2. The protocol is out of sync. # # either way, we can no longer safely use this connection # # Do a 1-byte read to check protocol didnt get out of sync, and then close the conn try: unexpected_data = key.fileobj.recv(1) if unexpected_data: # anything other than a 0-byte read means protocol issues log.warning('Protocol out of sync on %r, closing', conn) except socket.error: pass conn.close(Errors.KafkaConnectionError('Socket EVENT_READ without in-flight-requests')) continue self._idle_expiry_manager.update(conn.node_id) self._pending_completion.extend(conn.recv()) # Check for additional pending SSL bytes if self.config['security_protocol'] in ('SSL', 'SASL_SSL'): # TODO: optimize for conn in self._conns.values(): if conn not in processed and conn.connected() and conn._sock.pending(): self._pending_completion.extend(conn.recv()) for conn in six.itervalues(self._conns): if conn.requests_timed_out(): log.warning('%s timed out after %s ms. Closing connection.', conn, conn.config['request_timeout_ms']) conn.close(error=Errors.RequestTimedOutError( 'Request timed out after %s ms' % conn.config['request_timeout_ms'])) if self._sensors: self._sensors.io_time.record((time.time() - end_select) * 1000000000) self._maybe_close_oldest_connection()