def __init__(self, topic, broker_partitions, end_broker_partitions=None):
    """Set up a consumer over an explicit set of broker partitions.

    Args:
        topic: Topic name; stored on the instance as given.
        broker_partitions: Either an iterable of BrokerPartitions, in which
            case each one starts at the latest offset reported by its
            broker, or a mapping of BrokerPartitions to offsets, in which
            case those offsets are used as the starting points.
        end_broker_partitions: Optional mapping of BrokerPartitions to end
            offsets. An empty dict is used when omitted (or falsy).
    """
    self._topic = topic
    self._broker_partitions = sorted(broker_partitions)
    self._stats = defaultdict(
        lambda: ConsumerStats(fetches=0, bytes=0, messages=0, max_fetch=0))

    # Building a set collapses duplicates, so we only open one connection
    # per broker no matter how many of its partitions we were handed.
    broker_conn_info = frozenset(
        (bp.broker_id, bp.host, bp.port) for bp in self._broker_partitions)
    self._connections = dict(
        (broker_id, Kafka(host, port))
        for broker_id, host, port in broker_conn_info)

    # Figure out where we're going to start from...
    if isinstance(broker_partitions, Mapping):
        self._bps_to_next_offsets = broker_partitions
    else:
        # Connections are keyed by broker id, not by BrokerPartition, so the
        # lookup must go through bp.broker_id. We iterate the sorted copy
        # instead of the argument so that a one-shot iterable (already
        # consumed by sorted() above) still gets an offset per partition.
        self._bps_to_next_offsets = dict(
            (bp,
             self._connections[bp.broker_id].latest_offset(bp.topic,
                                                           bp.partition))
            for bp in self._broker_partitions)

    self._end_broker_partitions = end_broker_partitions or {}
def __init__(self, zk_conn, consumer_group, topic, autocommit=True):
    """Initialise the consumer, register it, and run a first rebalance.

    Args:
        zk_conn: Passed straight through to ZKUtil.
        consumer_group: Consumer group name; also used to build our id.
        topic: Topic name; stored on the instance as given.
        autocommit: Stored as the instance's autocommit flag.

    FIXME: switch arg order and default zk_conn to localhost?
    """
    # Plain values that are handed back out through properties.
    self._id = self._create_consumer_id(consumer_group)
    self._consumer_group = consumer_group
    self._topic = topic
    self._autocommit = autocommit

    # Internal bookkeeping.
    self._zk_util = ZKUtil(zk_conn)
    self._rebalance_enabled = True   # Only used for debugging purposes
    self._needs_rebalance = True
    self._broker_partitions = []     # Updated during rebalancing
    self._bps_to_next_offsets = {}   # Updated after a successful fetch

    # Handles for our ZooKeeper notification subscriptions; none yet.
    self._topic_watch = self._topics_watch = None
    self._consumers_watch = self._brokers_watch = None

    # Announce ourselves in ZK so the other Consumers know we're active...
    self._register()

    # ...then force a rebalance to learn which broker-partitions are ours.
    self.rebalance()

    self._stats = ConsumerStats(fetches=0, bytes=0, messages=0, max_fetch=0)
def stats(self):
    """Return one ConsumerStats aggregating every broker partition's stats.

    fetches, bytes and messages are summed across all entries of
    self._stats; max_fetch is the largest per-entry max_fetch, and is never
    lower than 0 (which is also the result when there are no entries).
    """
    # Snapshot once so all four aggregates are computed over the same items.
    per_partition = list(self._stats.values())

    total_fetches = sum(entry.fetches for entry in per_partition)
    total_bytes = sum(entry.bytes for entry in per_partition)
    total_messages = sum(entry.messages for entry in per_partition)
    largest_fetch = max([0] + [entry.max_fetch for entry in per_partition])

    return ConsumerStats(total_fetches, total_bytes, total_messages,
                         largest_fetch)
def fetch(self, max_size=None, min_size=None, fetch_step=None):
    """Fetch once from every broker partition we still hold an offset for.

    Args:
        max_size, min_size, fetch_step: Forwarded unchanged to the
            connection's fetch() call for each broker partition.

    Returns:
        A FetchResult built from the sorted MessageSets, one per broker
        partition fetched (an empty FetchResult if there were none).

    Side effects:
        Updates the per-partition stats, and replaces the offset map with
        the next offsets of the partitions that have not passed their end
        offset; partitions past it are dropped from later fetches.
    """
    log.debug("Fetch called on SimpleConsumer {0}".format(self.id))

    # One request per partition for now (this should get replaced with
    # multifetch or performance is going to suck wind later)...
    fetched = []
    for bp, start_offset in self._bps_to_next_offsets.items():
        conn = self._connections[bp.broker_id]
        offsets_msgs = conn.fetch(bp.topic,
                                  start_offset,
                                  partition=bp.partition,
                                  min_size=min_size,
                                  max_size=max_size,
                                  fetch_step=fetch_step)
        msg_set = MessageSet(bp, start_offset, offsets_msgs)

        # Fold this fetch into the running totals for the partition.
        previous = self._stats[bp]
        fetched_bytes = msg_set.size
        self._stats[bp] = ConsumerStats(
            fetches=previous.fetches + 1,
            bytes=previous.bytes + fetched_bytes,
            messages=previous.messages + len(msg_set),
            max_fetch=max(previous.max_fetch, fetched_bytes))

        fetched.append(msg_set)

    # sorted() of an empty list is just an empty list, so no special case.
    result = FetchResult(sorted(fetched))

    # Rebuild the offset map, keeping only the broker partitions whose end
    # offsets we have not exceeded.
    still_active = self._bps_to_next_offsets = {}
    for msg_set in result:
        bp = msg_set.broker_partition
        upcoming = msg_set.next_offset
        limit = self._end_broker_partitions.get(bp)
        if limit is None or upcoming <= limit:
            still_active[bp] = upcoming

    return result
def fetch(self, max_size=None, retry_limit=3, ignore_failures=False):
    """Fetch from each of our broker partitions and return a FetchResult.

    The FetchResult can be iterated over as a list of MessageSets. A
    MessageSet is included for every broker partition that was queried
    successfully, even if that MessageSet is empty.

    Args:
        max_size: Forwarded to each connection's fetch() call.
        retry_limit: Accepted, but not used anywhere in this method.
        ignore_failures: When true, a KafkaError on a partition is logged
            and that partition is skipped; otherwise it propagates.

    FIXME: This is where the adjustment needs to happen. Regardless of
    whether a rebalance has occurred or not, we can very easily see if we
    are still responsible for the same partitions as we were the last time
    we ran, and set self._bps_to_next_offsets --> we just need to check if
    it's not None and if we still have the same offsets, and adjust
    accordingly.
    """
    log.debug("Fetch called on ZKConsumer {0}".format(self.id))
    if self._needs_rebalance:
        self.rebalance()

    # Decide where to start. After a previous fetch we trust our own
    # bookkeeping (with autocommit off it is the only current value, since
    # anything in ZK is out of date). On the very first fetch we ask
    # ZooKeeper instead.
    offsets_pulled_from_zk = not self._bps_to_next_offsets
    if offsets_pulled_from_zk:
        bps_to_offsets = self._zk_util.offsets_for(self.consumer_group,
                                                   self._id,
                                                   self.broker_partitions)
    else:
        bps_to_offsets = self._bps_to_next_offsets

    # One request per partition for now (this should get replaced with
    # multifetch or performance is going to suck wind later)...
    fetched = []
    for bp, offset in bps_to_offsets.items():
        kafka = self._connections[bp.broker_id]
        partition = kafka.partition(bp.topic, bp.partition)
        if offset is None:
            offset = partition.latest_offset()

        try:
            offsets_msgs = kafka.fetch(bp.topic,
                                       offset,
                                       partition=bp.partition,
                                       max_size=max_size)
        except OffsetOutOfRange:
            # Only offsets that came from ZK are treated as stale; one we
            # incremented ourselves going out of range is a real error.
            if not offsets_pulled_from_zk:
                raise
            log.error(
                "Offset {0} from ZooKeeper is out of range for {1}".format(
                    offset, bp))
            offset = partition.latest_offset()
            log.error("Retrying with offset {0} for {1}".format(offset, bp))
            offsets_msgs = kafka.fetch(bp.topic,
                                       offset,
                                       partition=bp.partition,
                                       max_size=max_size)
        except KafkaError as k_err:
            if not ignore_failures:
                raise
            log.error("Ignoring failed fetch on {0}".format(bp))
            log.exception(k_err)
            continue

        fetched.append(MessageSet(bp, offset, offsets_msgs))

    result = FetchResult(sorted(fetched))

    # Remember where each partition should resume from next time.
    for msg_set in result:
        self._bps_to_next_offsets[msg_set.broker_partition] = \
            msg_set.next_offset

    if self._autocommit:
        self.commit_offsets()

    # Fold this whole fetch into the consumer-wide running totals.
    previous = self._stats
    fetched_bytes = result.num_bytes
    self._stats = ConsumerStats(
        fetches=previous.fetches + 1,
        bytes=previous.bytes + fetched_bytes,
        messages=previous.messages + result.num_messages,
        max_fetch=max(previous.max_fetch, fetched_bytes))

    return result
def fetch(self, max_size=None, retry_limit=3, ignore_failures=False):
    """Fetch from each of our broker partitions and return a FetchResult.

    The FetchResult can be iterated over as a list of MessageSets. A
    MessageSet is included for every broker partition that was queried
    successfully, even if that MessageSet is empty.

    Args:
        max_size: Forwarded to each connection's fetch() call.
        retry_limit: Accepted, but not used anywhere in this method.
        ignore_failures: When true, a KafkaError on a partition is logged
            and that partition is skipped; otherwise it propagates.

    FIXME: This is where the adjustment needs to happen. Regardless of
    whether a rebalance has occurred or not, we can very easily see if we
    are still responsible for the same partitions as we were the last time
    we ran, and set self._bps_to_next_offsets --> we just need to check if
    it's not None and if we still have the same offsets, and adjust
    accordingly.
    """
    log.debug("Fetch called on ZKConsumer {0}".format(self.id))
    if self._needs_rebalance:
        self.rebalance()

    # Start from our own bookkeeping (a copy, so lookups below don't touch
    # the live map). With autocommit off this is all we can use, since any
    # value in ZK will be out of date.
    bps_to_offsets = dict(self._bps_to_next_offsets)

    # ZK has to be consulted when we know of no BrokerPartitions at all, or
    # when some of the ones we know of have an unknown (None) offset.
    offsets_pulled_from_zk = (
        not bps_to_offsets or None in bps_to_offsets.values())
    if offsets_pulled_from_zk:
        if bps_to_offsets:
            # We have some offsets, but we've been made responsible for
            # new BrokerPartitions that still need looking up.
            lookup_bps = [bp for bp, known in bps_to_offsets.items()
                          if known is None]
        else:
            # It's our first fetch, so we need everything.
            lookup_bps = self.broker_partitions
        zk_offsets = self._zk_util.offsets_for(self.consumer_group,
                                               self._id,
                                               lookup_bps)
        bps_to_offsets.update(zk_offsets)

    # One request per partition for now (this should get replaced with
    # multifetch or performance is going to suck wind later)...
    fetched = []
    for bp, offset in bps_to_offsets.items():
        kafka = self._connections[bp.broker_id]
        partition = kafka.partition(bp.topic, bp.partition)
        if offset is None:
            offset = partition.latest_offset()

        try:
            offsets_msgs = kafka.fetch(bp.topic,
                                       offset,
                                       partition=bp.partition,
                                       max_size=max_size)
        except OffsetOutOfRange:
            # Only treat the offset as stale when ZK was consulted for this
            # fetch; otherwise going out of range is a real error.
            if not offsets_pulled_from_zk:
                raise
            log.error(
                "Offset {0} from ZooKeeper is out of range for {1}".format(
                    offset, bp))
            offset = partition.latest_offset()
            log.error("Retrying with offset {0} for {1}".format(offset, bp))
            offsets_msgs = kafka.fetch(bp.topic,
                                       offset,
                                       partition=bp.partition,
                                       max_size=max_size)
        except KafkaError as k_err:
            if not ignore_failures:
                raise
            log.error("Ignoring failed fetch on {0}".format(bp))
            log.exception(k_err)
            continue

        msg_set = MessageSet(bp, offset, offsets_msgs)

        # Fold this fetch into the running totals for the partition.
        previous = self._stats[bp]
        fetched_bytes = msg_set.size
        self._stats[bp] = ConsumerStats(
            fetches=previous.fetches + 1,
            bytes=previous.bytes + fetched_bytes,
            messages=previous.messages + len(msg_set),
            max_fetch=max(previous.max_fetch, fetched_bytes))

        fetched.append(msg_set)

    result = FetchResult(sorted(fetched))

    # Remember where each partition should resume from next time.
    for msg_set in result:
        self._bps_to_next_offsets[msg_set.broker_partition] = \
            msg_set.next_offset

    if self._autocommit:
        self.commit_offsets()

    return result