async def getmany(self,
                  active_partitions: Set[TP],
                  timeout: float) -> RecordMap:
    # Implementation for the Fetcher service.
    _consumer = self._ensure_consumer()
    fetcher = _consumer._fetcher
    if _consumer._closed or fetcher._closed:
        raise ConsumerStoppedError()
    return await self.call_thread(
        fetcher.fetched_records,
        active_partitions,
        timeout=timeout,
        max_records=_consumer._max_poll_records,
    )
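
# `call_thread` above delegates the blocking fetch to the thread that owns
# the consumer's event loop.  A minimal sketch of what such a helper could
# look like, assuming the consumer runs its own loop in a dedicated thread
# (the helper name and `consumer_loop` parameter are illustrative, not the
# library's actual API):
import asyncio
from typing import Any, Awaitable, Callable


async def call_in_consumer_thread(
        consumer_loop: asyncio.AbstractEventLoop,
        fun: Callable[..., Awaitable[Any]],
        *args: Any, **kwargs: Any) -> Any:
    # Schedule the coroutine on the consumer thread's loop, then wait
    # for its result from the calling thread's loop.
    fut = asyncio.run_coroutine_threadsafe(
        fun(*args, **kwargs), consumer_loop)
    return await asyncio.wrap_future(fut)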
async def getmany(self, *partitions, timeout_ms=0, max_records=None):
    """Get messages from assigned topics / partitions.

    Prefetched messages are returned in batches by topic-partition.
    If messages are not available in the prefetched buffer this method
    waits `timeout_ms` milliseconds.

    Arguments:
        partitions (List[TopicPartition]): The partitions to fetch
            messages from. If no partitions are specified then all
            subscribed partitions will be used.
        timeout_ms (int, optional): milliseconds spent waiting if
            data is not available in the buffer. If 0, returns
            immediately with any records that are currently available
            in the buffer, otherwise an empty result. Must not be
            negative. Default: 0

    Returns:
        dict: topic-partition to list of records fetched since the
            last call, for the subscribed list of topics and partitions

    Example usage:

    .. code:: python

        data = await consumer.getmany()
        for tp, messages in data.items():
            topic = tp.topic
            partition = tp.partition
            for message in messages:
                # Process message
                print(message.offset, message.key, message.value)
    """
    assert all(map(lambda k: isinstance(k, TopicPartition), partitions))
    if self._closed:
        raise ConsumerStoppedError()
    if max_records is not None and (
            not isinstance(max_records, int) or max_records < 1):
        raise ValueError("`max_records` must be a positive integer")

    timeout = timeout_ms / 1000
    records = await self._wait_for_data_or_error(
        self._fetcher.fetched_records(
            partitions, timeout,
            max_records=max_records or self._max_poll_records),
        shield=False,
    )
    return records
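
# A hedged usage sketch for the batch API above: wait up to one second for
# records, capped at 100 per call.  `consumer` is assumed to be a started
# AIOKafkaConsumer already subscribed to some topics.
async def drain_batches(consumer):
    while True:
        batches = await consumer.getmany(timeout_ms=1000, max_records=100)
        if not batches:
            continue  # nothing arrived within the timeout
        for tp, messages in batches.items():
            for message in messages:
                print(tp.topic, tp.partition, message.offset, message.value)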
async def _fetch_records(self,
                         consumer: aiokafka.AIOKafkaConsumer,
                         active_partitions: Set[TP],
                         timeout: float = None,
                         max_records: int = None) -> RecordMap:
    if not self.consumer.flow_active:
        return {}
    fetcher = consumer._fetcher
    if consumer._closed or fetcher._closed:
        raise ConsumerStoppedError()
    with fetcher._subscriptions.fetch_context():
        return await fetcher.fetched_records(
            active_partitions,
            timeout=timeout,
            max_records=max_records,
        )
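
# `flow_active` above acts as a flow-control gate: while flow is suspended,
# the fetch path short-circuits and returns an empty mapping rather than
# pulling more records.  A minimal sketch of that idea, assuming a simple
# asyncio.Event-based gate (the real flag lives on the consumer object):
import asyncio


class FlowGate:
    def __init__(self) -> None:
        self._event = asyncio.Event()
        self._event.set()  # flow starts active

    @property
    def flow_active(self) -> bool:
        return self._event.is_set()

    def suspend(self) -> None:
        self._event.clear()

    def resume(self) -> None:
        self._event.set()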
async def close(self):
    self._fetch_task.cancel()
    try:
        await self._fetch_task
    except asyncio.CancelledError:
        pass

    # Fail all pending fetchone/fetchall calls
    if (self._wait_empty_future is not None
            and not self._wait_empty_future.done()):
        self._wait_empty_future.set_exception(ConsumerStoppedError())

    for x in self._fetch_tasks:
        x.cancel()
        try:
            await x
        except asyncio.CancelledError:
            pass
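
# The cancel/await/suppress dance above is a common shutdown pattern; a
# small reusable helper (illustrative, not part of the library) could
# replace both try/except blocks:
import asyncio
import contextlib


async def cancel_and_wait(task: asyncio.Task) -> None:
    # Request cancellation, then wait for the task to actually finish,
    # swallowing only the expected CancelledError.
    task.cancel()
    with contextlib.suppress(asyncio.CancelledError):
        await task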
async def next_record(self, partitions):
    """Return one fetched record.

    This method adds a little overhead, as we do more work per record:

        * Notify the prefetch routine for every consumed partition
        * Ensure the message is marked for autocommit
    """
    while True:
        if self._closed:
            raise ConsumerStoppedError()

        # While the background routine will fetch new records until the
        # new assignment is finished, we don't want to return records
        # that may not belong to this instance after a rebalance.
        if self._subscriptions.reassignment_in_progress:
            await self._subscriptions.wait_for_assignment()

        for tp in list(self._records.keys()):
            if partitions and tp not in partitions:
                # Clean up results for unassigned partitions
                if not self._subscriptions.is_assigned(tp):
                    del self._records[tp]
                continue
            res_or_error = self._records[tp]
            if type(res_or_error) is FetchResult:
                message = res_or_error.getone()
                if message is None:
                    # We already processed all messages, request new ones
                    del self._records[tp]
                    self._notify(self._wait_consume_future)
                else:
                    return message
            else:
                # Remove the error, so we can fetch on this partition again
                del self._records[tp]
                self._notify(self._wait_consume_future)
                res_or_error.check_raise()

        # No messages ready. Wait for some to arrive.
        waiter = self._create_fetch_waiter()
        await waiter
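
# `_create_fetch_waiter`/`_notify` above implement a classic future-based
# wakeup: readers park on a fresh Future, and the fetching side resolves it
# when new data (or an error) arrives.  A minimal sketch of the pattern
# (the class and method names here are illustrative, not the library's
# internals):
import asyncio
from typing import Optional


class Wakeup:
    def __init__(self) -> None:
        self._waiter: Optional[asyncio.Future] = None

    def wait(self) -> asyncio.Future:
        # Hand out one pending future at a time; awaiting it parks the
        # caller until notify() fires.
        if self._waiter is None or self._waiter.done():
            self._waiter = asyncio.get_running_loop().create_future()
        return self._waiter

    def notify(self) -> None:
        if self._waiter is not None and not self._waiter.done():
            self._waiter.set_result(None)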
async def getone(self, *partitions):
    """Get one message from Kafka.

    If no new messages are prefetched, this method will wait for them.

    Arguments:
        partitions (List[TopicPartition]): Optional list of partitions
            to return from. If no partitions are specified then the
            returned message will be from any partition the consumer
            is subscribed to.

    Returns:
        ConsumerRecord

    Will return instance of

    .. code:: python

        collections.namedtuple(
            "ConsumerRecord",
            ["topic", "partition", "offset", "key", "value"])

    Example usage:

    .. code:: python

        while True:
            message = await consumer.getone()
            topic = message.topic
            partition = message.partition
            # Process message
            print(message.offset, message.key, message.value)
    """
    assert all(map(lambda k: isinstance(k, TopicPartition), partitions))
    if self._closed:
        raise ConsumerStoppedError()

    msg = await self._wait_for_data_or_error(
        self._fetcher.next_record(partitions), shield=False)
    return msg
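
# Hedged usage sketch for `getone`: restrict consumption to a single
# partition by passing it explicitly.  Assumes `consumer` is a started
# AIOKafkaConsumer assigned to this partition, and that TopicPartition is
# importable from the aiokafka package.
from aiokafka import TopicPartition


async def read_partition_zero(consumer):
    tp = TopicPartition('my-topic', 0)
    while True:
        message = await consumer.getone(tp)
        print(message.offset, message.key, message.value)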
def __aiter__(self):
    if self._closed:
        raise ConsumerStoppedError()
    return self
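
# `__aiter__` returning self implies a matching `__anext__`.  A sketch of
# the counterpart, assuming it delegates to `getone` and translates a
# stopped consumer into end-of-iteration (this mirrors, but does not
# reproduce, the library's implementation):
async def __anext__(self):
    try:
        return await self.getone()
    except ConsumerStoppedError:
        raise StopAsyncIteration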
async def getmany(self,
                  timeout: float) -> AsyncIterator[Tuple[TP, Message]]:
    # Implementation for the Fetcher service.
    _consumer = self._consumer
    fetcher = _consumer._fetcher
    if _consumer._closed or fetcher._closed:
        raise ConsumerStoppedError()
    active_partitions = self._get_active_partitions()
    _next = next

    records: RecordMap = {}
    # This lock is acquired by pause_partitions/resume_partitions,
    # but those should never be called when the Fetcher is running.
    with self._partitions_lock:
        if active_partitions:
            # Fetch records only if there are active partitions, to avoid
            # the risk of fetching all partitions in the beginning,
            # when none of the partitions is paused/resumed.
            records = await fetcher.fetched_records(
                active_partitions,
                timeout=timeout,
            )
        else:
            # We should still release to the event loop
            await self.sleep(0)
    create_message = ConsumerMessage  # localize

    # `records` contains a mapping from TP to list of messages.
    # If there are two agents, consuming from topics t1 and t2,
    # the normal order of iteration would be to process each
    # tp in the dict:
    #    for tp, messages in records.items():
    #        for message in messages:
    #            yield tp, message
    #
    # The problem with this is that if we have prefetched 16k records
    # for one partition, the other partitions won't even start processing
    # before those 16k records are completed.
    #
    # So we try round-robin between the tps instead:
    #
    #    iterators: Dict[TP, Iterator] = {
    #        tp: iter(messages)
    #        for tp, messages in records.items()
    #    }
    #    while iterators:
    #        for tp, messages in iterators.items():
    #            yield tp, next(messages)
    #            # remove from iterators if empty.
    #
    # The problem with this implementation is that
    # the records mapping is ordered by TP, so records.keys()
    # will look like this:
    #
    #    TP(topic='bar', partition=0)
    #    TP(topic='bar', partition=1)
    #    TP(topic='bar', partition=2)
    #    TP(topic='bar', partition=3)
    #    TP(topic='foo', partition=0)
    #    TP(topic='foo', partition=1)
    #    TP(topic='foo', partition=2)
    #    TP(topic='foo', partition=3)
    #
    # If there are 100 partitions for each topic,
    # it will process 100 items in the first topic, then 100 items
    # in the other topic.  It's even worse when partition counts
    # vary greatly: if t1 has 1000 partitions and t2 has 1 partition,
    # then t2 will end up being starved most of the time.
    #
    # We solve this by going round-robin through each topic instead.
    topic_index = self._records_to_topic_index(records, active_partitions)
    to_remove: Set[str] = set()
    sentinel = object()
    while topic_index:
        for topic in to_remove:
            topic_index.pop(topic, None)
        for topic, messages in topic_index.items():
            item = _next(messages, sentinel)
            if item is sentinel:
                # this topic is now empty,
                # but we cannot remove from the dict while iterating
                # over it, so move that to the outer loop.
                to_remove.add(topic)
                continue
            tp, record = item  # type: ignore
            highwater_mark = self._consumer.highwater(tp)
            self.app.monitor.track_tp_end_offset(tp, highwater_mark)
            yield tp, create_message(
                record.topic,
                record.partition,
                record.offset,
                record.timestamp / 1000.0,
                record.timestamp_type,
                record.key,
                record.value,
                record.checksum,
                record.serialized_key_size,
                record.serialized_value_size,
                tp,
            )
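
# A standalone sketch of the topic-level round-robin described in the
# comments above.  `records_to_topic_index` here is a simplified stand-in
# for the real `_records_to_topic_index` helper: it chains each topic's
# partitions into a single iterator of (tp, record) pairs; `tp` is assumed
# to carry a `.topic` attribute, as TP does above.
from typing import Any, Dict, Iterator, List, Set, Tuple


def records_to_topic_index(
        records: Dict[Any, List[Any]],
) -> Dict[str, Iterator[Tuple[Any, Any]]]:
    # Group (tp, record) pairs by topic name, one iterator per topic.
    by_topic: Dict[str, List[Tuple[Any, Any]]] = {}
    for tp, messages in records.items():
        by_topic.setdefault(tp.topic, []).extend(
            (tp, message) for message in messages)
    return {topic: iter(items) for topic, items in by_topic.items()}


def roundrobin(records: Dict[Any, List[Any]]) -> Iterator[Tuple[Any, Any]]:
    # Yield one record per topic per pass, so a topic with many prefetched
    # records cannot starve a topic with few.
    index = records_to_topic_index(records)
    sentinel = object()
    to_remove: Set[str] = set()
    while index:
        for topic in to_remove:
            index.pop(topic, None)
        to_remove.clear()
        for topic, it in index.items():
            item = next(it, sentinel)
            if item is sentinel:
                to_remove.add(topic)  # exhausted; drop on the next pass
                continue
            yield item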