def get_graph_data(self):
    """Refresh the buffered data from Kafka and return it as plain lists.

    A short-lived consumer is created from ``self.conf``, assigned to
    ``self.partition`` and handed to ``self.__update_que``. The consumer is
    always closed afterwards so repeated calls do not leak client handles.

    Returns:
        dict: every key of ``self.data`` mapped to a ``list`` copy of its
        current values.
    """
    consumer = Consumer(self.conf)
    try:
        consumer.subscribe([self.topic])
        # update low and high offsets (don't work without it)
        consumer.get_watermark_offsets(self.partition)
        # set local offset
        consumer.assign([self.partition])
        self.__update_que(consumer)
    finally:
        # Release the client even if the update fails.
        consumer.close()
    # convert data to compatible format
    return {key: list(value) for key, value in self.data.items()}
def kafka_GetOffset(self, p_szTopicName, p_szGroupID=''):
    """Return the low/high watermark offsets of every partition of a topic.

    Args:
        p_szTopicName: name of the topic to inspect.
        p_szGroupID: ``group.id`` used for the temporary consumer.

    Returns:
        A list of ``[partition_id, low, high]`` entries, one per partition.

    Raises:
        SQLCliException: if no Kafka server is configured, or the topic
            metadata lists no partitions.
        KafkaException: re-raised unchanged from the Kafka client.
    """
    if self.__kafka_servers__ is None:
        raise SQLCliException(
            "Missed kafka server information. Please use set kafka server first .."
        )
    c = Consumer({
        'bootstrap.servers': self.__kafka_servers__,
        'group.id': p_szGroupID,
    })
    m_OffsetResults = []
    try:
        topic_metadata = c.list_topics(topic=p_szTopicName).topics[p_szTopicName]
        for pid in topic_metadata.partitions.keys():
            tp = TopicPartition(p_szTopicName, pid)
            (low, high) = c.get_watermark_offsets(tp)
            m_OffsetResults.append([pid, low, high])
        if len(m_OffsetResults) == 0:
            raise SQLCliException("Topic [" + p_szTopicName + "] does not exist!")
        return m_OffsetResults
    except KafkaException:
        if "SQLCLI_DEBUG" in os.environ:
            # traceback.print_exc() returns None, so only the formatted
            # traceback is printed here.
            print('traceback.format_exc():\n%s' % traceback.format_exc())
        # Bare raise keeps the original traceback intact.
        raise
    finally:
        # The consumer is only needed for this query; always release it.
        c.close()
class KafkaConsumer(object):
    """Small helper around a ``Consumer`` for inspecting and resetting one topic."""

    def __init__(self, group_id, topic):
        """Create the underlying consumer for ``group_id`` and remember ``topic``."""
        consumer_conf = {
            'bootstrap.servers': KAFKA_SERVER_HOSTS,
            'group.id': group_id,
            'session.timeout.ms': 6000,
            'default.topic.config': {
                'auto.offset.reset': 'smallest'
            }
        }
        self.client = Consumer(consumer_conf)
        self.topic = topic

    def query_kafka(self, max_part):
        """Print partition, low watermark, committed offset, high watermark and
        (high watermark - committed offset) for partitions ``0..max_part-1``."""
        for index in range(max_part):
            target = TopicPartition(self.topic, index)
            committed_entry = self.client.committed([target])[0]
            low_mark, high_mark = self.client.get_watermark_offsets(target)
            current = committed_entry.offset
            row = (committed_entry.partition, low_mark, current, high_mark,
                   high_mark - current)
            print("%d %d %d %d %d" % row)

    def reset_kafka(self, tps):
        """Assign each given partition in turn, print it, and poll once."""
        for item in tps:
            self.client.assign([item])
            print(item)
            self.client.poll()

    def close(self):
        """Close the underlying consumer."""
        self.client.close()
def test_any_method_after_close_throws_exception():
    """Every consumer method must raise RuntimeError('Consumer closed') after close()."""
    c = Consumer({'group.id': 'test',
                  'enable.auto.commit': True,
                  'enable.auto.offset.store': False,
                  'socket.timeout.ms': 50,
                  'session.timeout.ms': 100})

    c.subscribe(["test"])
    c.unsubscribe()
    c.close()

    # Each entry invokes one API call on the closed consumer; arguments are
    # built lazily so they are constructed inside the pytest.raises block.
    calls_on_closed_consumer = (
        lambda: c.subscribe(['test']),
        lambda: c.unsubscribe(),
        lambda: c.poll(),
        lambda: c.consume(),
        lambda: c.assign([TopicPartition('test', 0)]),
        lambda: c.unassign(),
        lambda: c.assignment(),
        lambda: c.commit(),
        lambda: c.committed([TopicPartition("test", 0)]),
        lambda: c.position([TopicPartition("test", 0)]),
        lambda: c.seek([TopicPartition("test", 0, 0)]),
        lambda: c.get_watermark_offsets(TopicPartition("test", 0)),
    )

    for invoke in calls_on_closed_consumer:
        with pytest.raises(RuntimeError) as ex:
            invoke()
        assert ex.match('Consumer closed')
def get_latest_applied(client_options, topic_name, read_timeout=1.0):
    """Read the value of the last message in partition 0 of ``topic_name``.

    Args:
        client_options: base consumer configuration. It is copied, so the
            caller's mapping is left untouched.
        topic_name: topic to read from (only partition 0 is used).
        read_timeout: seconds to wait for the message.

    Returns:
        The UTF-8 decoded message value, or ``None`` if nothing was read
        within ``read_timeout``.
    """
    # Work on a copy: the overrides below are specific to this read and must
    # not leak into the configuration object owned by the caller.
    options = dict(client_options)
    options.update({
        'auto.offset.reset': 'latest',
        'enable.auto.commit': False,
    })
    c = Consumer(options)
    try:
        low, high = c.get_watermark_offsets(TopicPartition(topic_name, 0))
        if low is not None and high is not None and high > 0:
            # The high watermark is the offset after the last message.
            last_msg_offset = high - 1
        else:
            last_msg_offset = 0
        c.assign([TopicPartition(topic_name, 0, last_msg_offset)])

        messages = c.consume(num_messages=1, timeout=read_timeout)
        if messages:
            return messages[0].value().decode('utf-8')
        return None
    finally:
        # Close on every path, including errors raised by the calls above.
        c.close()
def get_metrics_for_partition(consumer: Consumer, partition: TopicPartition) -> dict:
    """Compute watermark and lag metrics for a single partition.

    Watermarks are queried from the broker (``cached=False``) with a 5 second
    timeout. ``partition.offset`` is used as the group's committed offset.

    Returns:
        dict with keys ``topic_name``, ``partition_id``, ``high``, ``low``,
        ``lag`` and ``offset``.

    Raises:
        Exception: if the watermark query returns ``None``.
    """
    timeout = 5
    watermarks = consumer.get_watermark_offsets(partition, timeout=timeout, cached=False)
    if watermarks is None:
        raise Exception(f'Getting watermarks for partition:{partition.partition} on topic: {partition.topic} has taken longer than timeout {timeout} seconds')
    low, high = watermarks

    # Negative offsets / watermarks are sentinel values:
    #   confluent_kafka.OFFSET_BEGINNING == -2
    #   confluent_kafka.OFFSET_END == -1
    #   confluent_kafka.OFFSET_STORED == -1000
    #   confluent_kafka.OFFSET_INVALID == -1001
    if high < 0:
        # Unlikely: no valid high watermark, report no lag.
        lag = 0
    else:
        # Without a committed offset the whole retained range counts as lag;
        # the real message count may be lower because of compaction and
        # record deletions.
        baseline = low if partition.offset < 0 else partition.offset
        lag = high - baseline

    metrics = {}
    metrics["topic_name"] = partition.topic
    metrics["partition_id"] = partition.partition
    metrics["high"] = high
    metrics["low"] = low
    metrics["lag"] = lag
    metrics["offset"] = partition.offset
    return metrics
def __init__(self, topic, group, que_len=180):
    """Set up the bounded buffers and preload the most recent messages.

    Args:
        topic: Kafka topic to read from (partition 0 only).
        group: consumer ``group.id``.
        que_len: maximum number of data points kept per series.
    """
    self.topic = topic
    self.conf = {
        'bootstrap.servers': 'localhost:9092',
        'group.id': group,
        'enable.auto.commit': True,
    }
    # the application needs a maximum of `que_len` data units per series
    self.data = {
        'time': deque(maxlen=que_len),
        'Latitude': deque(maxlen=que_len),
        'Longitude': deque(maxlen=que_len),
        'Altitude': deque(maxlen=que_len)
    }
    self.partition = TopicPartition(topic=self.topic, partition=0)

    consumer = Consumer(self.conf)
    try:
        consumer.subscribe([self.topic])
        # download the last `que_len` messages
        low_offset, high_offset = consumer.get_watermark_offsets(
            self.partition)
        # Move the offset back by `que_len` messages, but never below the low
        # watermark: older records may already have been removed from the log.
        self.partition.offset = max(low_offset, high_offset - que_len)
        # set the moved offset to consumer
        consumer.assign([self.partition])
        self.__update_que(consumer)
    finally:
        # This consumer is only used for the initial load.
        consumer.close()
def test_any_method_after_close_throws_exception():
    """Every consumer method must raise RuntimeError('Consumer closed') after close()."""
    c = Consumer({'group.id': 'test',
                  'enable.auto.commit': True,
                  'enable.auto.offset.store': False,
                  'socket.timeout.ms': 50,
                  'session.timeout.ms': 100})

    c.subscribe(["test"])
    c.unsubscribe()
    c.close()

    # Each entry invokes one API call on the closed consumer; arguments are
    # built lazily so they are constructed inside the pytest.raises block.
    calls_on_closed_consumer = (
        lambda: c.subscribe(['test']),
        lambda: c.unsubscribe(),
        lambda: c.poll(),
        lambda: c.consume(),
        lambda: c.assign([TopicPartition('test', 0)]),
        lambda: c.unassign(),
        lambda: c.assignment(),
        lambda: c.commit(),
        lambda: c.committed([TopicPartition("test", 0)]),
        lambda: c.position([TopicPartition("test", 0)]),
        lambda: c.seek([TopicPartition("test", 0, 0)]),
        lambda: c.get_watermark_offsets(TopicPartition("test", 0)),
    )

    for invoke in calls_on_closed_consumer:
        with pytest.raises(RuntimeError) as ex:
            invoke()
        assert 'Consumer closed' == str(ex.value)
class KafkaQueryConsumer:
    """
    Thin wrapper over the Kafka consumer calls used to query broker metadata
    and to poll single messages, so that a fake can stand in for it in unit
    tests.
    """

    def __init__(self, broker: str):
        # "enable.auto.commit" is False: there is no need to report progress
        # to the broker; after a crash the process is simply restarted and
        # looks for the last run_start message again.
        #
        # "queued.min.messages" is 1 because the partition is consumed
        # backwards one message at a time; prefetching several messages in the
        # forward direction on every one-offset step back would be wasted.
        conf = dict([
            ("bootstrap.servers", broker),
            ("group.id", "consumer_group_name"),
            ("auto.offset.reset", "latest"),
            ("enable.auto.commit", False),
            ("queued.min.messages", 1),
        ])
        self._consumer = Consumer(**conf)

    def get_topic_partitions(self, topic: str, offset: int = -1):
        """ Build a TopicPartition, at the given offset, for every partition of the topic """
        topic_metadata = self._consumer.list_topics(topic).topics[topic]
        result = []
        for partition_metadata in topic_metadata.partitions.values():
            result.append(
                TopicPartition(topic, partition_metadata.id, offset=offset))
        return result

    def seek(self, partition: TopicPartition):
        """ Move the consumer to the offset carried by the given partition """
        self._consumer.seek(partition)

    def poll(self, timeout=2.):
        """ Poll Kafka for one message, waiting up to ``timeout`` seconds """
        message = self._consumer.poll(timeout=timeout)
        return message

    def get_watermark_offsets(self,
                              partition: TopicPartition) -> Tuple[int, int]:
        """ Query the broker (uncached) for the low and high watermark offsets of the partition """
        watermarks = self._consumer.get_watermark_offsets(partition,
                                                          cached=False)
        return watermarks

    def assign(self, partitions: List[TopicPartition]):
        """ Assign the given partitions to the consumer """
        self._consumer.assign(partitions)

    def offsets_for_times(self, partitions: List[TopicPartition]):
        """ Forward to the consumer's ``offsets_for_times`` and return its result """
        resolved = self._consumer.offsets_for_times(partitions)
        return resolved
def consume_everything(topic):
    """Consume partition 0 of ``topic`` from offset 0 and return the messages.

    A throw-away consumer with a random ``group.id`` is used and always closed
    before returning.

    Returns:
        list: the consumed messages; empty when the computed count is not
        positive (for example, an empty topic).
    """
    consumer = Consumer({
        "bootstrap.servers": "localhost:9092",
        "group.id": str(uuid.uuid4())
    })
    try:
        topicpart = TopicPartition(topic, 0, 0)
        consumer.assign([topicpart])
        _, high = consumer.get_watermark_offsets(topicpart)
        # NOTE(review): the count is deliberately left at ``high - 1`` as in the
        # original; confirm whether the final offset is meant to be skipped.
        wanted = high - 1
        if wanted <= 0:
            # consume() rejects negative counts; nothing to read here.
            return []
        return consumer.consume(wanted)
    finally:
        consumer.close()
def poll_everything(topic):
    """Consume partition 0 of ``topic`` from offset 0 and return the messages.

    A throw-away consumer with a random ``group.id`` is used and always closed
    before returning.

    Returns:
        list: the consumed messages; empty when the computed count is not
        positive (for example, an empty topic).
    """
    consumer = Consumer({
        'bootstrap.servers': 'localhost:9092',
        'group.id': str(uuid.uuid4())
    })
    try:
        topicpart = TopicPartition(topic, 0, 0)
        consumer.assign([topicpart])
        _, high = consumer.get_watermark_offsets(topicpart)
        # NOTE(review): the count is deliberately left at ``high - 1`` as in the
        # original; confirm whether the final offset is meant to be skipped.
        wanted = high - 1
        if wanted <= 0:
            # consume() rejects negative counts; nothing to read here.
            return []
        return consumer.consume(wanted)
    finally:
        consumer.close()
def count_messages(bootstrap_servers):
    """Print the high watermark of every partition of every topic.

    One line ``"<topic> <partition>: <high>"`` is printed per partition. The
    consumer used for the queries is closed before returning.

    Args:
        bootstrap_servers: value for the ``bootstrap.servers`` setting.
    """
    c = Consumer({'bootstrap.servers': bootstrap_servers,
                  'group.id': 'group2',
                  'enable.auto.commit': False,
                  'auto.offset.reset': 'beginning'})
    try:
        topics = c.list_topics().topics
        for topic, topic_metadata in topics.items():
            # Iterating the partitions mapping yields the partition ids.
            for partition in topic_metadata.partitions:
                _, high = c.get_watermark_offsets(TopicPartition(topic, partition))
                print(f"{topic} {partition}: {high}")
    finally:
        c.close()
def initialize_from_kafka(self, kafka_topic: str, kafka_config: Dict[str, Any]) -> None:
    """Replay partition 0 of ``kafka_topic`` into ``self.update``.

    Does nothing when ``kafka_topic`` is empty. Otherwise every message of
    partition 0 is read from the start, decoded as UTF-8 and passed to
    ``self.update``; reading stops at the first poll that yields no message
    within 2 seconds.

    TODO: this method does not fail if client can't connect to host.

    Raises:
        Exception: if the watermark query returns nothing.
        KafkaException: if a polled message carries an error.
        AssertionError: if reading stopped before the high watermark.
    """
    if not kafka_topic:
        return

    print("Fetching state from kafka topic: {}".format(kafka_topic), file=sys.stderr)

    conf = kafka_config.copy()
    conf.update({
        "group.id": "dummy_init_group",  # should never be committed
        "enable.auto.commit": False,
        "auto.offset.reset": "earliest",
        "session.timeout.ms": 10000,
    })
    consumer = Consumer(conf)
    try:
        # this watermark fetch is mostly to ensure we are connected to broker
        # and fail fast if not, but we also confirm that we read to end below.
        hwm = consumer.get_watermark_offsets(TopicPartition(kafka_topic, 0),
                                             timeout=5.0,
                                             cached=False)
        if not hwm:
            raise Exception(
                "Kafka consumer timeout, or topic {} doesn't exist".format(
                    kafka_topic))

        consumer.assign([TopicPartition(kafka_topic, 0, 0)])
        c = 0
        # Offset following the last message read; starts at the low watermark
        # so an empty partition trivially satisfies the check below.
        next_offset = hwm[0]
        while True:
            msg = consumer.poll(timeout=2.0)
            # NOTE(review): `not msg` may also be true for a message with an
            # empty payload, not only for a poll timeout - confirm intent.
            if not msg:
                break
            if msg.error():
                raise KafkaException(msg.error())
            self.update(msg.value().decode("utf-8"))
            c += 1
            next_offset = msg.offset() + 1
    finally:
        # Close on every path, including the error raises above.
        consumer.close()

    # verify that we got at least to HWM. Offsets are compared instead of the
    # message count, because the count is smaller than the high watermark
    # whenever the log does not start at offset 0 or has gaps.
    assert next_offset >= hwm[1], \
        "stopped at offset {} before high watermark {}".format(next_offset, hwm[1])
    print("... got {} state update messages, done".format(c), file=sys.stderr)
def get_kafka_old_offset(topic, kafka_broker, partition_count):
    """Get the oldest Kafka offset of each partition of ``topic``.

    The result is meant to be compared with the offsets read later by the
    batch_loader. ``confluent_kafka`` is used when it can be imported;
    otherwise the ``kafka-run-class kafka.tools.GetOffsetShell`` command is
    run once per partition.

    Args:
        topic: topic name.
        kafka_broker: broker list / bootstrap servers string.
        partition_count: number of partitions queried by the shell fallback.

    Returns:
        dict: partition id -> oldest (low watermark) offset.
    """
    kafka_old_offset = {}
    try:
        from confluent_kafka import TopicPartition, Consumer, KafkaException
        from confluent_kafka.admin import AdminClient
        conf = {'bootstrap.servers': kafka_broker, 'session.timeout.ms': 6000}
        try:
            admin_client = AdminClient(conf)
            consumer_client = Consumer(conf)
            try:
                md = admin_client.list_topics(timeout=10)
                for t in md.topics.values():
                    if str(t) == topic:
                        for p in t.partitions.values():
                            td = TopicPartition(str(t), p.id)
                            oldest_offset, _ = consumer_client.get_watermark_offsets(
                                td)
                            kafka_old_offset[p.id] = oldest_offset
            finally:
                # The consumer is only needed for the watermark queries.
                consumer_client.close()
        except KafkaException as e:
            logger.error("请检查kafka是否存活:%s" % e)
    except ImportError:
        for partition_id in range(partition_count):
            command = 'kafka-run-class kafka.tools.GetOffsetShell --topic %s --broker-list %s --time -2 --partition %d' % (
                topic, kafka_broker, partition_id)
            args = shlex.split(command)
            process = subprocess.Popen(args,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
            # communicate() drains both pipes and waits for the child, so the
            # process cannot block on a full stderr pipe or be left unreaped.
            stdout_data, _ = process.communicate()
            output = '{}'.format(stdout_data.decode(encoding='UTF-8'))
            offset = output.split(':')[2]
            kafka_old_offset[partition_id] = int(offset)
    return kafka_old_offset
class KafkaConsumer:
    """Consumer-backed helper reporting partition counts and offsets of a topic."""

    def __init__(self, conf, group_id='kafka-rest-service'):
        """Create the consumer from a copy of ``conf`` with ``group.id`` set."""
        settings = dict(conf)
        settings['group.id'] = group_id
        self.consumer = Consumer(settings)

    # (result caching via @cached / TTLCache is currently disabled)
    def get_topic_partition_count(self, topic_name):
        """Return the number of partitions of the topic, 0 if it is not listed."""
        cluster_metadata = self.consumer.list_topics(topic_name)
        topic_metadata = cluster_metadata.topics.get(topic_name, None)
        return len(topic_metadata.partitions) if topic_metadata else 0

    # (result caching via @cached / TTLCache is currently disabled)
    def get_topic_offsets(self, topic_name):
        """Report, per partition, the high watermark and a label for the
        smallest look-back window in which ``offsets_for_times`` found an offset.

        Returns:
            dict: an ``ERROR`` report when the topic has no partitions,
            otherwise a ``SUCCESS`` report whose ``offsets`` maps partition id
            to ``[high_watermark, label]``.
        """
        pcount = self.get_topic_partition_count(topic_name)
        if pcount == 0:
            return dict(error=f"Requested topic {topic_name} not found",
                        status="ERROR",
                        report=None)

        part_status_map = {}
        for p in range(pcount):
            _, high = self.consumer.get_watermark_offsets(
                TopicPartition(topic_name, p))
            part_status_map[p] = [high, '1 month']

        # Windows go from widest to narrowest, so a later match overwrites
        # the label set by an earlier one.
        windows = (
            (60 * 24 * 7, '1 week'),
            (60 * 24, '1 day'),
            (60, '1 hour'),
            (10, '10 minutes'),
            (1, '1 minute'),
        )
        for minutes, label in windows:
            cutoff = (datetime.now() - timedelta(minutes=minutes)).timestamp()
            cutoff_ms = int(cutoff) * 1000
            lookups = [
                TopicPartition(topic_name, p, cutoff_ms) for p in range(pcount)
            ]
            for resolved in self.consumer.offsets_for_times(lookups):
                if resolved.offset > -1:
                    part_status_map[resolved.partition][-1] = label

        offsets = {k: list(v) for k, v in part_status_map.items()}
        return dict(error=None,
                    status="SUCCESS",
                    topic=topic_name,
                    offsets=offsets)
def morning_notice():
    """Fetch and print the newest message of partitions 0 and 1 of topic 'test'.

    The consumer positions of both partitions are printed before and after
    each poll.
    """
    # Each stock gets 1 topic with 5 partitions: partition 0 holds the
    # snapshot fetched from futu, partition 1 the futu real-time quotes,
    # partition 2 the real-time K-line, partition 3 the real-time intraday
    # data, partition 4 the real-time tick-by-tick data, partition 5 the
    # real-time order book, partition 6 the real-time broker queue, and
    # partitions 7-9 are unused for now.
    consumer = Consumer({
        'bootstrap.servers': 'kafka01',
        'group.id': 'test',
        'enable.auto.commit': False,
        'default.topic.config': {
            'auto.offset.reset': 'largest'
        }
    })

    _, rise_ratio_list_largest = consumer.get_watermark_offsets(
        TopicPartition('test', 0))
    _, volume_list_largest = consumer.get_watermark_offsets(
        TopicPartition('test', 1))

    def show_positions():
        # Print the current position of partition 0, then of partition 1.
        print(consumer.position([TopicPartition('test', 0)]))
        print(consumer.position([TopicPartition('test', 1)]))

    try:
        rise_ratio_offset = rise_ratio_list_largest - 1
        consumer.assign([TopicPartition('test', 0, rise_ratio_offset)])
        consumer.seek(TopicPartition('test', 0, rise_ratio_offset))
        show_positions()
        latest_rise_ratio = consumer.poll(3.0)
        show_positions()
        print(latest_rise_ratio)

        volume_offset = volume_list_largest - 1
        consumer.assign([TopicPartition('test', 1, volume_offset)])
        consumer.seek(TopicPartition('test', 1, volume_offset))
        show_positions()
        latest_volume = consumer.poll(3.0).value()
        show_positions()
        print(latest_volume)
    finally:
        consumer.close()
def get_kafka_old_offset(topic, kafka_broker, partition_count):
    """Get the oldest Kafka offset of each partition of ``topic``.

    ``confluent_kafka`` is used when it can be imported; otherwise the
    ``kafka-run-class kafka.tools.GetOffsetShell`` command is run once per
    partition through ``utils.shell_wrapper.check_output``.

    Args:
        topic: topic name.
        kafka_broker: broker list / bootstrap servers string.
        partition_count: number of partitions queried by the shell fallback.

    Returns:
        dict: partition id -> oldest (low watermark) offset.
    """
    kafka_old_offset = {}
    try:
        # Get kafka offset through confluent_kafka module
        from confluent_kafka import TopicPartition, Consumer, KafkaException
        from confluent_kafka.admin import AdminClient
        conf = {'bootstrap.servers': kafka_broker, 'session.timeout.ms': 6000}
        admin_client = AdminClient(conf)
        consumer_client = Consumer(conf)
        try:
            md = admin_client.list_topics(timeout=10)
            for t in md.topics.values():
                if str(t) == topic:
                    for p in t.partitions.values():
                        td = TopicPartition(str(t), p.id)
                        oldest_offset, _ = consumer_client.get_watermark_offsets(
                            td)
                        kafka_old_offset[p.id] = oldest_offset
        finally:
            # The consumer is only needed for the watermark queries.
            consumer_client.close()
    except ImportError:
        for partition_id in range(partition_count):
            command = 'kafka-run-class kafka.tools.GetOffsetShell --topic {} --broker-list {} --time -2 --partition {}'.format(
                topic, kafka_broker, partition_id)
            output = utils.shell_wrapper.check_output(command)
            # The offset is the third ':'-separated field of the output.
            offset = output.split(':')[2]
            kafka_old_offset[partition_id] = int(offset)
    return kafka_old_offset
def get_last_available_status_message(cons: Consumer, status_topic: str):
    """
    Fetch the newest message of partition 0 of the status topic.

    The high watermark of the consumer's first assigned partition is queried
    from the broker, the consumer is re-assigned to partition 0 of
    ``status_topic`` at the offset just below it, and the message is read with
    ``poll_for_valid_message``.

    :param cons: Consumer that already has at least one partition assigned.
    :param status_topic: Name of the topic to read the status message from.
    :return: The last status message.
    """
    first_assigned = cons.assignment()[0]
    _, high_watermark = cons.get_watermark_offsets(first_assigned,
                                                   cached=False,
                                                   timeout=2.0)
    target = TopicPartition(status_topic,
                            partition=0,
                            offset=high_watermark - 1)
    cons.assign([target])
    status_msg, _ = poll_for_valid_message(cons, expected_file_identifier=None)
    return status_msg
def get_all_available_messages(consumer: Consumer):
    """
    Poll until as many messages were received as lie between the low and high
    watermark of the consumer's first assigned partition.

    Polls that time out or return an error message are skipped and retried.

    :param consumer: The consumer object
    :return: list of messages, empty if none available
    """
    low_offset, high_offset = consumer.get_watermark_offsets(
        consumer.assignment()[0], cached=False)
    expected_count = high_offset - low_offset

    collected = []
    while len(collected) < expected_count:
        candidate = consumer.poll(timeout=2.0)
        if candidate is not None and not candidate.error():
            collected.append(candidate)
    return collected
def most_recent_message(topic, kafka_config):
    """
    Tries to fetch the most recent message from a given topic.

    This only makes sense for single partition topics (it works with only the
    first partition), though could be extended with "last N" behavior.

    The consumer is closed on every path, including the early return for a new
    topic and the error raises.

    Returns:
        The value of the last message, or ``None`` when the high watermark of
        partition 0 is 0.

    Raises:
        Exception: if the watermark query returns nothing or no message
            arrives within 2 seconds.
        KafkaException: if the polled message carries an error.
    """
    print("Fetching most Kafka message from {}".format(topic))

    conf = kafka_config.copy()
    conf.update({
        'group.id': 'worker-init-last-msg',  # should never commit
        'delivery.report.only.error': True,
        'enable.auto.commit': False,
        'default.topic.config': {
            'request.required.acks': -1,
            'auto.offset.reset': 'latest',
        },
    })

    consumer = Consumer(conf)
    try:
        hwm = consumer.get_watermark_offsets(TopicPartition(topic, 0),
                                             timeout=5.0,
                                             cached=False)
        if not hwm:
            raise Exception(
                "Kafka consumer timeout, or topic {} doesn't exist".format(topic))
        print("High watermarks: {}".format(hwm))

        if hwm[1] == 0:
            print("topic is new; not 'most recent message'")
            return None

        # The high watermark is the offset after the last message.
        consumer.assign([TopicPartition(topic, 0, hwm[1] - 1)])
        msg = consumer.poll(2.0)
    finally:
        consumer.close()

    if not msg:
        raise Exception("Failed to fetch most recent kafka message")
    if msg.error():
        raise KafkaException(msg.error())
    return msg.value()
class KafkaClient(object):
    """Lazily created Kafka producer and consumer bound to a single topic.

    The producer is created on the first ``produce`` call and the consumer on
    the first ``consume`` call. ``guid`` is used as the consumer ``group.id``;
    a random UUID string is generated when none is given.
    """

    def __init__(self,
                 kafka_bootstrap_servers,
                 kafka_topic,
                 guid=None,
                 partition=None):
        # Initialise the client handles first so that __del__ stays safe even
        # when the constructor raises below.
        self.p = None
        self.c = None

        self.kafka_bootstrap_servers = kafka_bootstrap_servers
        self.kafka_topic = kafka_topic

        if partition:
            raise NotImplementedError("multiple partitions not supported yet")

        self.guid = guid
        if not self.guid:
            self.guid = str(uuid4())

    def produce(self, key, val):
        """Produce one record to the topic.

        Arguments are validated before the producer is created. If the local
        queue is full (``BufferError``) the producer is flushed and the record
        is produced once more.

        Raises:
            TypeError: if ``key`` is not ``bytes`` or ``val`` is neither
                ``bytes`` nor ``None``.
        """
        if not isinstance(key, bytes):
            raise TypeError(
                'producing to kafka requires key to be raw bytes')
        if not isinstance(val, bytes) and val is not None:
            raise TypeError(
                'producing to kafka requires val to be raw bytes or None')

        if not self.p:
            self.p = Producer({
                'bootstrap.servers': self.kafka_bootstrap_servers,
                'api.version.request': True
            })
        try:
            self.p.produce(topic=self.kafka_topic, value=val, key=key)
        except BufferError:
            self.p.flush()
            self.p.produce(topic=self.kafka_topic, value=val, key=key)

    def flush_producer(self):
        """Flush the producer if one has been created."""
        if self.p:
            self.p.flush()

    def consume(self):
        """Yield ``(key, value, timestamp)`` for messages up to the watermarks.

        The high watermark of every assigned partition is recorded after an
        initial poll; messages are then yielded until each of those partitions
        has reached its recorded last offset.
        """
        if not self.c:
            self.c = Consumer({
                'bootstrap.servers': self.kafka_bootstrap_servers,
                'group.id': self.guid,
                'api.version.request': True,
                'log.connection.close': False,
                'socket.keepalive.enable': True,
                'session.timeout.ms': 6000,
                'default.topic.config': {
                    'auto.offset.reset': 'smallest'
                }
            })
            self.c.subscribe([self.kafka_topic])

        # must perform an initial poll to get partition assignments
        first_message = True
        msg = self.c.poll(timeout=10.0)

        # grab watermarks from partition
        partitionobjs = self.c.assignment()
        partitions = {}
        for prt in partitionobjs:
            partition = prt.partition
            last_offset = self.c.get_watermark_offsets(prt)[1] - 1
            if last_offset < 0:  # if nothing in partition then this will be -1
                continue
            # if never read before then call returns -1001 for some reason
            position = max(self.c.position([prt])[0].offset - 1, -1)
            if last_offset > position:
                partitions[partition] = last_offset

        # process partitions up to watermarks (but remember that we already
        # consumed a message, so need to yield that)
        while first_message or len(partitions) > 0:
            if not first_message:
                msg = self.c.poll(timeout=10.0)
            else:
                first_message = False
            # NOTE: "if not msg" checks if message len = 0, which is different
            # from checking "if msg is None"
            if msg is None or msg.error():
                continue  # ignore errors
            partition = msg.partition()
            # first check is because we might read past the watermark for a
            # partition that we're already done with... but that's ok
            if partition in partitions and msg.offset() >= partitions[partition]:
                del partitions[partition]
            yield msg.key(), msg.value(), msg.timestamp()[1]

    def __del__(self):
        self.flush_producer()
        if self.c:
            self.c.close()
class TimeOrderedGeneratorWithTimeout(GeneratorInterface):
    """
    A general generator which can read multiple topics and merge their messages in time order.
    A message must be emitted at (arrival_system_time + latency_ms).
    In batch mode (until reaching the first EOP on each stream) the generator will not discard any messages.
    """

    def __init__(self,
                 broker,
                 groupid,
                 topics_infos: List[TopicInfo],
                 latency_ms,
                 commit_interval_sec=None,
                 group_by_time=False,
                 begin_timestamp=None,
                 begin_flag=None,
                 end_timestamp=None,
                 end_flag=None,
                 heartbeat_interval_ms=-1):
        """
        :param broker: Broker to connect to.
        :param groupid: Group id of the consumer.
        :param topics_infos: [TopicInfo()] - list of TopicInfo objects.
        :param latency_ms: (integer >=0) Latency to wait before serving a message.
                            After this messages with lower or equal timestamps will be discarded.
        :param commit_interval_sec: How many seconds to wait between commits. None disables committing.
        :param group_by_time: Group messages with the same timestamp. This will yield a list of messages.
        :param begin_timestamp: Timestamp of the kafka messages where the generator will start.
        :param begin_flag: BEGINNING, CONTINUE, LIVE - CONTINUE will continue from the last committed offset.
                            If there was no committed offset will start from the end of the stream.
        :param end_timestamp: Timestamp where to end the reading.
        :param end_flag: NEVER, END_OF_PARTITION
        :param heartbeat_interval_ms: -1 does not produce heartbeat. After every interval will produce a HeartBeat typed
                                        message with the timestamp.
        :raises Exception: on contradictory or unknown begin/end settings.
        """
        if begin_timestamp is not None and begin_flag is not None:
            raise Exception(
                'You can not set the begin timestamp and a flag in the same time.'
            )
        if end_timestamp is not None and end_flag is not None:
            raise Exception(
                'You can not set the end timestamp and a flag in the same time.'
            )
        if begin_timestamp is not None and end_timestamp is not None and begin_timestamp >= end_timestamp:
            raise Exception(
                'The begin timestamp is larger then the end timestamp.')
        if begin_flag is not None and end_flag is not None and \
                begin_flag == BeginFlag.LIVE and end_flag == EndFlag.END_OF_PARTITION:
            raise Exception(
                'You can not start in live and process until the end of the streams.'
            )
        if end_flag is not None and not (end_flag == EndFlag.END_OF_PARTITION
                                         or end_flag == EndFlag.NEVER):
            raise Exception(
                'Unknow end flag: {} . Please use the given enum to use proper end flag.'
                .format(end_flag))
        self.end_ts = end_timestamp
        self.end_flag = end_flag
        self.commit_interval_sec = commit_interval_sec
        self.latency_ms = latency_ms
        self.group_by_time = group_by_time
        self.consumer = Consumer({
            'bootstrap.servers': broker,
            'group.id': groupid,
            'enable.auto.commit': False,
            'auto.offset.reset': 'latest',
            'enable.partition.eof': True,
            'fetch.wait.max.ms': 50
        })
        self.tps = []
        self.queues = {}
        self.messages_to_be_committed = {}
        self.begin_timestamp = begin_timestamp
        for ti in topics_infos:
            topic_name = ti.topic
            self.messages_to_be_committed[topic_name] = {
                'last_msg': None,
                'committed': True
            }
            if begin_timestamp is not None:
                # offsets_for_times takes the timestamp in the offset field
                # and returns partitions carrying the resolved offsets.
                self.tps.extend(
                    self.consumer.offsets_for_times([
                        TopicPartition(topic_name,
                                       partition=ti.partition,
                                       offset=begin_timestamp)
                    ]))
            elif begin_flag is not None:
                if begin_flag == BeginFlag.BEGINNING:
                    self.tps.append(
                        TopicPartition(topic_name,
                                       partition=ti.partition,
                                       offset=OFFSET_BEGINNING))
                elif begin_flag == BeginFlag.CONTINUE:
                    self.tps.append(
                        TopicPartition(topic_name,
                                       partition=ti.partition,
                                       offset=OFFSET_STORED))
                elif begin_flag == BeginFlag.LIVE:
                    self.tps.append(
                        TopicPartition(topic_name,
                                       partition=ti.partition,
                                       offset=OFFSET_END))
                else:
                    raise Exception(
                        'Unknown begin flag. Please use the enum to provide proper begin flag.'
                    )
            else:
                self.tps.append(
                    TopicPartition(topic_name,
                                   partition=ti.partition,
                                   offset=OFFSET_END))
            end_offset = None
            if end_flag is not None and end_flag == EndFlag.END_OF_PARTITION:
                # Use the watermark of the partition that is actually read
                # (ti.partition), not a hard-coded partition 0.
                end_offset = self.consumer.get_watermark_offsets(
                    TopicPartition(topic_name, ti.partition))[1] - 1
            # A negative end offset means the partition holds no messages, so
            # no queue is created for it.
            if end_offset is None or end_offset >= 0:
                self.queues[topic_name] = Topic(topic_name,
                                                self.consumer,
                                                end_offset=end_offset,
                                                partition=ti.partition,
                                                drop=ti.drop)
        self.consumer.assign(self.tps)
        self.last_commit = time.time()
        self.running = True
        self.heartbeat_interval_ms = heartbeat_interval_ms
        self.next_hb = None

    def stopGenerator(self):
        """Ask the generator loops to stop at their next check of ``running``."""
        self.running = False

    def _serve_messages(self, message_to_serve):
        """Yield the given messages (as one list when grouping by time) and
        commit the last served message per topic once the commit interval
        has elapsed."""
        if self.commit_interval_sec is not None and self.group_by_time:
            for msg in message_to_serve:
                self.messages_to_be_committed[msg.topic()]['last_msg'] = msg
                self.messages_to_be_committed[msg.topic()]['committed'] = False

        # serve messages
        if self.group_by_time:
            yield message_to_serve
        else:
            for msg in message_to_serve:
                self.messages_to_be_committed[msg.topic()]['last_msg'] = msg
                self.messages_to_be_committed[msg.topic()]['committed'] = False
                yield msg
                if not self.running:
                    break

        # commit messages when they were delivered
        current_time = time.time()
        if self.commit_interval_sec is not None and (
                current_time - self.last_commit) > self.commit_interval_sec:
            for k in self.messages_to_be_committed.keys():
                if not self.messages_to_be_committed[k]['committed']:
                    self.consumer.commit(
                        self.messages_to_be_committed[k]['last_msg'])
                    self.messages_to_be_committed[k]['committed'] = True
            self.last_commit = current_time

    def _serve_heartbeat(self, current_timestamp_ms):
        """Yield every pending HeartBeat up to and including the given timestamp."""
        if self.next_hb is None:
            if self.begin_timestamp is not None:
                self.next_hb = self.begin_timestamp
            else:
                self.next_hb = current_timestamp_ms
        while self.next_hb <= current_timestamp_ms:
            yield HeartBeat(self.next_hb)
            self.next_hb += self.heartbeat_interval_ms

    def _can_serve(self):
        """Return the smallest queued event timestamp if it may be served now,
        otherwise None."""
        min_ets = min([
            q.queue[0].message.timestamp()[1]
            for q in self.queues.values() if len(q.queue) > 0
        ], default=-1)
        if min_ets == -1:
            return None
        deadline = getSystemTimestamp() - self.latency_ms
        if all([q.can_be_emitted(min_ets) for q in self.queues.values()]) and \
                any([q.queue[0].ts < deadline for q in self.queues.values()
                     if len(q.queue) > 0 and q.queue[0].message.timestamp()[1] == min_ets]):
            return min_ets
        else:
            return None

    def getMessages(self):
        """Generator yielding messages (and HeartBeats, when enabled) in
        timestamp order until stopped; closes the consumer on exit."""
        while self.running:
            if all([v.stopped for v in self.queues.values()]):
                # Every topic is finished: drain what is still queued, in
                # timestamp order, then leave the generator.
                message_to_serve = []
                for q in self.queues.values():
                    message_to_serve.extend(q.queue)
                message_to_serve = [m.message for m in message_to_serve]
                message_to_serve.sort(key=lambda x: x.timestamp()[1])
                while len(message_to_serve) > 0:
                    ts = message_to_serve[0].timestamp()[1]
                    serve_it = []
                    while len(message_to_serve) > 0 and message_to_serve[
                            0].timestamp()[1] == ts:
                        serve_it.append(message_to_serve.pop(0))
                    if not self.heartbeat_interval_ms == -1:
                        yield from self._serve_heartbeat(ts)
                    yield from self._serve_messages(serve_it)
                logging.debug('Exiting from generator.')
                break
            msg = self.consumer.poll(0.001)
            if msg is not None:
                if msg.error():
                    if msg.error().code() == KafkaError._PARTITION_EOF:
                        if msg.topic() in self.queues:
                            self.queues[msg.topic()].first_eop_reached = True
                            self.queues[msg.topic()].end_of_partition = True
                    else:
                        logging.error('Unhandle error: {}'.format(msg.error()))
                        break
                else:
                    self.queues[msg.topic()].end_of_partition = False
                    if self.end_ts is not None and msg.timestamp(
                    )[1] > self.end_ts:
                        self.queues[msg.topic()].stop_topic()
                    else:
                        self.queues[msg.topic()].add_message(msg)
            while self.running:
                event_ts_to_serve = self._can_serve()
                if event_ts_to_serve is None:
                    # Nothing can be served; with an open-ended stream that
                    # reached end of partition, keep heartbeats flowing.
                    if self.end_flag == EndFlag.NEVER and self.heartbeat_interval_ms != -1 \
                            and any([q.end_of_partition for q in self.queues.values()]):
                        if self.next_hb is None:
                            self.next_hb = getSystemTimestamp(
                            ) - self.latency_ms
                        yield from self._serve_heartbeat(getSystemTimestamp() -
                                                         self.latency_ms)
                    break
                if self.heartbeat_interval_ms != -1:
                    yield from self._serve_heartbeat(event_ts_to_serve)
                message_to_serve = []
                for q in self.queues.values():
                    message_to_serve.extend(q.get_messages(event_ts_to_serve))
                yield from self._serve_messages(message_to_serve)
                if self.end_ts is not None and self.end_ts <= event_ts_to_serve:
                    self.running = False
        self.consumer.close()
def test_basic_api():
    """ Basic API tests, these wont really do anything since there is no
        broker configured. """

    try:
        kc = Consumer()
    except TypeError as e:
        assert str(e) == "expected configuration dict"

    def dummy_commit_cb(err, partitions):
        pass

    kc = Consumer({'group.id': 'test', 'socket.timeout.ms': '100',
                   'session.timeout.ms': 1000,  # Avoid close() blocking too long
                   'on_commit': dummy_commit_cb})

    kc.subscribe(["test"])
    kc.unsubscribe()

    def dummy_assign_revoke(consumer, partitions):
        pass

    kc.subscribe(["test"], on_assign=dummy_assign_revoke, on_revoke=dummy_assign_revoke)
    kc.unsubscribe()

    msg = kc.poll(timeout=0.001)
    if msg is None:
        print('OK: poll() timeout')
    elif msg.error():
        print('OK: consumer error: %s' % msg.error().str())
    else:
        print('OK: consumed message')

    if msg is not None:
        assert msg.timestamp() == (TIMESTAMP_NOT_AVAILABLE, -1)

    partitions = list(map(lambda p: TopicPartition("test", p), range(0, 100, 3)))
    kc.assign(partitions)

    # Verify assignment
    assignment = kc.assignment()
    assert partitions == assignment

    # Get cached watermarks, should all be invalid.
    lo, hi = kc.get_watermark_offsets(partitions[0], cached=True)
    assert lo == -1001 and hi == -1001
    assert lo == OFFSET_INVALID and hi == OFFSET_INVALID

    # Query broker for watermarks, should raise an exception.
    try:
        lo, hi = kc.get_watermark_offsets(partitions[0], timeout=0.5, cached=False)
    except KafkaException as e:
        # e.args is a tuple: index it for the failure message.
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._WAIT_COORD, KafkaError.LEADER_NOT_AVAILABLE), \
            str(e.args[0])

    kc.unassign()

    # `async` is a reserved word since Python 3.7; the keyword argument is
    # spelled `asynchronous`.
    kc.commit(asynchronous=True)

    try:
        kc.commit(asynchronous=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._NO_OFFSET)

    # Get current position, should all be invalid.
    kc.position(partitions)
    assert len([p for p in partitions if p.offset == OFFSET_INVALID]) == len(partitions)

    try:
        kc.committed(partitions, timeout=0.001)
    except KafkaException as e:
        assert e.args[0].code() == KafkaError._TIMED_OUT

    kc.close()
def test_basic_api():
    """Exercise the basic Consumer API without a broker.

    No broker is configured, so none of these calls do any real work. The
    test only checks that the calls are accepted, that argument validation
    works, and that operations which need a broker fail with one of the
    expected error codes.
    """
    try:
        kc = Consumer()
    except TypeError as e:
        assert str(e) == "expected configuration dict"

    def dummy_commit_cb(err, partitions):
        pass

    kc = Consumer({
        'group.id': 'test',
        'socket.timeout.ms': '100',
        'session.timeout.ms': 1000,  # Avoid close() blocking too long
        'on_commit': dummy_commit_cb,
    })

    kc.subscribe(["test"])
    kc.unsubscribe()

    def dummy_assign_revoke(consumer, partitions):
        pass

    kc.subscribe(["test"], on_assign=dummy_assign_revoke,
                 on_revoke=dummy_assign_revoke)
    kc.unsubscribe()

    msg = kc.poll(timeout=0.001)
    if msg is None:
        print('OK: poll() timeout')
    elif msg.error():
        print('OK: consumer error: %s' % msg.error().str())
    else:
        print('OK: consumed message')

    if msg is not None:
        assert msg.timestamp() == (TIMESTAMP_NOT_AVAILABLE, -1)

    msglist = kc.consume(num_messages=10, timeout=0.001)
    assert len(msglist) == 0, "expected 0 messages, not %d" % len(msglist)

    # num_messages is range-checked before anything is consumed.
    with pytest.raises(ValueError) as ex:
        kc.consume(-100)
    assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value)

    with pytest.raises(ValueError) as ex:
        kc.consume(1000001)
    assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value)

    partitions = [TopicPartition("test", part) for part in range(0, 100, 3)]
    kc.assign(partitions)

    with pytest.raises(KafkaException) as ex:
        kc.seek(TopicPartition("test", 0, 123))
    assert 'Erroneous state' in str(ex.value)

    # Verify assignment
    assignment = kc.assignment()
    assert partitions == assignment

    # Pause partitions
    kc.pause(partitions)

    # Resume partitions
    kc.resume(partitions)

    # Get cached watermarks, should all be invalid.
    lo, hi = kc.get_watermark_offsets(partitions[0], cached=True)
    assert lo == -1001 and hi == -1001
    assert lo == OFFSET_INVALID and hi == OFFSET_INVALID

    # Query broker for watermarks, should raise an exception.
    try:
        lo, hi = kc.get_watermark_offsets(partitions[0], timeout=0.5,
                                          cached=False)
    except KafkaException as e:
        # e.args is a tuple: index it (calling it raised TypeError and hid
        # the real assertion failure).
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._WAIT_COORD,
                                    KafkaError.LEADER_NOT_AVAILABLE), \
            str(e.args[0])

    kc.unassign()

    kc.commit(asynchronous=True)

    try:
        kc.commit(asynchronous=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._NO_OFFSET)

    # Get current position, should all be invalid.
    kc.position(partitions)
    assert len([p for p in partitions
                if p.offset == OFFSET_INVALID]) == len(partitions)

    try:
        kc.committed(partitions, timeout=0.001)
    except KafkaException as e:
        assert e.args[0].code() == KafkaError._TIMED_OUT

    try:
        kc.list_topics(timeout=0.2)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._TRANSPORT)

    try:
        kc.list_topics(topic="hi", timeout=0.1)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._TRANSPORT)

    kc.close()
class KafkaHandler(BaseHandler[KafkaHandlerConfig]):
    """I/O handler that reads from and writes to one Kafka topic.

    The topic is ``config.topic_name``. The confluent-kafka ``Consumer`` and
    ``Producer`` are created lazily on first use and cached until
    :meth:`close` is called.
    """

    config_cls = KafkaHandlerConfig

    # Per partition id: has the consumer reported end-of-partition since the
    # last message was read from it? Filled by _get_consumer().
    _eof_reached: Dict[int, bool]

    OFFSET_AT_FIRST_MESSAGE = OFFSET_BEGINNING
    OFFSET_AFTER_LAST_MESSAGE = OFFSET_END

    # hopefully this number won't get assigned any semantics by the Kafka Devs any time soon
    OFFSET_AT_LAST_MESSAGE = -101

    def __init__(self, config: KafkaHandlerConfig):
        super().__init__(config)
        self._assignment_created = False
        self._seek = OFFSET_BEGINNING
        self._high_watermarks: Dict[int, int] = {}
        self._consumer: Optional[Consumer] = None
        self._producer: Optional[Producer] = None
        # Delivery errors collected by _delivery_callback until the next flush.
        self._errors: List[KafkaError] = []

    def _get_producer(self) -> Producer:
        """Return the cached producer, creating it on first use."""
        if self._producer is not None:
            return self._producer

        config_instance = esque_config.Config()
        with config_instance.temporary_context(self.config.esque_context):
            self._producer = Producer(
                config_instance.create_confluent_config(
                    include_schema_registry=False))
        return self._producer

    def _get_consumer(self) -> Consumer:
        """Return the cached consumer, creating it on first use.

        Creating the consumer also looks up the topic's partitions and fills
        ``_eof_reached`` and ``_high_watermarks`` for each of them.

        :raises EsqueIOHandlerReadException: if the topic metadata reports an
            error (the topic could not be found).
        """
        if self._consumer is not None:
            return self._consumer

        config_instance = esque_config.Config()
        with config_instance.temporary_context(self.config.esque_context):
            group_id = self.config.consumer_group_id
            consumer = Consumer({
                "group.id": group_id,
                "enable.partition.eof": True,
                "enable.auto.commit": False,
                **config_instance.create_confluent_config(
                    include_schema_registry=False),
            })

        topic_name = self.config.topic_name
        try:
            topic_metadata: TopicMetadata = consumer.list_topics(
                topic_name).topics[topic_name]
            if topic_metadata.error is not None:
                raise EsqueIOHandlerReadException(
                    f"Topic {topic_name!r} not found.")

            partition_ids = list(topic_metadata.partitions)
            self._eof_reached = {
                partition_id: False for partition_id in partition_ids
            }
            for partition_id in partition_ids:
                self._high_watermarks[partition_id] = (
                    consumer.get_watermark_offsets(
                        TopicPartition(topic=topic_name,
                                       partition=partition_id))[1])
        except Exception:
            # Do not cache (or leak) a consumer whose setup failed; otherwise
            # the next call would hand out a half-initialised handler state.
            consumer.close()
            raise

        self._consumer = consumer
        return consumer

    def get_serializer_configs(self) -> Tuple[Dict[str, Any], Dict[str, Any]]:
        """Not supported by this handler; always raises."""
        raise EsqueIOSerializerConfigNotSupported

    def put_serializer_configs(
            self, config: Tuple[Dict[str, Any], Dict[str, Any]]) -> None:
        """Not supported by this handler; always raises."""
        raise EsqueIOSerializerConfigNotSupported

    def write_message(
            self, binary_message: Union[BinaryMessage, StreamEvent]) -> None:
        """Produce one message and flush; stream events are ignored."""
        self._produce_single_message(binary_message=binary_message)
        self._flush()

    def write_many_messages(
        self, message_stream: Iterable[Union[BinaryMessage, StreamEvent]]
    ) -> None:
        """Produce every message of the stream, then flush once."""
        for binary_message in message_stream:
            self._produce_single_message(binary_message=binary_message)
        self._flush()

    def _produce_single_message(
            self, binary_message: Union[BinaryMessage, StreamEvent]) -> None:
        """Hand one message to the producer without flushing.

        ``StreamEvent`` objects carry no payload and are skipped.
        """
        if isinstance(binary_message, StreamEvent):
            return

        # Only pass ``partition`` when one was requested, so the producer's
        # own partitioning applies otherwise.
        partition_arg = {}
        partition = self._io_to_confluent_partition(binary_message.partition)
        if partition is not None:
            partition_arg["partition"] = partition

        self._get_producer().produce(
            topic=self.config.topic_name,
            value=binary_message.value,
            key=binary_message.key,
            headers=self._io_to_confluent_headers(binary_message.headers),
            timestamp=self._io_to_confluent_timestamp(
                binary_message.timestamp),
            on_delivery=self._delivery_callback,
            **partition_arg,
        )

    def _delivery_callback(self, err: Optional[KafkaError], msg: Message):
        """Record a delivery error so the next flush can report it."""
        if err is None:
            return
        self._errors.append(err)

    def _flush(self):
        """Flush the producer and raise if any delivery failed.

        :raises EsqueIOHandlerWriteException: listing every delivery error
            collected since the previous flush.
        """
        self._get_producer().flush()
        if self._errors:
            exception = EsqueIOHandlerWriteException(
                "The following exception(s) occurred while writing to Kafka:\n "
                + "\n ".join(map(str, self._errors)))
            self._errors.clear()
            raise exception

    @staticmethod
    def _io_to_confluent_partition(partition: int) -> Optional[int]:
        """Map a negative partition to ``None`` (no explicit partition)."""
        # TODO: introduce something like the config.send_timestamp flag to make it possible to always return None here.
        # This would allow for moving messages between topics with different amounts of partitions without making them
        # unbalanced.
        if partition < 0:
            return None
        return partition

    def _io_to_confluent_timestamp(self, message_ts: datetime.datetime):
        """Return epoch milliseconds, or 0 when ``send_timestamp`` is off."""
        if not self.config.send_timestamp:
            return 0
        return int(message_ts.timestamp() * 1000)

    @staticmethod
    def _io_to_confluent_headers(
        headers: List[MessageHeader]
    ) -> Optional[List[Tuple[str, Optional[bytes]]]]:
        """Convert headers to ``(key, utf-8 bytes or None)`` tuples.

        Returns ``None`` when there are no headers.
        """
        if not headers:
            return None
        return [
            (header.key,
             header.value.encode("utf-8") if header.value is not None else None)
            for header in headers
        ]

    def read_message(self) -> Union[BinaryMessage, StreamEvent]:
        """Return the next message or an end-of-partition event.

        Polls until something arrives. When a poll times out and every
        partition has already reported its end, a ``TemporaryEndOfPartition``
        for all partitions is returned instead of blocking further.

        :raises EsqueIOHandlerReadException: if the consumer reports an error
            other than end-of-partition.
        """
        if not self._assignment_created:
            self._assign()

        consumer = self._get_consumer()
        consumed_message: Optional[Message] = None
        while consumed_message is None:
            consumed_message = consumer.poll(timeout=0.1)
            if consumed_message is None and all(self._eof_reached.values()):
                return TemporaryEndOfPartition(
                    "Reached end of all partitions",
                    partition=EndOfStream.ALL_PARTITIONS)

        error = consumed_message.error()
        if error is not None:
            if error.code() == KafkaError._PARTITION_EOF:
                self._eof_reached[consumed_message.partition()] = True
                return TemporaryEndOfPartition(
                    "Reached end of partition",
                    partition=consumed_message.partition())
            # Any other error message carries no payload; surfacing it is
            # better than handing the caller a bogus BinaryMessage.
            raise EsqueIOHandlerReadException(
                f"Error while reading from topic "
                f"{self.config.topic_name!r}: {error}")

        self._eof_reached[consumed_message.partition()] = False
        return self._confluent_to_binary_message(consumed_message)

    def _confluent_to_binary_message(
            self, consumed_message: Message) -> BinaryMessage:
        """Copy a confluent ``Message`` into a ``BinaryMessage``."""
        return BinaryMessage(
            key=consumed_message.key(),
            value=consumed_message.value(),
            partition=consumed_message.partition(),
            offset=consumed_message.offset(),
            timestamp=self._confluent_to_io_timestamp(consumed_message),
            headers=self._confluent_to_io_headers(consumed_message.headers()),
        )

    @staticmethod
    def _confluent_to_io_timestamp(
            consumed_message: Message) -> datetime.datetime:
        """Convert the message's millisecond timestamp to an aware UTC datetime."""
        return datetime.datetime.fromtimestamp(
            consumed_message.timestamp()[1] / 1000, tz=datetime.timezone.utc)

    @staticmethod
    def _confluent_to_io_headers(
        confluent_headers: Optional[List[Tuple[str, Optional[bytes]]]]
    ) -> List[MessageHeader]:
        """Convert confluent header tuples to ``MessageHeader`` objects.

        Header values are decoded as UTF-8; ``None`` values stay ``None``.
        """
        if confluent_headers is None:
            return []
        return [
            MessageHeader(key,
                          value.decode("utf-8") if value is not None else None)
            for key, value in confluent_headers
        ]

    def message_stream(self) -> Iterable[Union[BinaryMessage, StreamEvent]]:
        """Yield the results of :meth:`read_message` forever."""
        while True:
            yield self.read_message()

    def seek(self, position: int) -> None:
        """Remember the start offset used when partitions get assigned.

        The position is only applied by the next assignment, i.e. before the
        first read (or the first read after :meth:`close`).
        """
        self._seek = position

    def _assign(self) -> None:
        """Assign all partitions of the topic at the remembered position."""
        # The consumer must exist first: _get_consumer() is what fills
        # _high_watermarks and _eof_reached, which are read just below.
        consumer = self._get_consumer()
        topic_name = self.config.topic_name

        if self._seek == self.OFFSET_AT_LAST_MESSAGE:
            assignment = [
                TopicPartition(topic=topic_name,
                               partition=partition_id,
                               offset=high_watermark - 1)
                for partition_id, high_watermark in
                self._high_watermarks.items()
            ]
        else:
            assignment = [
                TopicPartition(topic=topic_name,
                               partition=partition_id,
                               offset=self._seek)
                for partition_id in self._eof_reached
            ]

        consumer.assign(assignment)
        # Only mark the assignment as done once it actually succeeded.
        self._assignment_created = True

    def close(self) -> None:
        """Close the consumer and flush the producer, dropping both."""
        if self._consumer is not None:
            self._consumer.close()
            self._consumer = None
            # A consumer created after this point starts unassigned, so the
            # next read has to assign the partitions again.
            self._assignment_created = False
        if self._producer is not None:
            self._producer.flush()
            self._producer = None
def get_last_n_messages(
        self, n: int) -> Optional[List[Tuple[datetime.datetime, Dict]]]:
    '''
    Returns the last n published timestamps and messages or None, if no message has been published yet.
    If the configured topic has more than one partition, you will receive more messages than requested
    (at most partitions * n). You might receive less messages than requested, if the broker has cleared messages.

    Messages that cannot be interpreted (consumer errors, payloads that are not JSON objects, missing or
    malformed "time"/"value" fields) are logged and skipped.

    :param n: Number of messages to fetch per partition
    :return: List of tuples with timestamp and message or None if no message has been published yet
    '''
    consumer = Consumer({
        'bootstrap.servers': self.__kafka_bootstrap,
        'group.id': self.__import_id
    })
    # The consumer holds broker connections: close it on every exit path,
    # including the early "no data" return and any exception.
    try:
        partitions = consumer.list_topics(topic=self.__kafka_topic).topics[
            self.__kafka_topic].partitions.keys()
        self.__logger.debug("Found " + str(len(partitions)) +
                            " partition(s) of topic " + self.__kafka_topic)

        num_messages = 0
        topic_partitions = []
        for partition_id in partitions:
            low_offset, high_offset = consumer.get_watermark_offsets(
                cimpl.TopicPartition(self.__kafka_topic,
                                     partition=partition_id))
            available_messages = high_offset - low_offset
            self.__logger.debug("Low/High offset of partition " +
                                str(partition_id) + " is " + str(low_offset) +
                                "/" + str(high_offset))
            if high_offset <= 0:  # Ignore partitions without data
                continue

            if available_messages >= n:
                offset = high_offset - n
                num_messages += n
            else:
                # Fewer than n messages are left: start at the oldest one.
                offset = low_offset
                num_messages += available_messages
            topic_partition = cimpl.TopicPartition(self.__kafka_topic,
                                                   partition=partition_id,
                                                   offset=offset)
            topic_partitions.append(topic_partition)
            self.__logger.debug("Setting offset of partition " +
                                str(topic_partition))

        if not topic_partitions:  # No partition has any data
            return None

        consumer.assign(topic_partitions)
        consumer.commit(offsets=topic_partitions)

        tuples = []
        consumed_messages = 0
        batch_size = 10000
        self.__logger.debug("Consuming last " + str(num_messages) +
                            " message(s)")
        while consumed_messages < num_messages:
            to_consume = min(batch_size, num_messages - consumed_messages)
            # Count what was requested, not what arrived, so the loop ends
            # even if a batch times out with fewer messages.
            consumed_messages += to_consume
            self.__logger.debug("Consuming batch of " + str(to_consume) +
                                " messages")
            msgs = consumer.consume(num_messages=to_consume, timeout=30)
            for msg in msgs:
                if msg.error():
                    self.__logger.warning("Ignoring consumer error: " +
                                          str(msg.error()))
                    continue
                try:
                    value = json.loads(msg.value())
                except (ValueError, TypeError):
                    self.__logger.warning(
                        "message is not valid JSON, is someone else using this topic? Ignoring "
                        "message")
                    continue
                if not isinstance(value, dict) or 'time' not in value:
                    self.__logger.warning(
                        "time field missing in message, is someone else using this topic? Ignoring "
                        "message")
                    continue
                if 'value' not in value or not isinstance(
                        value['value'], Dict):
                    self.__logger.warning(
                        "value field missing or malformed in message, is someone else using this topic? "
                        "Ignoring message")
                    continue
                try:
                    date_time = datetime.datetime.strptime(
                        value["time"], "%Y-%m-%dT%H:%M:%SZ")
                except (ValueError, TypeError):
                    self.__logger.warning(
                        "time field not in rfc3339 format, is someone else using this topic? Ignoring "
                        "message")
                    continue
                tuples.append((date_time, value["value"]))
        return tuples
    finally:
        consumer.close()
def test_basic_api():
    """Exercise the basic Consumer API without a broker.

    No broker is configured, so none of these calls do any real work. The
    test only checks that the calls are accepted, that argument validation
    works, and that operations which need a broker fail with one of the
    expected error codes.
    """
    try:
        kc = Consumer()
    except TypeError as e:
        assert str(e) == "expected configuration dict"

    def dummy_commit_cb(err, partitions):
        pass

    kc = Consumer({'group.id': 'test',
                   'socket.timeout.ms': '100',
                   'session.timeout.ms': 1000,  # Avoid close() blocking too long
                   'on_commit': dummy_commit_cb})

    kc.subscribe(["test"])
    kc.unsubscribe()

    def dummy_assign_revoke(consumer, partitions):
        pass

    kc.subscribe(["test"], on_assign=dummy_assign_revoke,
                 on_revoke=dummy_assign_revoke)
    kc.unsubscribe()

    msg = kc.poll(timeout=0.001)
    if msg is None:
        print('OK: poll() timeout')
    elif msg.error():
        print('OK: consumer error: %s' % msg.error().str())
    else:
        print('OK: consumed message')

    if msg is not None:
        assert msg.timestamp() == (TIMESTAMP_NOT_AVAILABLE, -1)

    msglist = kc.consume(num_messages=10, timeout=0.001)
    assert len(msglist) == 0, "expected 0 messages, not %d" % len(msglist)

    # num_messages is range-checked before anything is consumed.
    with pytest.raises(ValueError) as ex:
        kc.consume(-100)
    assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value)

    with pytest.raises(ValueError) as ex:
        kc.consume(1000001)
    assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value)

    partitions = [TopicPartition("test", part) for part in range(0, 100, 3)]
    kc.assign(partitions)

    with pytest.raises(KafkaException) as ex:
        kc.seek(TopicPartition("test", 0, 123))
    assert 'Erroneous state' in str(ex.value)

    # Verify assignment
    assignment = kc.assignment()
    assert partitions == assignment

    # Pause partitions
    kc.pause(partitions)

    # Resume partitions
    kc.resume(partitions)

    # Get cached watermarks, should all be invalid.
    lo, hi = kc.get_watermark_offsets(partitions[0], cached=True)
    assert lo == -1001 and hi == -1001
    assert lo == OFFSET_INVALID and hi == OFFSET_INVALID

    # Query broker for watermarks, should raise an exception.
    try:
        lo, hi = kc.get_watermark_offsets(partitions[0], timeout=0.5,
                                          cached=False)
    except KafkaException as e:
        # e.args is a tuple: index it (calling it raised TypeError and hid
        # the real assertion failure).
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._WAIT_COORD,
                                    KafkaError.LEADER_NOT_AVAILABLE), \
            str(e.args[0])

    kc.unassign()

    kc.commit(asynchronous=True)

    try:
        kc.commit(asynchronous=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._NO_OFFSET)

    # Get current position, should all be invalid.
    kc.position(partitions)
    assert len([p for p in partitions
                if p.offset == OFFSET_INVALID]) == len(partitions)

    try:
        kc.committed(partitions, timeout=0.001)
    except KafkaException as e:
        assert e.args[0].code() == KafkaError._TIMED_OUT

    try:
        kc.list_topics(timeout=0.2)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._TRANSPORT)

    try:
        kc.list_topics(topic="hi", timeout=0.1)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._TRANSPORT)

    kc.close()
'auto.offset.reset': 'smallest' } })

# Consume the whole topic.
# subscribe and assign cannot be used together. subscribe subscribes to the
# topic and starts consuming from the offset recorded by Kafka; assign starts
# consuming from the specified offset.
consumer.subscribe(['test'])

# Consume one or several specific partitions of the topic.
consumer.assign([TopicPartition('test', 4)])

# Reset the offset.
consumer.assign([TopicPartition('test', 4, 2)])

# Get the lowest and highest offset of a partition.
consumer.get_watermark_offsets(TopicPartition('test', 4))  # (0, 19)

# With a new group.id, one message has to be consumed first for the later
# offset reset to take effect. Without consuming, the offset read both before
# and after the reset is -1001.
# Get the current offset position.
consumer.position([TopicPartition('test', 3)])

# Reset the offset to an arbitrary position. The committed offset determines
# the position after the next connection (per group); it has no effect on the
# current connection, whose position is determined by position.
# After resetting the offset, close and reconnect for it to take effect.
# position determines the offset of the current connection; change it with
# seek().
consumer.seek(TopicPartition('test', 3, 1))
consumer.commit(offsets=[TopicPartition('test', 3, 7)])

# Check the position that was reset.
msg = consumer.committed([TopicPartition('test', 3)])
print(msg)
if len(argv) > 1 and argv[1] == "global": pfile_name = argv[0] option = argv[1] p_config = producer_global # default option for producer c_config = consumer_global # default option for consumer topic = global_topic # Kafka Producer p = Producer(p_config) # Kafka Consumer c = Consumer(c_config) c.subscribe([topic]) low, high = c.get_watermark_offsets(TopicPartition(topic, partition=0)) print("low offset: ", low) print("high offset: ", high) c.assign([TopicPartition(topic, partition=0, offset=high)]) while True: msg = c.poll(1.0) if msg is None: continue if msg.error(): print("Consumer error: {}".format(msg.error())) continue if msg.key() is None:
class KafkaConsumer(Consumer[TPayload]):
    """
    The behavior of this consumer differs slightly from the Confluent
    consumer during rebalancing operations. Whenever a partition is assigned
    to this consumer, offsets are *always* automatically reset to the
    committed offset for that partition (or if no offsets have been committed
    for that partition, the offset is reset in accordance with the
    ``auto.offset.reset`` configuration value.) This causes partitions that
    are maintained across a rebalance to have the same offset management
    behavior as a partition that is moved from one consumer to another. To
    prevent uncommitted messages from being consumed multiple times,
    ``commit`` should be called in the partition revocation callback.

    The behavior of ``auto.offset.reset`` also differs slightly from the
    Confluent consumer as well: offsets are only reset during initial
    assignment or subsequent rebalancing operations. Any other circumstances
    that would otherwise lead to preemptive offset reset (e.g. the consumer
    tries to read a message that is before the earliest offset, or the
    consumer attempts to read a message that is after the latest offset) will
    cause an exception to be thrown, rather than resetting the offset, as
    this could lead to chunks of messages being replayed or skipped,
    depending on the circumstances. This also means that if the committed
    offset is no longer available (such as when reading older messages from
    the log and those messages expire, or reading newer messages from the log
    and the leader crashes and partition ownership fails over to an
    out-of-date replica), the consumer will fail-stop rather than reset to
    the value of ``auto.offset.reset``.
    """

    # Set of logical offsets that do not correspond to actual log positions.
    # These offsets should be considered an implementation detail of the Kafka
    # consumer and not used publicly.
    # https://github.com/confluentinc/confluent-kafka-python/blob/443177e1c83d9b66ce30f5eb8775e062453a738b/tests/test_enums.py#L22-L25
    LOGICAL_OFFSETS = frozenset(
        [OFFSET_BEGINNING, OFFSET_END, OFFSET_STORED, OFFSET_INVALID])

    def __init__(
        self,
        configuration: Mapping[str, Any],
        codec: Codec[KafkaPayload, TPayload],
        *,
        commit_retry_policy: Optional[RetryPolicy] = None,
    ) -> None:
        """
        Create the consumer from a Confluent-style configuration mapping.

        ``auto.offset.reset`` (default ``"largest"``) selects how a partition
        without a committed offset gets its starting offset: the low
        watermark, the high watermark, or an error. ``enable.auto.commit``
        and ``enable.auto.offset.store`` must both be explicitly disabled (as
        interpreted by ``as_kafka_configuration_bool``), since both default
        to ``"true"`` here.

        ``codec`` is used to decode each consumed payload.
        ``commit_retry_policy`` wraps the commit performed by
        ``commit_offsets``; when omitted, a ``NoRetryPolicy`` is used.

        Raises a ``ValueError`` for an unrecognized ``auto.offset.reset``
        value or when either of the two settings above is not disabled.
        """
        if commit_retry_policy is None:
            commit_retry_policy = NoRetryPolicy()

        auto_offset_reset = configuration.get("auto.offset.reset", "largest")
        if auto_offset_reset in {"smallest", "earliest", "beginning"}:
            self.__resolve_partition_starting_offset = (
                self.__resolve_partition_offset_earliest)
        elif auto_offset_reset in {"largest", "latest", "end"}:
            self.__resolve_partition_starting_offset = (
                self.__resolve_partition_offset_latest)
        elif auto_offset_reset == "error":
            self.__resolve_partition_starting_offset = (
                self.__resolve_partition_offset_error)
        else:
            raise ValueError(
                "invalid value for 'auto.offset.reset' configuration")

        if (as_kafka_configuration_bool(
                configuration.get("enable.auto.commit", "true"))
                is not False):
            raise ValueError(
                "invalid value for 'enable.auto.commit' configuration")

        if (as_kafka_configuration_bool(
                configuration.get("enable.auto.offset.store", "true"))
                is not False):
            raise ValueError(
                "invalid value for 'enable.auto.offset.store' configuration")

        # NOTE: Offsets are explicitly managed as part of the assignment
        # callback, so preemptively resetting offsets is not enabled.
        self.__consumer = ConfluentConsumer({
            **configuration, "auto.offset.reset": "error"
        })

        self.__codec = codec

        # Next offset to read, per assigned partition.
        self.__offsets: MutableMapping[Partition, int] = {}
        # Offsets staged by ``stage_offsets`` and not yet committed.
        self.__staged_offsets: MutableMapping[Partition, int] = {}
        # Partitions paused through ``pause`` and not yet resumed.
        self.__paused: Set[Partition] = set()

        self.__commit_retry_policy = commit_retry_policy

        self.__state = KafkaConsumerState.CONSUMING

    def __resolve_partition_offset_earliest(
            self,
            partition: ConfluentTopicPartition) -> ConfluentTopicPartition:
        """Return the partition positioned at its low watermark."""
        low, high = self.__consumer.get_watermark_offsets(partition)
        return ConfluentTopicPartition(partition.topic, partition.partition,
                                       low)

    def __resolve_partition_offset_latest(
            self,
            partition: ConfluentTopicPartition) -> ConfluentTopicPartition:
        """Return the partition positioned at its high watermark."""
        low, high = self.__consumer.get_watermark_offsets(partition)
        return ConfluentTopicPartition(partition.topic, partition.partition,
                                       high)

    def __resolve_partition_offset_error(
            self,
            partition: ConfluentTopicPartition) -> ConfluentTopicPartition:
        """Always raise: used when ``auto.offset.reset`` is ``"error"``."""
        raise ConsumerError("unable to resolve partition offsets")

    def subscribe(
        self,
        topics: Sequence[Topic],
        on_assign: Optional[Callable[[Mapping[Partition, int]], None]] = None,
        on_revoke: Optional[Callable[[Sequence[Partition]], None]] = None,
    ) -> None:
        """
        Subscribe to topics. This replaces a previous subscription.

        This method does not block. The subscription may not be fulfilled
        immediately: instead, the ``on_assign`` and ``on_revoke`` callbacks
        are called when the subscription state changes with the updated
        assignment for this consumer.

        If provided, the ``on_assign`` callback is called with a mapping of
        partitions to their offsets (at this point, the working offset and
        the committed offset are the same for each partition) on each
        subscription change. Similarly, the ``on_revoke`` callback (if
        provided) is called with a sequence of partitions that are being
        removed from this consumer's assignment. (This callback does not
        include the offsets, as the working offset and committed offset may
        differ, in some cases by substantial margin.)

        Raises an ``InvalidState`` exception if called on a closed consumer.
        """
        if self.__state is not KafkaConsumerState.CONSUMING:
            raise InvalidState(self.__state)

        def assignment_callback(
                consumer: ConfluentConsumer,
                partitions: Sequence[ConfluentTopicPartition]) -> None:
            # Resolves a starting offset for every assigned partition,
            # assigns them at those offsets, resumes them, and only then
            # invokes the user-supplied ``on_assign``.
            self.__state = KafkaConsumerState.ASSIGNING

            try:
                assignment: MutableSequence[ConfluentTopicPartition] = []

                for partition in self.__consumer.committed(partitions):
                    if partition.offset >= 0:
                        assignment.append(partition)
                    elif partition.offset == OFFSET_INVALID:
                        # No committed offset: fall back to the resolver
                        # selected from ``auto.offset.reset``.
                        assignment.append(
                            self.__resolve_partition_starting_offset(
                                partition))
                    else:
                        raise ValueError("received unexpected offset")

                offsets: MutableMapping[Partition, int] = {
                    Partition(Topic(i.topic), i.partition): i.offset
                    for i in assignment
                }
                self.__seek(offsets)

                # Ensure that all partitions are resumed on assignment to avoid
                # carrying over state from a previous assignment.
                self.__consumer.resume([
                    ConfluentTopicPartition(partition.topic.name,
                                            partition.index, offset)
                    for partition, offset in offsets.items()
                ])

                for partition in offsets:
                    self.__paused.discard(partition)
            except Exception:
                # A failed assignment leaves the consumer in the ERROR state.
                self.__state = KafkaConsumerState.ERROR
                raise

            try:
                if on_assign is not None:
                    on_assign(offsets)
            finally:
                self.__state = KafkaConsumerState.CONSUMING

        def revocation_callback(
                consumer: ConfluentConsumer,
                partitions: Sequence[ConfluentTopicPartition]) -> None:
            # Invokes the user-supplied ``on_revoke`` and then, whether or
            # not it raised, drops all local state for the revoked partitions.
            self.__state = KafkaConsumerState.REVOKING

            partitions = [
                Partition(Topic(i.topic), i.partition) for i in partitions
            ]

            try:
                if on_revoke is not None:
                    on_revoke(partitions)
            finally:
                for partition in partitions:
                    # Staged offsets are deleted during partition revocation to
                    # prevent later committing offsets for partitions that are
                    # no longer owned by this consumer.
                    if partition in self.__staged_offsets:
                        logger.warning(
                            "Dropping staged offset for revoked partition (%r)!",
                            partition,
                        )
                        del self.__staged_offsets[partition]

                    try:
                        self.__offsets.pop(partition)
                    except KeyError:
                        # If there was an error during assignment, this
                        # partition may have never been added to the offsets
                        # mapping.
                        logger.warning(
                            "failed to delete offset for unknown partition: %r",
                            partition,
                        )

                    self.__paused.discard(partition)

                self.__state = KafkaConsumerState.CONSUMING

        self.__consumer.subscribe(
            [topic.name for topic in topics],
            on_assign=assignment_callback,
            on_revoke=revocation_callback,
        )

    def unsubscribe(self) -> None:
        """
        Unsubscribe from topics.

        Raises an ``InvalidState`` exception if called on a closed consumer.
        """
        if self.__state is not KafkaConsumerState.CONSUMING:
            raise InvalidState(self.__state)

        self.__consumer.unsubscribe()

    def poll(self,
             timeout: Optional[float] = None) -> Optional[Message[TPayload]]:
        """
        Return the next message available to be consumed, if one is
        available. If no message is available, this method will block up to
        the ``timeout`` value before returning ``None``. A timeout of
        ``0.0`` represents "do not block", while a timeout of ``None``
        represents "block until a message is available (or forever)".

        Calling this method may also invoke subscription state change
        callbacks.

        This method may also raise an ``EndOfPartition`` error (a subtype of
        ``ConsumerError``) when the consumer has reached the end of a
        partition that it is subscribed to and no additional messages are
        available. The ``partition`` attribute of the raised exception
        specifies which partition's end has been reached. (Since this
        consumer is multiplexing a set of partitions, this exception does not
        mean that *all* of the partitions that the consumer is subscribed to
        do not have any messages, just that it has reached the end of one of
        them. This also does not mean that additional messages won't be
        available in future poll calls.) Not every backend implementation
        supports this feature or is configured to raise in this scenario.

        Raises an ``InvalidState`` exception if called on a closed consumer.

        Raises a ``TransportError`` for various other consumption-related
        errors.
        """
        if self.__state is not KafkaConsumerState.CONSUMING:
            raise InvalidState(self.__state)

        # The timeout is only passed positionally when one was given, so
        # ``None`` falls through to the Confluent consumer's own default.
        message: Optional[ConfluentMessage] = self.__consumer.poll(
            *[timeout] if timeout is not None else [])
        if message is None:
            return None

        error: Optional[KafkaError] = message.error()
        if error is not None:
            code = error.code()
            if code == KafkaError._PARTITION_EOF:
                raise EndOfPartition(
                    Partition(Topic(message.topic()), message.partition()),
                    message.offset(),
                )
            elif code == KafkaError._TRANSPORT:
                raise TransportError(str(error))
            else:
                raise ConsumerError(str(error))

        headers: Optional[Headers] = message.headers()
        result = Message(
            Partition(Topic(message.topic()), message.partition()),
            message.offset(),
            self.__codec.decode(
                KafkaPayload(
                    message.key(),
                    message.value(),
                    headers if headers is not None else [],
                )),
            # The second element of the timestamp tuple is divided by 1000
            # here, i.e. it is treated as milliseconds.
            datetime.utcfromtimestamp(message.timestamp()[1] / 1000.0),
        )

        # Track the position following the message that was just returned.
        self.__offsets[result.partition] = result.get_next_offset()

        return result

    def tell(self) -> Mapping[Partition, int]:
        """
        Return the read offsets for all assigned partitions.

        Raises an ``InvalidState`` if called on a closed consumer.
        """
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        # NOTE(review): this returns the internal mapping itself, not a copy.
        return self.__offsets

    def __validate_offsets(self, offsets: Mapping[Partition, int]) -> None:
        """Raise a ``ConsumerError`` if any of the offsets is negative."""
        invalid_offsets: Mapping[Partition, int] = {
            partition: offset
            for partition, offset in offsets.items() if offset < 0
        }

        if invalid_offsets:
            raise ConsumerError(f"invalid offsets: {invalid_offsets!r}")

    def __seek(self, offsets: Mapping[Partition, int]) -> None:
        """
        Move the read position of the given partitions and record the new
        offsets locally. Unlike ``seek``, this performs no state or
        assignment checks beyond rejecting negative offsets.
        """
        self.__validate_offsets(offsets)

        if self.__state is KafkaConsumerState.ASSIGNING:
            # Calling ``seek`` on the Confluent consumer from an assignment
            # callback will throw an "Erroneous state" error. Instead,
            # partition offsets have to be initialized by calling ``assign``.
            self.__consumer.assign([
                ConfluentTopicPartition(partition.topic.name, partition.index,
                                        offset)
                for partition, offset in offsets.items()
            ])
        else:
            for partition, offset in offsets.items():
                self.__consumer.seek(
                    ConfluentTopicPartition(partition.topic.name,
                                            partition.index, offset))

        self.__offsets.update(offsets)

    def seek(self, offsets: Mapping[Partition, int]) -> None:
        """
        Change the read offsets for the provided partitions.

        Raises an ``InvalidState`` if called on a closed consumer.

        Raises a ``ConsumerError`` if any partition is not currently assigned
        or any offset is negative.
        """
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        if offsets.keys() - self.__offsets.keys():
            raise ConsumerError("cannot seek on unassigned partitions")

        self.__seek(offsets)

    def pause(self, partitions: Sequence[Partition]) -> None:
        """
        Pause the consumption of messages for the provided partitions.

        Raises an ``InvalidState`` if called on a closed consumer.

        Raises a ``ConsumerError`` if any partition is not currently
        assigned.
        """
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        if set(partitions) - self.__offsets.keys():
            raise ConsumerError("cannot pause unassigned partitions")

        self.__consumer.pause([
            ConfluentTopicPartition(partition.topic.name, partition.index)
            for partition in partitions
        ])

        self.__paused.update(partitions)

        # XXX: Seeking to a specific partition offset and immediately pausing
        # that partition causes the seek to be ignored for some reason.
        # The tracked offsets are therefore re-applied after pausing.
        self.seek({
            partition: offset
            for partition, offset in self.__offsets.items()
            if partition in partitions
        })

    def resume(self, partitions: Sequence[Partition]) -> None:
        """
        Resume the consumption of messages for the provided partitions.

        Raises an ``InvalidState`` if called on a closed consumer.

        Raises a ``ConsumerError`` if any partition is not currently
        assigned.
        """
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        if set(partitions) - self.__offsets.keys():
            raise ConsumerError("cannot resume unassigned partitions")

        self.__consumer.resume([
            ConfluentTopicPartition(partition.topic.name, partition.index)
            for partition in partitions
        ])

        for partition in partitions:
            self.__paused.discard(partition)

    def paused(self) -> Sequence[Partition]:
        """
        Return the partitions that are currently paused, as a new list.

        Raises an ``InvalidState`` if called on a closed consumer.
        """
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        return [*self.__paused]

    def stage_offsets(self, offsets: Mapping[Partition, int]) -> None:
        """
        Record offsets to be committed by the next ``commit_offsets`` call.
        Staging an offset for a partition replaces any offset previously
        staged for it.

        Raises an ``InvalidState`` if called on a closed consumer.

        Raises a ``ConsumerError`` if any partition is not currently assigned
        or any offset is negative.
        """
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        if offsets.keys() - self.__offsets.keys():
            raise ConsumerError(
                "cannot stage offsets for unassigned partitions")

        self.__validate_offsets(offsets)

        # TODO: Maybe log a warning if these offsets exceed the current
        # offsets, since that's probably a side effect of an incorrect usage
        # pattern?
        self.__staged_offsets.update(offsets)

    def __commit(self) -> Mapping[Partition, int]:
        """
        Synchronously commit all staged offsets, clear the staging area, and
        return the committed offsets per partition. With nothing staged, no
        commit is issued and an empty mapping is returned.

        Raises an ``InvalidState`` if called on a closed consumer.
        """
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        result: Optional[Sequence[ConfluentTopicPartition]]

        if self.__staged_offsets:
            result = self.__consumer.commit(
                offsets=[
                    ConfluentTopicPartition(partition.topic.name,
                                            partition.index, offset)
                    for partition, offset in self.__staged_offsets.items()
                ],
                asynchronous=False,
            )
        else:
            result = []

        assert result is not None  # synchronous commit should return result immediately

        self.__staged_offsets.clear()

        offsets: MutableMapping[Partition, int] = {}

        for value in result:
            # The Confluent Kafka Consumer will include logical offsets in the
            # sequence of ``Partition`` objects returned by ``commit``. These
            # are an implementation detail of the Kafka Consumer, so we don't
            # expose them here.
            # NOTE: These should no longer be seen now that we are forcing
            # offsets to be set as part of the assignment callback.
            if value.offset in self.LOGICAL_OFFSETS:
                continue

            assert value.offset >= 0, "expected non-negative offset"
            offsets[Partition(Topic(value.topic),
                              value.partition)] = value.offset

        return offsets

    def commit_offsets(self) -> Mapping[Partition, int]:
        """
        Commit staged offsets for all partitions that this consumer is
        assigned to. The return value of this method is a mapping of
        partitions with their committed offsets as values.

        The commit is executed through the configured commit retry policy.

        Raises an ``InvalidState`` if called on a closed consumer.
        """
        return self.__commit_retry_policy.call(self.__commit)

    def close(self, timeout: Optional[float] = None) -> None:
        """
        Close the consumer. This stops consuming messages, *may* commit
        staged offsets (depending on the configuration), and ends its
        subscription.

        Raises a ``InvalidState`` if the consumer is unable to be closed
        before the timeout is reached.
        """
        # NOTE(review): ``timeout`` is not used by this implementation, and
        # nothing in this method raises ``InvalidState`` — confirm whether
        # the docstring or the code is the intended contract.
        try:
            self.__consumer.close()
        except RuntimeError:
            # Presumably raised when the underlying consumer has already been
            # closed; ignored so that closing stays best-effort — TODO confirm.
            pass

        self.__state = KafkaConsumerState.CLOSED

    @property
    def closed(self) -> bool:
        """Whether ``close`` has been called on this consumer."""
        return self.__state is KafkaConsumerState.CLOSED
class ConsoleConsumer:
    """
    Print the records of a Kafka topic to stdout as JSON, one per line,
    and stop once every partition has reported end-of-partition.
    """

    def __init__(self, brokers, topic, offset, key_decoder, value_decoder,
                 registry_url, additional_properties):
        """
        :param brokers: value used for ``bootstrap.servers``.
        :param topic: name of the topic to read.
        :param offset: ``'earliest'`` (case-insensitive) or a string holding
            an integer. Non-negative values are relative to the low
            watermark, negative values are relative to the high watermark.
        :param key_decoder: decoder name for keys; ``'avro'`` selects the
            Avro serializer, anything else is decoded as UTF-8 text/JSON.
        :param value_decoder: decoder name for values, same rules as
            ``key_decoder``.
        :param registry_url: schema registry URL; when falsy no Avro
            serializer is created.
        :param additional_properties: extra consumer properties. The fixed
            settings below take precedence over them.
        """
        config = {
            'bootstrap.servers': brokers,
            'enable.partition.eof': 'true',
            'group.id': 'not-used',
            'auto.offset.reset': 'earliest',
            'enable.auto.commit': 'false'
        }
        # ``config`` is unpacked last so its keys override user properties.
        self.consumer = Consumer({**additional_properties, **config})
        self.topic = topic
        self.offset = offset.lower()
        self.key_decoder = key_decoder.lower()
        self.value_decoder = value_decoder.lower()
        self.avro_serializer = None
        if registry_url:
            client = CachedSchemaRegistryClient(registry_url)
            self.avro_serializer = MessageSerializer(client)

    def run(self):
        """
        Assign all partitions of the topic and print records until each
        partition has reached its end. The consumer is always closed on
        exit.

        :raises KafkaException: for any message error other than
            end-of-partition.
        """
        try:
            total_parts, partitions = self._partitions()
            if not partitions:
                # Without this the loop below has nothing to wait for.
                eprint(f'{self.topic} has no partitions to read')
                return
            self.consumer.assign(partitions)

            # Track *which* partitions finished rather than counting EOF
            # events: a partition can report EOF more than once, and a
            # plain counter would then stop before the others are done.
            finished = set()
            while len(finished) < total_parts:
                msg = self.consumer.poll(timeout=0.5)
                if msg is None:
                    continue

                error = msg.error()
                if error:
                    if error.code() != KafkaError._PARTITION_EOF:
                        raise KafkaException(error)
                    eprint(
                        f'{msg.topic()} reached end of partition [{msg.partition()}] at offset {msg.offset()}'
                    )
                    finished.add(msg.partition())
                    continue

                record = {
                    'key': self._decode(self.key_decoder, msg.key()),
                    'payload': self._decode(self.value_decoder, msg.value()),
                    'topic': msg.topic(),
                    'partition': msg.partition(),
                    'offset': msg.offset(),
                    'timestamp': msg.timestamp()[1]
                }
                print(json.dumps(record))
        finally:
            self.consumer.close()

    def _partitions(self):
        """
        Build the partition assignment for the configured starting offset.

        :return: ``(total_parts, parts)`` where ``parts`` holds one
            ``TopicPartition`` per partition of the topic.
        :raises SystemExit: with status 1 when the offset is neither
            ``'earliest'`` nor an integer.
        :raises RuntimeError: when the watermark offsets of a partition
            could not be fetched before the timeout.
        """
        topic_data = self.consumer.list_topics(topic=self.topic)
        total_parts = len(topic_data.topics[self.topic].partitions)

        if self.offset == 'earliest':
            parts = [
                TopicPartition(self.topic, i, offset=OFFSET_BEGINNING)
                for i in range(total_parts)
            ]
            return total_parts, parts

        # Parse once, up front, and keep the ``try`` limited to the parse so
        # that errors from the broker calls are not reported as parse errors.
        try:
            real_offset = int(self.offset)
        except ValueError:
            eprint(f"Could not parse offset: {self.offset}")
            raise SystemExit(1)

        parts = []
        for i in range(total_parts):
            watermarks = self.consumer.get_watermark_offsets(
                TopicPartition(self.topic, i, offset=OFFSET_BEGINNING),
                timeout=0.5)
            if watermarks is None:
                # The client returns None instead of a tuple on timeout.
                raise RuntimeError(
                    f"Timed out fetching watermark offsets for {self.topic} [{i}]"
                )
            start, end = watermarks
            if real_offset < 0:
                # Clamp to the low watermark: a negative absolute offset
                # would be interpreted as a logical offset by the client.
                ass_offset = max(start, end + real_offset)
            else:
                ass_offset = start + real_offset
            parts.append(TopicPartition(self.topic, i, offset=ass_offset))
        return total_parts, parts

    def _decode(self, data_type, payload):
        """
        Decode a raw key or value.

        :param data_type: ``"avro"`` to use the Avro serializer; any other
            value decodes the payload as UTF-8.
        :param payload: raw bytes, or ``None`` for a missing key/value.
        :return: ``None`` for a ``None`` payload; the Avro-decoded object
            for ``"avro"``; otherwise the parsed JSON value, or the decoded
            text when it is not valid JSON.
        """
        # Messages may carry no key or no value at all.
        if payload is None:
            return None
        if data_type == "avro":
            return self.avro_serializer.decode_message(payload)
        payload_str = payload.decode('utf-8')
        try:
            return json.loads(payload_str)
        except (JSONDecodeError, TypeError):
            return payload_str
class KafkaConsumer(Consumer[TopicPartition, int, bytes]):
    """
    Consumer backed by the Confluent consumer, with stricter offset
    handling around rebalances.

    Whenever a partition is assigned to this consumer, its offset is
    *always* reset to the committed offset for that partition. If no offset
    has been committed, the starting offset is chosen according to the
    ``auto.offset.reset`` configuration value. As a result, a partition that
    is kept across a rebalance is managed exactly like a partition that
    moved here from another consumer. To avoid consuming uncommitted
    messages more than once, call ``commit`` from the partition revocation
    callback.

    ``auto.offset.reset`` is only honoured during the initial assignment and
    later rebalances. Any other situation that would normally trigger a
    preemptive reset (for example reading before the earliest offset or
    after the latest offset) raises an exception instead, because resetting
    could replay or skip chunks of messages. Consequently, if the committed
    offset is no longer available (expired messages, or failover to an
    out-of-date replica), the consumer fail-stops rather than resetting to
    the ``auto.offset.reset`` value.
    """

    # Logical offsets do not correspond to actual log positions. They are an
    # implementation detail of the Kafka consumer and must not be used
    # publicly.
    # https://github.com/confluentinc/confluent-kafka-python/blob/443177e1c83d9b66ce30f5eb8775e062453a738b/tests/test_enums.py#L22-L25
    LOGICAL_OFFSETS = frozenset(
        (OFFSET_BEGINNING, OFFSET_END, OFFSET_STORED, OFFSET_INVALID))

    def __init__(self, configuration: Mapping[str, Any]) -> None:
        """
        Build the consumer from a Confluent-style configuration mapping.

        Raises ``ValueError`` when ``auto.offset.reset`` holds an
        unrecognised value.
        """
        reset_policy = configuration.get("auto.offset.reset", "largest")

        resolvers = {
            "smallest": self.__resolve_partition_offset_earliest,
            "earliest": self.__resolve_partition_offset_earliest,
            "beginning": self.__resolve_partition_offset_earliest,
            "largest": self.__resolve_partition_offset_latest,
            "latest": self.__resolve_partition_offset_latest,
            "end": self.__resolve_partition_offset_latest,
            "error": self.__resolve_partition_offset_error,
        }
        resolver = resolvers.get(reset_policy)
        if resolver is None:
            raise ValueError(
                "invalid value for 'auto.offset.reset' configuration")
        self.__resolve_partition_starting_offset = resolver

        # NOTE: Starting offsets are resolved explicitly in the assignment
        # callback, so the underlying consumer must never reset on its own.
        underlying_configuration = {
            **configuration,
            "auto.offset.reset": "error",
        }
        self.__consumer = ConfluentConsumer(underlying_configuration)

        self.__offsets: MutableMapping[TopicPartition, int] = {}
        self.__state = KafkaConsumerState.CONSUMING

    def __ensure_consuming(self) -> None:
        """Raise ``InvalidState`` unless the consumer is consuming."""
        if self.__state is not KafkaConsumerState.CONSUMING:
            raise InvalidState(self.__state)

    def __ensure_usable(self) -> None:
        """Raise ``InvalidState`` if the consumer is closed or errored."""
        if self.__state in (KafkaConsumerState.CLOSED,
                            KafkaConsumerState.ERROR):
            raise InvalidState(self.__state)

    def __resolve_partition_offset_earliest(
            self,
            partition: ConfluentTopicPartition) -> ConfluentTopicPartition:
        """Return the partition positioned at its low watermark."""
        low, _ = self.__consumer.get_watermark_offsets(partition)
        return ConfluentTopicPartition(partition.topic, partition.partition,
                                       low)

    def __resolve_partition_offset_latest(
            self,
            partition: ConfluentTopicPartition) -> ConfluentTopicPartition:
        """Return the partition positioned at its high watermark."""
        _, high = self.__consumer.get_watermark_offsets(partition)
        return ConfluentTopicPartition(partition.topic, partition.partition,
                                       high)

    def __resolve_partition_offset_error(
            self,
            partition: ConfluentTopicPartition) -> ConfluentTopicPartition:
        """Refuse to pick a starting offset."""
        raise ConsumerError("unable to resolve partition offsets")

    def subscribe(
        self,
        topics: Sequence[str],
        on_assign: Optional[Callable[[Sequence[TopicPartition]], None]] = None,
        on_revoke: Optional[Callable[[Sequence[TopicPartition]], None]] = None,
    ) -> None:
        """
        Subscribe to ``topics``.

        ``on_assign`` is called with the assigned streams after their
        starting offsets have been set. ``on_revoke`` is called with the
        revoked streams before their tracked offsets are dropped.

        Raises ``InvalidState`` unless the consumer is consuming.
        """
        self.__ensure_consuming()

        def assignment_callback(
                consumer: ConfluentConsumer,
                partitions: Sequence[ConfluentTopicPartition]) -> None:
            self.__state = KafkaConsumerState.ASSIGNING

            try:
                starting_offsets: MutableMapping[TopicPartition, int] = {}
                for candidate in self.__consumer.committed(partitions):
                    if candidate.offset == OFFSET_INVALID:
                        # Nothing committed yet: apply the configured policy.
                        candidate = self.__resolve_partition_starting_offset(
                            candidate)
                    elif candidate.offset < 0:
                        raise ValueError("received unexpected offset")
                    stream = TopicPartition(candidate.topic,
                                            candidate.partition)
                    starting_offsets[stream] = candidate.offset

                self.__seek(starting_offsets)
            except Exception:
                self.__state = KafkaConsumerState.ERROR
                raise

            try:
                if on_assign is not None:
                    on_assign(list(starting_offsets))
            finally:
                self.__state = KafkaConsumerState.CONSUMING

        def revocation_callback(
                consumer: ConfluentConsumer,
                partitions: Sequence[ConfluentTopicPartition]) -> None:
            self.__state = KafkaConsumerState.REVOKING

            revoked = [
                TopicPartition(item.topic, item.partition)
                for item in partitions
            ]

            try:
                if on_revoke is not None:
                    on_revoke(revoked)
            finally:
                for stream in revoked:
                    if stream in self.__offsets:
                        del self.__offsets[stream]
                    else:
                        # If assignment failed, this stream may never have
                        # been added to the offsets mapping.
                        logger.warning(
                            "failed to delete offset for unknown stream: %r",
                            stream)

                self.__state = KafkaConsumerState.CONSUMING

        self.__consumer.subscribe(topics,
                                  on_assign=assignment_callback,
                                  on_revoke=revocation_callback)

    def unsubscribe(self) -> None:
        """
        Unsubscribe the underlying consumer.

        Raises ``InvalidState`` unless the consumer is consuming.
        """
        self.__ensure_consuming()
        self.__consumer.unsubscribe()

    def poll(self, timeout: Optional[float] = None) -> Optional[KafkaMessage]:
        """
        Fetch the next message, or ``None`` when the underlying poll
        returned nothing.

        Raises ``InvalidState`` unless the consumer is consuming,
        ``EndOfStream`` at the end of a partition, ``TransportError`` for
        transport failures and ``ConsumerError`` for any other error.
        """
        self.__ensure_consuming()

        message: Optional[ConfluentMessage]
        if timeout is None:
            message = self.__consumer.poll()
        else:
            message = self.__consumer.poll(timeout)

        if message is None:
            return None

        error: Optional[KafkaError] = message.error()
        if error is None:
            result = KafkaMessage(
                TopicPartition(message.topic(), message.partition()),
                message.offset(),
                message.value(),
            )
            # Remember where the next read of this stream will start.
            self.__offsets[result.stream] = result.get_next_offset()
            return result

        code = error.code()
        if code == KafkaError._PARTITION_EOF:
            raise EndOfStream(
                TopicPartition(message.topic(), message.partition()),
                message.offset(),
            )
        if code == KafkaError._TRANSPORT:
            raise TransportError(str(error))
        raise ConsumerError(str(error))

    def tell(self) -> Mapping[TopicPartition, int]:
        """
        Return the tracked offsets of the assigned streams.

        Raises ``InvalidState`` if the consumer is closed or errored.
        """
        self.__ensure_usable()
        return self.__offsets

    def __seek(self, offsets: Mapping[TopicPartition, int]) -> None:
        """Move the underlying consumer to ``offsets`` and record them."""
        targets = (ConfluentTopicPartition(stream.topic, stream.partition,
                                           offset)
                   for stream, offset in offsets.items())

        if self.__state is KafkaConsumerState.ASSIGNING:
            # Calling ``seek`` on the Confluent consumer from an assignment
            # callback throws an "Erroneous state" error. Partition offsets
            # have to be initialized by calling ``assign`` instead.
            self.__consumer.assign(list(targets))
        else:
            for target in targets:
                self.__consumer.seek(target)

        self.__offsets.update(offsets)

    def seek(self, offsets: Mapping[TopicPartition, int]) -> None:
        """
        Change the read position of already-assigned streams.

        Raises ``InvalidState`` if the consumer is closed or errored, and
        ``ConsumerError`` if any stream in ``offsets`` is not assigned.
        """
        self.__ensure_usable()

        if any(stream not in self.__offsets for stream in offsets):
            raise ConsumerError("cannot seek on unassigned streams")

        self.__seek(offsets)

    def commit(self) -> Mapping[TopicPartition, int]:
        """
        Synchronously commit offsets and return the committed offsets by
        stream, leaving out logical offsets.

        Timeouts and coordinator-related failures are retried up to three
        times, sleeping one second between attempts; any other
        ``KafkaException`` is re-raised immediately.

        Raises ``InvalidState`` if the consumer is closed or errored.
        """
        self.__ensure_usable()

        committed: Optional[Sequence[ConfluentTopicPartition]] = None
        attempts_left = 3

        while committed is None:
            try:
                committed = self.__consumer.commit(asynchronous=False)
                assert committed is not None
            except KafkaException as error:
                retryable = error.args[0].code() in (
                    KafkaError.REQUEST_TIMED_OUT,
                    KafkaError.NOT_COORDINATOR_FOR_GROUP,
                    KafkaError._WAIT_COORD,
                )
                if not retryable or not attempts_left:
                    raise

                logger.warning(
                    "Commit failed: %s (%d retries remaining)",
                    str(error),
                    attempts_left,
                )
                attempts_left -= 1
                time.sleep(1)

        result: MutableMapping[TopicPartition, int] = {}

        for entry in committed:
            # The Confluent consumer may include logical offsets in the
            # sequence returned by ``commit``. They are an implementation
            # detail, so they are not exposed here.
            # NOTE: These should no longer be seen now that offsets are
            # forced to be set as part of the assignment callback.
            if entry.offset not in self.LOGICAL_OFFSETS:
                assert entry.offset >= 0, "expected non-negative offset"
                result[TopicPartition(entry.topic,
                                      entry.partition)] = entry.offset

        return result

    def close(self, timeout: Optional[float] = None) -> None:
        """
        Close the underlying consumer and mark this consumer as closed.

        A ``RuntimeError`` from the underlying ``close`` call is suppressed.

        NOTE(review): ``timeout`` is accepted but not used here -- confirm
        whether that is intentional.
        """
        try:
            self.__consumer.close()
        except RuntimeError:
            # Best effort: the state below is updated regardless.
            pass

        self.__state = KafkaConsumerState.CLOSED