class IBUSStreamingDownsamplingConsumer:
    """Forward records from a Kafka topic to a single TCP client in batches.

    A TCP server is started on ``tcpHost:tcpPort``. At most one client is
    served at a time; while it is connected, the Kafka topic is polled, every
    polled record value is written to the client, the offsets are committed
    and the loop then sleeps for ``interval`` seconds. Life-cycle events
    (running / accepted / refused / closed / shutdown) are published to
    ``logTopic``.
    """

    LOG_FORMAT = "{} UTC_TS\t{}"

    def __init__(self, kafkaHost, kafkaPort, tcpHost, tcpPort, group_id,
                 topic, logTopic, interval):
        """Create the Kafka consumer/producer and remember the TCP endpoint.

        Args:
            kafkaHost: Host of the Kafka bootstrap server.
            kafkaPort: Port of the Kafka bootstrap server.
            tcpHost: Interface the TCP server binds to.
            tcpPort: Port the TCP server listens on.
            group_id: Kafka consumer group id.
            topic: Topic whose records are forwarded to the TCP client.
            logTopic: Topic that receives this object's log messages.
            interval: Pause between two polls; converted with ``int()`` and
                passed to ``asyncio.sleep``.
        """
        self.kafkaHost = kafkaHost
        self.kafkaPort = kafkaPort
        self.tcpHost = tcpHost
        self.tcpPort = tcpPort
        self.group_id = group_id
        self.topic = topic
        self.logTopic = logTopic
        self.interval = int(interval)
        self.consumer = KafkaConsumer(
            topic,
            bootstrap_servers=["{}:{}".format(kafkaHost, kafkaPort)],
            group_id=group_id,
            enable_auto_commit=False)
        self.producer = KafkaProducer(
            bootstrap_servers=["{}:{}".format(kafkaHost, kafkaPort)])
        # Writer of the currently connected client, or None when idle.
        self.tcpWriter = None

    def getTopicPartitions(self):
        """Return a ``TopicPartition`` for every partition of ``self.topic``.

        Returns an empty list when the consumer reports no partitions for
        the topic (``partitions_for_topic`` yields ``None`` in that case).
        """
        # This ensures the local cache is updated with information about
        # partitions, offsets etc.
        self.consumer.topics()
        pids = self.consumer.partitions_for_topic(self.topic)
        if not pids:
            return []
        return [TopicPartition(self.topic, pid) for pid in pids]

    def getTopicPartitionsCommittedPositions(self):
        """Return ``(TopicPartition, committed_offset)`` for each partition.

        The offset is whatever ``consumer.committed`` returns, which may be
        ``None`` when the group has not committed for that partition yet.
        """
        return [(tp, self.consumer.committed(tp))
                for tp in self.getTopicPartitions()]

    async def tcp_server_handler(self, reader, writer):
        """Serve one TCP client; refuse it if another client is active.

        While the client is connected a background task forwards Kafka
        records to it. Incoming bytes are read only to detect that the peer
        closed the connection, and are discarded.
        """
        addr = str(writer.get_extra_info("socket").getpeername())
        if self.tcpWriter is not None:
            self.log("refused " + addr)
            writer.write(b"Connection limit reached; connection refused.")
            writer.close()
            return
        self.log("accepted " + addr)
        self.tcpWriter = writer
        t1 = asyncio.create_task(self.poll_from_Kafka(writer))
        try:
            while True:
                # The payload is ignored; an empty read means EOF.
                data = await reader.read(1024 * 16)
                if not data:
                    break
        except ConnectionError:
            # Catches "connection reset by peer" (and broken pipe) raised
            # while the batched data is being sent, which is also when the
            # reader cannot be checked. A broken connection on the writer
            # side ultimately surfaces as an error on the reader side.
            pass
        finally:
            t1.cancel()
            self.log("closed " + addr)
            writer.close()
            self.tcpWriter = None

    def _rewind_to_batch_start(self, polled):
        """Seek every polled partition back to the first offset of its batch."""
        for tp, recordList in polled.items():
            if recordList:
                self.consumer.seek(tp, recordList[0].offset)

    async def poll_from_Kafka(self, writer):
        """Poll Kafka forever and write each record value to ``writer``.

        A batch is committed only after every record of it was written and
        drained. If the connection fails, or the task is cancelled, while a
        batch is being sent, the consumer is moved back to the start of that
        batch so that the records are delivered again later.
        """
        while True:
            # NOTE: poll() blocks the event loop for up to one second.
            polled = self.consumer.poll(timeout_ms=1000)
            try:
                for recordList in polled.values():
                    for record in recordList:
                        writer.write(record.value)
                        await writer.drain()
            except ConnectionError:
                # The error is not raised reliably: writing to and draining
                # a broken connection may succeed at first, and fails more
                # often when it is done repeatedly.
                print("Last batch not fully sent, not commited.")
                self._rewind_to_batch_start(polled)
                break
            except asyncio.CancelledError:
                # The handler cancels this task when the client goes away;
                # do not lose the batch that was in flight.
                self._rewind_to_batch_start(polled)
                raise
            else:
                self.consumer.commit()
            await asyncio.sleep(self.interval)

    def log(self, msg):
        """Publish ``msg``, prefixed with the current timestamp, to ``logTopic``."""
        self.producer.send(
            self.logTopic,
            self.LOG_FORMAT.format(datetime.now().timestamp(), msg).encode())

    def cleanup(self):
        """Log the shutdown, then close the consumer and flush/close the producer."""
        self.log("shutdown")
        self.consumer.close()
        self.producer.flush()
        self.producer.close()

    def run(self):
        """Log the start-up and run the TCP server until it is stopped."""
        self.log("running")
        asyncio.run(self._async_run())

    async def _async_run(self):
        """Start the TCP server and serve clients forever."""
        tcpServer = await asyncio.start_server(
            self.tcp_server_handler, self.tcpHost, self.tcpPort)
        # The context manager closes the server when serve_forever() exits.
        async with tcpServer:
            await tcpServer.serve_forever()
def getMsgData(topic, group, result, maxsize):
    """Collect pending messages of ``topic`` and publish a summary record.

    Every partition of ``topic`` is visited in turn. Starting from the
    group's committed offset (or the partition's first offset if that is
    higher), message values are appended to ``result.topic_messages`` until
    the partition end is reached or ``maxsize`` messages have been collected
    in total. A ``SaveDataResult`` describing the run is then sent as JSON
    to the topic ``topic + "_log"``.

    Args:
        topic: Name of the topic to read.
        group: Consumer group id passed to ``KafkaConsumer``.
        result: Object that receives ``guid`` and ``topic_messages``.
        maxsize: Upper bound on the number of messages collected overall.

    NOTE(review): the handler of the outer ``try`` is not visible in this
    part of the file, so error behaviour and the return value cannot be
    documented from here.
    """
    try:
        saveResult = SaveDataResult()
        saveResult.guid = str(uuid.uuid4())
        saveResult.CreateDate = datetime.datetime.now().strftime(
            "%Y-%m-%d %H:%M:%S")
        msgInfos = []
        result.guid = saveResult.guid
        result.topic_messages = []
        consumer = KafkaConsumer(bootstrap_servers=tmpbootstrap_servers,
                                 enable_auto_commit=False,
                                 group_id=group)
        # Get all partitions by topic
        par = consumer.partitions_for_topic(topic)
        # Number of messages collected so far, across all partitions.
        now_count = 0
        for p in par:
            tp = TopicPartition(topic, p)
            # Only one partition is assigned at a time.
            consumer.assign([tp])
            print(tp)
            info = MsgPartitionInfo()
            # Get committed offset
            print('start to get committed offset.....')
            try:
                committed = consumer.committed(tp) or 0
            # NOTE(review): "except X, name" is Python 2 only syntax. Also,
            # if committed() raises, `committed` stays unbound (or keeps the
            # value of the previous partition) when it is read below.
            except Exception, e_commit:
                print(str(e_commit))
            # Move consumer to end to get the last position
            consumer.seek_to_end(tp)
            last_offset = consumer.position(tp)
            # Move consumer to beginning to get the first position
            consumer.seek_to_beginning()
            now_offset = consumer.position(tp)
            from_offset = committed
            # Because of the "or 0" above, `committed` is never None when
            # the lookup succeeded, so this branch is then not taken.
            if from_offset is None:
                from_offset = now_offset
            # Never start before the first offset of the partition.
            if from_offset < now_offset:
                from_offset = now_offset
            info.partition_ID = tp.partition
            info.get_last_offset = last_offset
            msgInfos.append(info)
            print("[%s] partition(%s) -> now:%s, last:%s, committed:%s" %
                  (tp.topic, tp.partition, now_offset, last_offset, committed))
            # Get msg from position to offset
            while (from_offset < last_offset) and (now_count < maxsize):
                # One seek + poll per message; only the first polled record
                # is used.
                consumer.seek(tp, from_offset)
                polldata = consumer.poll(100)
                from_offset += 1
                now_count += 1
                print('now_count=' + str(now_count))
                # NOTE(review): presumably raises if poll() returned nothing
                # for tp within the timeout -- confirm.
                result.topic_messages.append(polldata[tp][0].value)
        saveResult.MsgInfo = json.dumps(msgInfos,
                                        default=encode_MsgPartitionInfo,
                                        ensure_ascii=False)
        print(saveResult.MsgInfo)
        consumer.close()
        saveResult.message = "Success"
        saveResult.Code = 200
        # Publish the summary of this run to "<topic>_log".
        producer = KafkaProducer(bootstrap_servers=tmpbootstrap_servers)
        producer.send(topic + "_log",
                      json.dumps(saveResult, default=encode_SaveDataResult))
        producer.flush()
# https://github.com/cuyu/python-demo/blob/master/demo_kafka.py
# Demo: read a few messages of one partition, starting at a fixed offset.
_TOPIC_NAME = 'anomaly'
_BROKERS = ['localhost:9092']  # ['localhost.com:9092', 'systest-auto-deployer:9092']
_GROUP_ID = 'my_group'

consumer = KafkaConsumer(
    group_id='ddd',
    # 'earliest' replaces the deprecated alias 'smallest'
    # (and 'latest' replaces 'largest').
    auto_offset_reset='earliest',
    # When true, the consumer syncs the offset to ZooKeeper after consuming
    # messages, so that when a consumer fails, a new consumer can fetch the
    # latest offset from ZooKeeper.
    enable_auto_commit=False,
    bootstrap_servers=_BROKERS)
# consumer = KafkaConsumer(bootstrap_servers=_BROKERS)

# Partition 0 of the topic is assigned manually instead of subscribing.
tp = TopicPartition(_TOPIC_NAME, 0)
consumer.assign([tp])
print(consumer.committed(tp))

# consumer.subscribe(topics=[_TOPIC_NAME])
# # Subscribe to a regex topic pattern
# consumer.subscribe(pattern='^awesome.*')
print(consumer.topics())
# partition = TopicPartition(topic=_TOPIC_NAME, partition=consumer.partitions_for_topic(_TOPIC_NAME))

# consumer.seek_to_beginning()
# consumer.seek(TopicPartition(_TOPIC_NAME, 0), 0)
consumer.seek(tp, 50)  # start consuming from offset 50

# Offsets of the first five messages received.
a = []
for m in consumer:
    if len(a) < 5:
        print(m.offset)
        a.append(m.offset)
        # consumer.commit()
    # else: