class KafkaGroupReader:
    """Rebuild the group -> topics mapping by replaying the consumer offsets topic.

    ``read_groups`` consumes ``CONSUMER_OFFSET_TOPIC`` from the earliest
    offset up to the high watermark captured when the read started, keeping
    the last offset seen for every group/topic/partition.
    """

    def __init__(self, kafka_config):
        """Store the cluster configuration; no connection is opened here."""
        self.log = logging.getLogger(__name__)
        self.kafka_config = kafka_config
        # group -> topic -> partition -> last offset seen
        self._kafka_groups = defaultdict(lambda: defaultdict(dict))
        # partition id -> TopicPartition that still has to be consumed
        self.active_partitions = {}
        self._finished = False

    def read_group(self, group_id):
        """Return the topics found for ``group_id`` (empty list if none).

        Only the offsets-topic partition that ``get_group_partition`` maps
        the group to is read.
        """
        partition_count = get_offset_topic_partition_count(self.kafka_config)
        partition = get_group_partition(group_id, partition_count)
        # .get(): an unknown group (or an empty offsets partition) must not
        # surface as a KeyError.
        return self.read_groups(partition).get(group_id, [])

    def read_groups(self, partition=None):
        """Read one offsets-topic partition (or all of them when ``None``).

        Returns a dict mapping each group to the keys view of its topics;
        groups without any remaining topic are omitted.
        """
        # The reader may be reused; without this reset a second call would
        # skip consumption entirely and return stale data.
        self._finished = False
        self.consumer = KafkaConsumer(
            group_id='offset_monitoring_consumer',
            bootstrap_servers=self.kafka_config.broker_list,
            auto_offset_reset='earliest',
            enable_auto_commit=False,
            consumer_timeout_ms=30000,
            fetch_max_wait_ms=2000,
            max_partition_fetch_bytes=10 * 1024 * 1024,  # 10MB
        )
        try:
            return self._consume_groups(partition)
        finally:
            # Release the consumer on every exit path (early return, error).
            self.consumer.close()

    def _consume_groups(self, partition):
        """Replay the selected partitions on ``self.consumer`` and build the result."""
        # Fetch metadata as partitions_for_topic only returns locally cached metadata
        # See https://github.com/dpkp/kafka-python/issues/1742
        self.consumer.topics()

        if partition is not None:
            self.active_partitions = {
                partition: TopicPartition(CONSUMER_OFFSET_TOPIC, partition),
            }
        else:
            # partitions_for_topic returns None when the topic is unknown.
            partition_ids = self.consumer.partitions_for_topic(CONSUMER_OFFSET_TOPIC) or ()
            self.active_partitions = {
                p: TopicPartition(CONSUMER_OFFSET_TOPIC, p) for p in partition_ids
            }
        self.watermarks = self.get_current_watermarks(
            list(self.active_partitions.values()))

        # Only partitions that actually hold messages can be consumed.
        self.active_partitions = {
            p: tp
            for p, tp in self.active_partitions.items()
            if tp.partition in self.watermarks
            and self.watermarks[tp.partition].highmark > 0
            and self.watermarks[tp.partition].highmark > self.watermarks[tp.partition].lowmark
        }
        # Cannot consume if there are no active partitions
        if not self.active_partitions:
            return {}

        self.consumer.assign(list(self.active_partitions.values()))
        self.log.info("Consuming from %s", self.active_partitions)

        message_iterator = iter(self.consumer)
        while not self.finished():
            try:
                message = next(message_iterator)
            except StopIteration:
                # The consumer timed out without data; keep waiting.
                continue
            # Stop when reaching the last message written to the
            # __consumer_offsets topic when KafkaGroupReader first started
            if message.offset >= self.watermarks[message.partition].highmark - 1:
                self.remove_partition_from_consumer(message.partition)
            self.process_consumer_offset_message(message)

        self._remove_unsubscribed_topics()
        return {
            group: topics.keys()
            for group, topics in self._kafka_groups.items()
            if topics
        }

    def _remove_unsubscribed_topics(self):
        """Drop every topic whose recorded offsets are all zero."""
        for group, topics in list(self._kafka_groups.items()):
            for topic, partitions in list(topics.items()):
                # If offsets for all partitions are 0, consider the topic as unsubscribed
                if not any(partitions.values()):
                    del topics[topic]
                    self.log.info(
                        "Removed group %s topic %s from list of groups",
                        group, topic,
                    )

    def remove_partition_from_consumer(self, partition):
        """Stop consuming ``partition``; mark the read finished if it was the last."""
        deleted = self.active_partitions.pop(partition)
        # Terminate if there are no more partitions to consume
        if not self.active_partitions:
            self.log.info("Completed reading from all partitions")
            self._finished = True
            return
        # Reassign the remaining partitions to the consumer while saving the
        # position
        positions = [
            (p, self.consumer.position(p))
            for p in self.active_partitions.values()
        ]
        self.consumer.assign(list(self.active_partitions.values()))
        for topic_partition, position in positions:
            self.consumer.seek(topic_partition, position)
        self.log.info(
            "Completed reading from %s. Remaining partitions: %s",
            deleted,
            self.active_partitions,
        )

    def parse_consumer_offset_message(self, message):
        """Decode one offsets-topic record into ``(group, topic, partition, offset)``.

        The key is read as a big-endian int16 schema version, two short
        strings (group, topic) and an int32 partition; the value as an int16
        schema version followed by an int64 offset. ``offset`` is ``None``
        when the record has no value.

        Raises:
            InvalidMessageException: key or value schema version is not 0 or 1.
        """
        key = message.key
        ((key_schema,), cur) = relative_unpack(b'>h', key, 0)
        if key_schema not in [0, 1]:
            raise InvalidMessageException()  # This is not an offset commit message
        (group, cur) = read_short_string(key, cur)
        (topic, cur) = read_short_string(key, cur)
        ((partition,), cur) = relative_unpack(b'>l', key, cur)
        if message.value:
            value = message.value
            ((value_schema,), cur) = relative_unpack(b'>h', value, 0)
            if value_schema not in [0, 1]:
                raise InvalidMessageException()  # Unrecognized message value
            ((offset,), cur) = relative_unpack(b'>q', value, cur)
        else:
            offset = None  # Offset was deleted
        return group.decode(), topic.decode(), partition, offset

    def process_consumer_offset_message(self, message):
        """Apply one offsets-topic record to the in-memory group map.

        Records that fail to parse are ignored.
        """
        try:
            group, topic, partition, offset = self.parse_consumer_offset_message(message)
        except InvalidMessageException:
            return

        if offset is not None:
            self._kafka_groups[group][topic][partition] = offset
            self.log.info(
                "Updated group %s topic %s and updated offset in list of groups",
                group, topic,
            )
        # TODO: check if we can ever find an offset commit message with message.value is None
        elif group in self._kafka_groups and topic in self._kafka_groups[group]:
            # No offset means topic deletion
            del self._kafka_groups[group][topic]
            self.log.info(
                "Removed group %s topic %s from list of groups", group, topic)

    def get_current_watermarks(self, partitions=None):
        """Return ``{partition: watermark}`` for non-empty offsets-topic partitions.

        ``partitions`` is an optional iterable of TopicPartition used to
        restrict the result; ``None`` means every partition.
        """
        client = KafkaToolClient(self.kafka_config.broker_list)
        client.load_metadata_for_topics(CONSUMER_OFFSET_TOPIC)
        offsets = get_topics_watermarks(
            client,
            [CONSUMER_OFFSET_TOPIC],
        )
        return self._filter_watermarks(offsets[CONSUMER_OFFSET_TOPIC], partitions)

    @staticmethod
    def _filter_watermarks(topic_offsets, partitions=None):
        """Keep watermarks with highmark > lowmark, optionally limited to ``partitions``.

        An empty ``partitions`` selects nothing; previously it fell through
        to ``part in None`` and raised TypeError.
        """
        wanted = None if partitions is None else {tp.partition for tp in partitions}
        return {
            part: offset
            for part, offset in topic_offsets.items()
            if offset.highmark > offset.lowmark
            and (wanted is None or part in wanted)
        }

    def finished(self):
        """Return True once every active partition has been read to its highmark."""
        return self._finished
def getMsgData(topic, group, result, maxsize): try: saveResult = SaveDataResult() saveResult.guid = str(uuid.uuid4()) saveResult.CreateDate = datetime.datetime.now().strftime( "%Y-%m-%d %H:%M:%S") msgInfos = [] result.guid = saveResult.guid result.topic_messages = [] consumer = KafkaConsumer(bootstrap_servers=tmpbootstrap_servers, enable_auto_commit=False, group_id=group) # Get all partitions by topic par = consumer.partitions_for_topic(topic) now_count = 0 for p in par: tp = TopicPartition(topic, p) consumer.assign([tp]) print(tp) info = MsgPartitionInfo() # Get committed offset print('start to get committed offset.....') try: committed = consumer.committed(tp) or 0 except Exception, e_commit: print(str(e_commit)) # Move consumer to end to get the last position consumer.seek_to_end(tp) last_offset = consumer.position(tp) # Move consumer to beginning to get the first position consumer.seek_to_beginning() now_offset = consumer.position(tp) from_offset = committed if from_offset is None: from_offset = now_offset if from_offset < now_offset: from_offset = now_offset info.partition_ID = tp.partition info.get_last_offset = last_offset msgInfos.append(info) print("[%s] partition(%s) -> now:%s, last:%s, committed:%s" % (tp.topic, tp.partition, now_offset, last_offset, committed)) # Get msg from position to offset while (from_offset < last_offset) and (now_count < maxsize): consumer.seek(tp, from_offset) polldata = consumer.poll(100) from_offset += 1 now_count += 1 print('now_count=' + str(now_count)) result.topic_messages.append(polldata[tp][0].value) saveResult.MsgInfo = json.dumps(msgInfos, default=encode_MsgPartitionInfo, ensure_ascii=False) print(saveResult.MsgInfo) consumer.close() saveResult.message = "Success" saveResult.Code = 200 producer = KafkaProducer(bootstrap_servers=tmpbootstrap_servers) producer.send(topic + "_log", json.dumps(saveResult, default=encode_SaveDataResult)) producer.flush()
class KafkaGroupReader:
    """Collect the topics each consumer group has committed offsets for.

    ``read_groups`` consumes ``CONSUMER_OFFSET_TOPIC`` from the earliest
    offset up to the high watermark captured when the read started.
    The code avoids Python-2-only calls (``iteritems``, ``consumer.next()``)
    so it also runs on Python 3.
    """

    def __init__(self, kafka_config):
        """Store the cluster configuration; no connection is opened here."""
        self.log = logging.getLogger(__name__)
        self.kafka_config = kafka_config
        # group -> set of topics with a non-zero committed offset
        self.kafka_groups = defaultdict(set)
        # partition id -> TopicPartition that still has to be consumed
        self.active_partitions = {}
        self._finished = False

    def read_group(self, group_id):
        """Return the topics found for ``group_id`` (empty list if none)."""
        partition_count = get_offset_topic_partition_count(self.kafka_config)
        partition = get_group_partition(group_id, partition_count)
        return self.read_groups(partition).get(group_id, [])

    def read_groups(self, partition=None):
        """Read one offsets-topic partition (or all of them when ``None``).

        Returns a dict mapping each group to its non-empty set of topics.
        """
        # The reader may be reused; without this reset a second call would
        # skip consumption entirely and return stale data.
        self._finished = False
        self.consumer = KafkaConsumer(
            group_id='offset_monitoring_consumer',
            bootstrap_servers=self.kafka_config.broker_list,
            auto_offset_reset='earliest',
            enable_auto_commit=False,
            consumer_timeout_ms=30000,
            fetch_max_wait_ms=2000,
            max_partition_fetch_bytes=10 * 1024 * 1024,  # 10MB
        )
        try:
            return self._consume_groups(partition)
        finally:
            # Release the consumer on every exit path (early return, error).
            self.consumer.close()

    def _consume_groups(self, partition):
        """Replay the selected partitions on ``self.consumer`` and build the result."""
        if partition is not None:
            self.active_partitions = {
                partition: TopicPartition(CONSUMER_OFFSET_TOPIC, partition),
            }
        else:
            # partitions_for_topic returns None when the topic is unknown.
            partition_ids = self.consumer.partitions_for_topic(CONSUMER_OFFSET_TOPIC) or ()
            self.active_partitions = {
                p: TopicPartition(CONSUMER_OFFSET_TOPIC, p) for p in partition_ids
            }
        self.watermarks = self.get_current_watermarks(
            list(self.active_partitions.values()))

        # Only partitions that actually hold messages can be consumed.
        self.active_partitions = {
            p: tp
            for p, tp in self.active_partitions.items()
            if tp.partition in self.watermarks
            and self.watermarks[tp.partition].highmark > 0
            and self.watermarks[tp.partition].highmark > self.watermarks[tp.partition].lowmark
        }
        # Cannot consume if there are no active partitions
        if not self.active_partitions:
            return {}

        self.consumer.assign(list(self.active_partitions.values()))
        self.log.info("Consuming from %s", self.active_partitions)

        while not self.finished():
            try:
                # next() builtin works on Python 2 (.next) and Python 3 (__next__)
                message = next(self.consumer)
            except StopIteration:
                # The consumer timed out without data; keep waiting.
                continue
            # Stop when reaching the last message written to the
            # __consumer_offsets topic when KafkaGroupReader first started
            if message.offset >= self.watermarks[message.partition].highmark - 1:
                self.remove_partition_from_consumer(message.partition)
            self.process_consumer_offset_message(message)

        return {
            group: topics
            for group, topics in self.kafka_groups.items()
            if topics
        }

    def remove_partition_from_consumer(self, partition):
        """Stop consuming ``partition``; mark the read finished if it was the last."""
        deleted = self.active_partitions.pop(partition)
        # Terminate if there are no more partitions to consume
        if not self.active_partitions:
            self.log.info("Completed reading from all partitions")
            self._finished = True
            return
        # Reassign the remaining partitions to the consumer while saving the
        # position
        positions = [
            (p, self.consumer.position(p))
            for p in self.active_partitions.values()
        ]
        self.consumer.assign(list(self.active_partitions.values()))
        for topic_partition, position in positions:
            self.consumer.seek(topic_partition, position)
        self.log.info(
            "Completed reading from %s. Remaining partitions: %s",
            deleted,
            self.active_partitions,
        )

    @staticmethod
    def _to_text(raw):
        """Convert a bytes-like field to the native ``str`` type.

        ``str(bytearray)`` is the payload on Python 2 but the repr on
        Python 3, so the value is decoded explicitly there.
        """
        data = bytes(raw)
        return data if isinstance(data, str) else data.decode('utf-8')

    def parse_consumer_offset_message(self, message):
        """Decode one offsets-topic record into ``(group, topic, partition, offset)``.

        The key is read as a big-endian int16 schema version, two short
        strings (group, topic) and an int32 partition; the value as an int16
        schema version followed by an int64 offset. ``offset`` is ``None``
        when the record has no value.

        Raises:
            InvalidMessageException: key or value schema version is not 0 or 1.
        """
        key = bytearray(message.key)
        ((key_schema,), cur) = relative_unpack(b'>h', key, 0)
        if key_schema not in [0, 1]:
            raise InvalidMessageException()  # This is not an offset commit message
        (group, cur) = read_short_string(key, cur)
        (topic, cur) = read_short_string(key, cur)
        ((partition,), cur) = relative_unpack(b'>l', key, cur)
        if message.value:
            value = bytearray(message.value)
            ((value_schema,), cur) = relative_unpack(b'>h', value, 0)
            if value_schema not in [0, 1]:
                raise InvalidMessageException()  # Unrecognized message value
            ((offset,), cur) = relative_unpack(b'>q', value, cur)
        else:
            offset = None  # Offset was deleted
        return self._to_text(group), self._to_text(topic), partition, offset

    def process_consumer_offset_message(self, message):
        """Add or remove a group's topic based on one offsets-topic record.

        Records that fail to parse are ignored.
        NOTE(review): an offset of 0 is treated like a missing offset and
        removes the topic even if other partitions have progress -- confirm
        this is intended.
        """
        try:
            group, topic, partition, offset = self.parse_consumer_offset_message(message)
        except InvalidMessageException:
            return

        # Membership test first so the defaultdict does not grow on lookups.
        known = group in self.kafka_groups and topic in self.kafka_groups[group]
        if offset and not known:
            self.kafka_groups[group].add(topic)
            self.log.info("Added group %s topic %s to list of groups", group, topic)
        elif not offset and known:
            # No offset means topic deletion
            self.kafka_groups[group].discard(topic)
            self.log.info("Removed group %s topic %s from list of groups", group, topic)

    def get_current_watermarks(self, partitions=None):
        """Return ``{partition: watermark}`` for non-empty offsets-topic partitions.

        ``partitions`` is an optional iterable of TopicPartition used to
        restrict the result; ``None`` means every partition.
        """
        client = KafkaToolClient(self.kafka_config.broker_list)
        client.load_metadata_for_topics(CONSUMER_OFFSET_TOPIC)
        offsets = get_topics_watermarks(
            client,
            [CONSUMER_OFFSET_TOPIC],
        )
        return self._filter_watermarks(offsets[CONSUMER_OFFSET_TOPIC], partitions)

    @staticmethod
    def _filter_watermarks(topic_offsets, partitions=None):
        """Keep watermarks with highmark > lowmark, optionally limited to ``partitions``.

        An empty ``partitions`` selects nothing; previously it fell through
        to ``part in None`` and raised TypeError.
        """
        wanted = None if partitions is None else {tp.partition for tp in partitions}
        return {
            part: offset
            for part, offset in topic_offsets.items()
            if offset.highmark > offset.lowmark
            and (wanted is None or part in wanted)
        }

    def finished(self):
        """Return True once every active partition has been read to its highmark."""
        return self._finished
class IBUSStreamingDownsamplingConsumer:
    """Forward records from a Kafka topic to a single TCP client in batches.

    One TCP connection is served at a time. While it is open, the topic is
    polled, each polled batch is written to the client, and offsets are
    committed only after the batch has been drained to the socket.
    """

    LOG_FORMAT = "{} UTC_TS\t{}"

    def __init__(self, kafkaHost, kafkaPort, tcpHost, tcpPort, group_id,
                 topic, logTopic, interval):
        """Create the Kafka consumer/producer; ``interval`` is the pause in seconds between polls."""
        self.kafkaHost = kafkaHost
        self.kafkaPort = kafkaPort
        self.tcpHost = tcpHost
        self.tcpPort = tcpPort
        self.group_id = group_id
        self.topic = topic
        self.logTopic = logTopic
        self.interval = int(interval)
        self.consumer = KafkaConsumer(
            topic,
            bootstrap_servers=["{}:{}".format(kafkaHost, kafkaPort)],
            group_id=group_id,
            enable_auto_commit=False)
        self.producer = KafkaProducer(
            bootstrap_servers=["{}:{}".format(kafkaHost, kafkaPort)])
        # Writer of the currently connected client, or None when idle.
        self.tcpWriter = None

    def getTopicPartitions(self):
        """Return a TopicPartition for every known partition of ``self.topic``."""
        # This ensures the local cache is updated with information about
        # partitions, offsets etc.
        self.consumer.topics()
        # partitions_for_topic returns None when the topic is unknown.
        pids = self.consumer.partitions_for_topic(self.topic) or ()
        return [TopicPartition(self.topic, pid) for pid in pids]

    def getTopicPartitionsCommittedPositions(self):
        """Return ``[(TopicPartition, committed_offset)]``; the offset is whatever ``committed`` reports."""
        return [(tp, self.consumer.committed(tp))
                for tp in self.getTopicPartitions()]

    @staticmethod
    def _rewind_offsets(polled):
        """Map each partition in a poll result to the offset of its first record."""
        return {tp: records[0].offset for tp, records in polled.items() if records}

    async def tcp_server_handler(self, reader, writer):
        """Serve one client: stream Kafka data to it until it disconnects."""
        addr = str(writer.get_extra_info("socket").getpeername())
        if self.tcpWriter is not None:
            self.log("refused " + addr)
            writer.write(b"Connection limit reached; connection refused.")
            writer.close()
            return
        self.log("accepted " + addr)
        self.tcpWriter = writer
        t1 = asyncio.create_task(self.poll_from_Kafka(writer))
        try:
            # Incoming data is discarded; reading only detects the client's EOF.
            while True:
                data = await reader.read(1024 * 16)
                if not data:
                    break
        except ConnectionError:
            # A connection broken while a batch is being written can surface
            # on the reader side as a reset or a broken pipe; both just mean
            # the client is gone.
            pass
        finally:
            t1.cancel()
            self.log("closed " + addr)
            writer.close()
            self.tcpWriter = None

    async def poll_from_Kafka(self, writer):
        """Poll Kafka and write each batch to ``writer`` until the connection breaks."""
        while True:
            # NOTE(review): poll() is a blocking call made on the event loop.
            polled = self.consumer.poll(timeout_ms=1000)
            try:
                for recordList in polled.values():
                    for record in recordList:
                        writer.write(record.value)
                await writer.drain()
            except ConnectionError:
                # The error is not thrown reliably: writing to a broken
                # connection may succeed once and only fail on later attempts.
                print("Last batch not fully sent, not commited.")
                # Rewind to the first record of the unsent batch. This only
                # touches partitions that were just polled (hence assigned)
                # and does not depend on a committed offset existing, unlike
                # seeking to committed(), which may be None.
                for tp, offset in self._rewind_offsets(polled).items():
                    self.consumer.seek(tp, offset)
                break
            else:
                self.consumer.commit()
            await asyncio.sleep(self.interval)

    def log(self, msg):
        """Publish ``msg`` to the log topic, prefixed with the current epoch timestamp."""
        line = self.LOG_FORMAT.format(datetime.now().timestamp(), msg)
        self.producer.send(self.logTopic, line.encode())

    def cleanup(self):
        """Log the shutdown, then close the consumer and flush/close the producer."""
        self.log("shutdown")
        self.consumer.close()
        self.producer.flush()
        self.producer.close()

    def run(self):
        """Start the TCP server and block until the event loop stops."""
        self.log("running")
        asyncio.run(self._async_run())

    async def _async_run(self):
        """Listen on ``tcpHost:tcpPort`` and serve clients forever."""
        tcpServer = await asyncio.start_server(
            self.tcp_server_handler, self.tcpHost, self.tcpPort)
        # The context manager closes the listening socket on exit.
        async with tcpServer:
            await tcpServer.serve_forever()
def _group_events_by_table(event_list, schema):
    """Bucket buffered events per table by table + schema + action + action_core."""
    grouped = {}
    for table, items in event_list.items():
        for item in items:
            bucket_key = table + schema + item["action"] + item["action_core"]
            grouped.setdefault(table, {}).setdefault(bucket_key, []).append(item)
    return grouped


def consume(args):
    """Consume change events from Kafka and write them out in batches.

    Events are buffered per table and flushed through ``writer.insert_event``
    when ``settings.insert_num`` events are buffered, when
    ``settings.insert_interval`` (if > 0) has elapsed between event
    timestamps, or when a ``query`` event arrives; the query is executed
    after the flush. Offsets are committed after every flush.

    Args:
        args: namespace providing ``schema``, ``skip_error``,
            ``auto_offset_reset`` and ``offset``.

    Unless ``skip_error`` is set, the process exits on the first failed
    insert or query.
    """
    # exit() is injected by the site module for interactive use; sys.exit()
    # is the supported call and terminates with the same status.
    import sys

    settings = Global.settings
    writer = Global.writer
    reader = Global.reader

    schema = args.schema
    skip_error = args.skip_error
    auto_offset_reset = args.auto_offset_reset
    offset = args.offset
    topic = settings.kafka_topic

    schema_table = settings.schema_table.get(schema)
    tables = schema_table.get("tables")
    tables_pk = {
        table: reader.get_primary_key(schema, table) for table in tables
    }

    consumer = KafkaConsumer(
        bootstrap_servers=settings.kafka_server,
        value_deserializer=lambda x: json.loads(x, object_hook=object_hook),
        key_deserializer=lambda x: x.decode() if x else None,
        enable_auto_commit=False,
        group_id=schema,
        auto_offset_reset=auto_offset_reset,
    )
    partition = schema_table.get("kafka_partition")
    topic_partition = TopicPartition(topic, partition)
    consumer.assign([topic_partition])
    if offset:
        consumer.seek(topic_partition, offset)

    event_list = {}   # table -> events buffered since the last flush
    is_insert = False
    last_time = 0     # timestamp of the first event seen since the last flush
    len_event = 0     # number of buffered (non-query) events
    logger.info(
        f"success consume topic:{topic},partitions:{partition},schema:{schema},tables:{tables}"
    )

    for msg in consumer:  # type:ConsumerRecord
        logger.debug(f"kafka msg:{msg}")
        event = msg.value
        # event_unixtime is divided by 10**6 before being compared with
        # insert_interval.
        event_unixtime = event["event_unixtime"] / 10**6
        table = event["table"]
        schema = event["schema"]
        action = event["action"]

        if action == "query":
            # Query events are not buffered; they force a flush first.
            alter_table = True
            query = event["values"]["query"]
        else:
            alter_table = False
            query = None
            event_list.setdefault(table, []).append(event)
            len_event += 1

        if last_time == 0:
            last_time = event_unixtime

        if len_event == settings.insert_num:
            is_insert = True
        elif event_unixtime - last_time >= settings.insert_interval > 0:
            is_insert = True

        if not (is_insert or alter_table):
            continue

        events_num = 0
        data_dict = _group_events_by_table(event_list, schema)
        for table, buckets in data_dict.items():
            tmp_data = list(buckets.values())
            events_num += sum(len(bucket) for bucket in tmp_data)
            try:
                result = writer.insert_event(tmp_data, schema, table,
                                             tables_pk.get(table))
                if not result:
                    logger.error("insert event error!")
                    if not skip_error:
                        sys.exit()
            except Exception as e:
                logger.error(f"insert event error!,error:{e}")
                if not skip_error:
                    sys.exit()

        if alter_table:
            try:
                logger.info(f"execute query:{query}")
                writer.execute(query)
            except Exception as e:
                logger.error(f"execute query error!,error:{e}")
                if not skip_error:
                    sys.exit()

        consumer.commit()
        logger.info(f"commit success {events_num} events!")
        event_list = {}
        is_insert = False
        len_event = last_time = 0
auto_offset_reset='smallest', #largest enable_auto_commit= False, ## true时,Consumer会在消费消息后将offset同步到zookeeper,这样当Consumer失败后,新的consumer就能从zookeeper获取最新的offset bootstrap_servers=_BROKERS) # consumer = KafkaConsumer(bootstrap_servers=_BROKERS) consumer.assign([TopicPartition(_TOPIC_NAME, 0)]) tp = TopicPartition(_TOPIC_NAME, 0) print(consumer.committed(TopicPartition(_TOPIC_NAME, 0))) # consumer.subscribe(topics=[_TOPIC_NAME]) # # Subscribe to a regex topic pattern # consumer.subscribe(pattern='^awesome.*') print(consumer.topics()) # partition = TopicPartition(topic=_TOPIC_NAME, partition=consumer.partitions_for_topic(_TOPIC_NAME)) # consumer.seek_to_beginning() # consumer.seek(TopicPartition(_TOPIC_NAME, 0), 0) consumer.seek(tp, 50) # 10 stands for start consumer from 10th offset a = [] for m in consumer: if len(a) < 5: print(m.offset) a.append(m.offset) # consumer.commit() # else: # a =[] ProduceRequestPayload = namedtuple("ProduceRequestPayload", ["topic", "partition", "messages"]) ProduceResponsePayload = namedtuple("ProduceResponsePayload", ["topic", "partition", "error", "offset"])
class KafkaGroupReader:
    """Rebuild the group -> topics mapping by replaying the consumer offsets topic.

    ``read_groups`` consumes ``CONSUMER_OFFSET_TOPIC`` from the earliest
    offset up to the high watermark captured when the read started, keeping
    the last offset seen for every group/topic/partition.
    """

    def __init__(self, kafka_config):
        """Store the cluster configuration; no connection is opened here."""
        self.log = logging.getLogger(__name__)
        self.kafka_config = kafka_config
        # group -> topic -> partition -> last offset seen
        self._kafka_groups = defaultdict(lambda: defaultdict(dict))
        # partition id -> TopicPartition that still has to be consumed
        self.active_partitions = {}
        self._finished = False

    def read_group(self, group_id):
        """Return the topics found for ``group_id`` (empty list if none).

        Only the offsets-topic partition that ``get_group_partition`` maps
        the group to is read.
        """
        partition_count = get_offset_topic_partition_count(self.kafka_config)
        partition = get_group_partition(group_id, partition_count)
        return self.read_groups(partition).get(group_id, [])

    def read_groups(self, partition=None):
        """Read one offsets-topic partition (or all of them when ``None``).

        Returns a dict mapping each group to the keys view of its topics;
        groups without any remaining topic are omitted.
        """
        # The reader may be reused; without this reset a second call would
        # skip consumption entirely and return stale data.
        self._finished = False
        self.consumer = KafkaConsumer(
            group_id='offset_monitoring_consumer',
            bootstrap_servers=self.kafka_config.broker_list,
            auto_offset_reset='earliest',
            enable_auto_commit=False,
            consumer_timeout_ms=30000,
            fetch_max_wait_ms=2000,
            max_partition_fetch_bytes=10 * 1024 * 1024,  # 10MB
        )
        try:
            return self._consume_groups(partition)
        finally:
            # Release the consumer on every exit path (early return, error).
            self.consumer.close()

    def _consume_groups(self, partition):
        """Replay the selected partitions on ``self.consumer`` and build the result."""
        # Fetch metadata as partitions_for_topic only returns locally cached metadata
        # See https://github.com/dpkp/kafka-python/issues/1742
        self.consumer.topics()

        if partition is not None:
            self.active_partitions = {
                partition: TopicPartition(CONSUMER_OFFSET_TOPIC, partition),
            }
        else:
            # partitions_for_topic returns None when the topic is unknown.
            partition_ids = self.consumer.partitions_for_topic(CONSUMER_OFFSET_TOPIC) or ()
            self.active_partitions = {
                p: TopicPartition(CONSUMER_OFFSET_TOPIC, p) for p in partition_ids
            }
        self.watermarks = self.get_current_watermarks(
            list(self.active_partitions.values()))

        # Only partitions that actually hold messages can be consumed.
        self.active_partitions = {
            p: tp
            for p, tp in self.active_partitions.items()
            if tp.partition in self.watermarks
            and self.watermarks[tp.partition].highmark > 0
            and self.watermarks[tp.partition].highmark > self.watermarks[tp.partition].lowmark
        }
        # Cannot consume if there are no active partitions
        if not self.active_partitions:
            return {}

        self.consumer.assign(list(self.active_partitions.values()))
        self.log.info("Consuming from %s", self.active_partitions)

        message_iterator = iter(self.consumer)
        while not self.finished():
            try:
                message = next(message_iterator)
            except StopIteration:
                # The consumer timed out without data; keep waiting.
                continue
            # Stop when reaching the last message written to the
            # __consumer_offsets topic when KafkaGroupReader first started
            if message.offset >= self.watermarks[message.partition].highmark - 1:
                self.remove_partition_from_consumer(message.partition)
            self.process_consumer_offset_message(message)

        self._remove_unsubscribed_topics()
        return {
            group: topics.keys()
            for group, topics in self._kafka_groups.items()
            if topics
        }

    def _remove_unsubscribed_topics(self):
        """Drop every topic whose recorded offsets are all zero."""
        for group, topics in list(self._kafka_groups.items()):
            for topic, partitions in list(topics.items()):
                # If offsets for all partitions are 0, consider the topic as unsubscribed
                if not any(partitions.values()):
                    del topics[topic]
                    self.log.info(
                        "Removed group %s topic %s from list of groups",
                        group, topic,
                    )

    def remove_partition_from_consumer(self, partition):
        """Stop consuming ``partition``; mark the read finished if it was the last."""
        deleted = self.active_partitions.pop(partition)
        # Terminate if there are no more partitions to consume
        if not self.active_partitions:
            self.log.info("Completed reading from all partitions")
            self._finished = True
            return
        # Reassign the remaining partitions to the consumer while saving the
        # position
        positions = [
            (p, self.consumer.position(p))
            for p in self.active_partitions.values()
        ]
        self.consumer.assign(list(self.active_partitions.values()))
        for topic_partition, position in positions:
            self.consumer.seek(topic_partition, position)
        self.log.info(
            "Completed reading from %s. Remaining partitions: %s",
            deleted,
            self.active_partitions,
        )

    def parse_consumer_offset_message(self, message):
        """Decode one offsets-topic record into ``(group, topic, partition, offset)``.

        The key is read as a big-endian int16 schema version, two short
        strings (group, topic) and an int32 partition; the value as an int16
        schema version followed by an int64 offset. ``offset`` is ``None``
        when the record has no value.

        Raises:
            InvalidMessageException: key or value schema version is not 0 or 1.
        """
        key = message.key
        ((key_schema,), cur) = relative_unpack(b'>h', key, 0)
        if key_schema not in [0, 1]:
            raise InvalidMessageException()  # This is not an offset commit message
        (group, cur) = read_short_string(key, cur)
        (topic, cur) = read_short_string(key, cur)
        ((partition,), cur) = relative_unpack(b'>l', key, cur)
        if message.value:
            value = message.value
            ((value_schema,), cur) = relative_unpack(b'>h', value, 0)
            if value_schema not in [0, 1]:
                raise InvalidMessageException()  # Unrecognized message value
            ((offset,), cur) = relative_unpack(b'>q', value, cur)
        else:
            offset = None  # Offset was deleted
        return group.decode(), topic.decode(), partition, offset

    def process_consumer_offset_message(self, message):
        """Apply one offsets-topic record to the in-memory group map.

        Records that fail to parse are ignored.
        """
        try:
            group, topic, partition, offset = self.parse_consumer_offset_message(message)
        except InvalidMessageException:
            return

        if offset is not None:
            self._kafka_groups[group][topic][partition] = offset
            self.log.info(
                "Updated group %s topic %s and updated offset in list of groups",
                group, topic,
            )
        # TODO: check if we can ever find an offset commit message with message.value is None
        elif group in self._kafka_groups and topic in self._kafka_groups[group]:
            # No offset means topic deletion
            del self._kafka_groups[group][topic]
            self.log.info(
                "Removed group %s topic %s from list of groups", group, topic)

    def get_current_watermarks(self, partitions=None):
        """Return ``{partition: watermark}`` for non-empty offsets-topic partitions.

        ``partitions`` is an optional iterable of TopicPartition used to
        restrict the result; ``None`` means every partition.
        """
        client = KafkaToolClient(self.kafka_config.broker_list)
        client.load_metadata_for_topics(CONSUMER_OFFSET_TOPIC)
        offsets = get_topics_watermarks(
            client,
            [CONSUMER_OFFSET_TOPIC],
        )
        return self._filter_watermarks(offsets[CONSUMER_OFFSET_TOPIC], partitions)

    @staticmethod
    def _filter_watermarks(topic_offsets, partitions=None):
        """Keep watermarks with highmark > lowmark, optionally limited to ``partitions``.

        An empty ``partitions`` selects nothing; previously it fell through
        to ``part in None`` and raised TypeError.
        """
        wanted = None if partitions is None else {tp.partition for tp in partitions}
        return {
            part: offset
            for part, offset in topic_offsets.items()
            if offset.highmark > offset.lowmark
            and (wanted is None or part in wanted)
        }

    def finished(self):
        """Return True once every active partition has been read to its highmark."""
        return self._finished