Example #1
def consume(args):
    schema = args.schema
    table = args.table
    assert schema in settings.SCHEMAS, 'schema must be in settings.SCHEMAS'
    assert table in settings.TABLES, 'table must be in settings.TABLES'
    group_id = f'{schema}.{table}'
    consumer = KafkaConsumer(
        bootstrap_servers=settings.KAFKA_SERVER,
        value_deserializer=lambda x: json.loads(x, object_hook=object_hook),
        key_deserializer=lambda x: x.decode() if x else None,
        enable_auto_commit=False,
        group_id=group_id,
        auto_offset_reset='earliest',
    )
    topic = settings.KAFKA_TOPIC
    partition = settings.PARTITIONS.get(group_id)
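    # Manually assign the partition configured for this schema.table instead of subscribing, so no group rebalancing happens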
    consumer.assign([TopicPartition(topic, partition)])
    event_list = []
    logger.info(
        f'success consume topic:{topic},partition:{partition},schema:{schema},table:{table}'
    )
    pk = reader.get_primary_key(schema, table)
    for msg in consumer:  # type:ConsumerRecord
        logger.debug(f'kafka msg:{msg}')
        event = msg.value
        event_list.append(event)
        len_event = len(event_list)
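        # Flush when INSERT_NUMS events are buffered or the oldest buffered event is at least INSERT_INTERVAL seconds old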
        if len_event == settings.INSERT_NUMS or (
            (int(time.time() * 10**6) - event_list[0]['event_unixtime']) /
                10**6 >= settings.INSERT_INTERVAL > 0):
            data_dict = {}
            tmp_data = []
            for items in event_list:
                action = items['action']
                action_core = items['action_core']
                data_dict.setdefault(table + schema + action + action_core,
                                     []).append(items)
            for k, v in data_dict.items():
                tmp_data.append(v)
            result = writer.insert_event(tmp_data, settings.SKIP_TYPE,
                                         settings.SKIP_DELETE_TB_NAME, schema,
                                         table, pk)
            if result:
                event_list = []
                consumer.commit()
                logger.info(f'commit success {len_event} events!')
            else:
                logger.error('insert event error!')
                exit()
Example #2
def consume(args):
    schema = args.schema
    tables = args.tables
    skip_error = args.skip_error
    assert schema in settings.SCHEMAS, f'schema {schema} must be in settings.SCHEMAS'
    topic = settings.KAFKA_TOPIC
    tables_pk = {}
    partitions = []
    for table in tables.split(','):
        assert table in settings.TABLES, f'table {table} must be in settings.TABLES'

        partition = settings.PARTITIONS.get(f'{schema}.{table}')
        tp = TopicPartition(topic, partition)
        partitions.append(tp)
        tables_pk[table] = reader.get_primary_key(schema, table)

    group_id = f'{schema}.{tables}'
    consumer = KafkaConsumer(
        bootstrap_servers=settings.KAFKA_SERVER,
        value_deserializer=lambda x: json.loads(x, object_hook=object_hook),
        key_deserializer=lambda x: x.decode() if x else None,
        enable_auto_commit=False,
        group_id=group_id,
        auto_offset_reset='earliest',
    )
    consumer.assign(partitions)

    event_list = {}
    is_insert = False
    last_time = 0
    len_event = 0
    logger.info(f'success consume topic:{topic},partitions:{partitions},schema:{schema},tables:{tables}')

    for msg in consumer:  # type:ConsumerRecord
        logger.debug(f'kafka msg:{msg}')
        event = msg.value
        event_unixtime = event['event_unixtime'] / 10 ** 6
        table = event['table']
        schema = event['schema']
        event_list.setdefault(table, []).append(event)
        len_event += 1

        if last_time == 0:
            last_time = event_unixtime

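        # Flush once INSERT_NUMS events are buffered or INSERT_INTERVAL seconds have passed since the first buffered event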
        if len_event == settings.INSERT_NUMS:
            is_insert = True
        else:
            if event_unixtime - last_time >= settings.INSERT_INTERVAL > 0:
                is_insert = True
        if is_insert:
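            # Regroup buffered events by table and by (action, action_core) so each writer.insert_event call receives a homogeneous batch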
            data_dict = {}
            events_num = 0
            for table, items in event_list.items():
                for item in items:
                    action = item['action']
                    action_core = item['action_core']
                    data_dict.setdefault(table, {}).setdefault(table + schema + action + action_core, []).append(item)
            for table, v in data_dict.items():
                tmp_data = []
                for k1, v1 in v.items():
                    events_num += len(v1)
                    tmp_data.append(v1)
                try:
                    result = writer.insert_event(tmp_data, schema, table, tables_pk.get(table))
                    if not result:
                        logger.error('insert event error!')
                        if not skip_error:
                            exit()
                except Exception as e:
                    logger.error(f'insert event error!,error:{e}')
                    if not skip_error:
                        exit()
            consumer.commit()
            logger.info(f'commit success {events_num} events!')
            event_list = {}
            is_insert = False
            len_event = last_time = 0
Example #3
def getMsgData(topic, group, result, maxsize):
    try:
        saveResult = SaveDataResult()
        saveResult.guid = str(uuid.uuid4())
        saveResult.CreateDate = datetime.datetime.now().strftime(
            "%Y-%m-%d %H:%M:%S")

        msgInfos = []
        result.guid = saveResult.guid
        result.topic_messages = []

        consumer = KafkaConsumer(bootstrap_servers=tmpbootstrap_servers,
                                 enable_auto_commit=False,
                                 group_id=group)

        # Get all partitions by topic
        par = consumer.partitions_for_topic(topic)

        now_count = 0

        for p in par:
            tp = TopicPartition(topic, p)
            consumer.assign([tp])
            print(tp)
            info = MsgPartitionInfo()

            # Get committed offset
            print('start to get committed offset.....')
            committed = None
            try:
                committed = consumer.committed(tp) or 0
            except Exception as e_commit:
                print(str(e_commit))

            # Move consumer to end to get the last position
            consumer.seek_to_end(tp)
            last_offset = consumer.position(tp)

            # Move consumer to beginning to get the first position
            consumer.seek_to_beginning()
            now_offset = consumer.position(tp)
            from_offset = committed

            if from_offset is None:
                from_offset = now_offset

            if from_offset < now_offset:
                from_offset = now_offset

            info.partition_ID = tp.partition
            info.get_last_offset = last_offset
            msgInfos.append(info)

            print("[%s] partition(%s) -> now:%s,  last:%s,  committed:%s" %
                  (tp.topic, tp.partition, now_offset, last_offset, committed))

            # Get msg from position to offset
            while (from_offset < last_offset) and (now_count < maxsize):
                consumer.seek(tp, from_offset)
                polldata = consumer.poll(100)
                from_offset += 1
                now_count += 1
                print('now_count=' + str(now_count))
                result.topic_messages.append(polldata[tp][0].value)

        saveResult.MsgInfo = json.dumps(msgInfos,
                                        default=encode_MsgPartitionInfo,
                                        ensure_ascii=False)
        print(saveResult.MsgInfo)
        consumer.close()
        saveResult.message = "Success"
        saveResult.Code = 200

        producer = KafkaProducer(bootstrap_servers=tmpbootstrap_servers)
        # Encode to bytes explicitly since no value_serializer is configured
        producer.send(topic + "_log",
                      json.dumps(saveResult,
                                 default=encode_SaveDataResult).encode())
        producer.flush()
    except Exception as e:
        # Close the try opened at the top of the function
        print('getMsgData error: ' + str(e))
Example #4
def consume(args):
    schema = args.schema
    skip_error = args.skip_error
    auto_offset_reset = args.auto_offset_reset
    topic = settings.KAFKA_TOPIC
    tables_pk = {}
    tables = settings.SCHEMAS.get(schema)
    partitions = []
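    # Every table of this schema maps to the single partition configured under settings.PARTITIONS[schema]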
    for table in tables:
        partition = settings.PARTITIONS.get(schema)
        tp = TopicPartition(topic, partition)
        partitions.append(tp)
        tables_pk[table] = reader.get_primary_key(schema, table)

    consumer = KafkaConsumer(
        bootstrap_servers=settings.KAFKA_SERVER,
        value_deserializer=lambda x: json.loads(x, object_hook=object_hook),
        key_deserializer=lambda x: x.decode() if x else None,
        enable_auto_commit=False,
        group_id=schema,
        auto_offset_reset=auto_offset_reset,
    )
    consumer.assign(partitions)

    event_list = {}
    is_insert = False
    last_time = 0
    len_event = 0
    logger.info(
        f'success consume topic:{topic},partitions:{partitions},schema:{schema},tables:{tables}'
    )

    for msg in consumer:  # type:ConsumerRecord
        logger.debug(f'kafka msg:{msg}')
        event = msg.value
        event_unixtime = event['event_unixtime'] / 10**6
        table = event['table']
        schema = event['schema']
        action = event['action']

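        # 'query' events carry raw SQL that is executed directly below; other events are buffered for batch insertion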
        if action == 'query':
            do_query = True
            query = event['values']['query']
        else:
            do_query = False
            query = None
            event_list.setdefault(table, []).append(event)
            len_event += 1

        if last_time == 0:
            last_time = event_unixtime

        if len_event == settings.INSERT_NUMS:
            is_insert = True
        else:
            if event_unixtime - last_time >= settings.INSERT_INTERVAL > 0:
                is_insert = True
        if is_insert or do_query:
            data_dict = {}
            events_num = 0
            for table, items in event_list.items():
                for item in items:
                    action = item['action']
                    action_core = item['action_core']
                    data_dict.setdefault(table, {}).setdefault(
                        table + schema + action + action_core, []).append(item)
            for table, v in data_dict.items():
                tmp_data = []
                for k1, v1 in v.items():
                    events_num += len(v1)
                    tmp_data.append(v1)
                try:
                    result = writer.insert_event(tmp_data, schema, table,
                                                 tables_pk.get(table))
                    if not result:
                        logger.error('insert event error!')
                        if not skip_error:
                            exit()

                    if settings.UI_ENABLE:
                        insert_into_redis('consumer', schema, table, len(v1))

                except Exception as e:
                    logger.error(f'insert event error!,error:{e}')
                    if not skip_error:
                        exit()
            if do_query:
                try:
                    logger.info(f'execute query:{query}')
                    writer.execute(query)
                except Exception as e:
                    logger.error(f'execute query error!,error:{e}')
                    if not skip_error:
                        exit()
            consumer.commit()
            logger.info(f'commit success {events_num} events!')
            event_list = {}
            is_insert = False
            len_event = last_time = 0
Example #5
class KafkaGroupReader:
    def __init__(self, kafka_config):
        self.log = logging.getLogger(__name__)
        self.kafka_config = kafka_config
        self._kafka_groups = defaultdict(lambda: defaultdict(dict))
        self.active_partitions = {}
        self._finished = False

    def read_group(self, group_id):
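        # A group's offset commits all land in one __consumer_offsets partition, derived from the group id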
        partition_count = get_offset_topic_partition_count(self.kafka_config)
        partition = get_group_partition(group_id, partition_count)
        return self.read_groups(partition)[group_id]

    def read_groups(self, partition=None):
        self.consumer = KafkaConsumer(
            group_id='offset_monitoring_consumer',
            bootstrap_servers=self.kafka_config.broker_list,
            auto_offset_reset='earliest',
            enable_auto_commit=False,
            consumer_timeout_ms=30000,
            fetch_max_wait_ms=2000,
            max_partition_fetch_bytes=10 * 1024 * 1024,  # 10MB
        )

        # Fetch metadata as partitions_for_topic only returns locally cached metadata
        # See https://github.com/dpkp/kafka-python/issues/1742
        self.consumer.topics()

        if partition is not None:
            self.active_partitions = {
                partition: TopicPartition(CONSUMER_OFFSET_TOPIC, partition),
            }
        else:
            self.active_partitions = {
                p: TopicPartition(CONSUMER_OFFSET_TOPIC, p)
                for p in self.consumer.partitions_for_topic(
                    CONSUMER_OFFSET_TOPIC)
            }
        self.watermarks = self.get_current_watermarks(
            list(self.active_partitions.values()))
        # Active partitions are not empty. Remove the empty ones.
        self.active_partitions = {
            p: tp
            for p, tp in self.active_partitions.items()
            if tp.partition in self.watermarks
            and self.watermarks[tp.partition].highmark > 0 and self.watermarks[
                tp.partition].highmark > self.watermarks[tp.partition].lowmark
        }
        # Cannot consume if there are no active partitions
        if not self.active_partitions:
            return {}

        self.consumer.assign(list(self.active_partitions.values()))
        self.log.info("Consuming from %s", self.active_partitions)

        message_iterator = iter(self.consumer)

        while not self.finished():
            try:
                message = next(message_iterator)
            except StopIteration:
                continue
            # Stop when reaching the last message written to the
            # __consumer_offsets topic when KafkaGroupReader first started
            if message.offset >= self.watermarks[
                    message.partition].highmark - 1:
                self.remove_partition_from_consumer(message.partition)
            self.process_consumer_offset_message(message)

        self._remove_unsubscribed_topics()

        return {
            group: topics.keys()
            for group, topics in six.iteritems(self._kafka_groups) if topics
        }

    def _remove_unsubscribed_topics(self):
        for group, topics in list(six.iteritems(self._kafka_groups)):
            for topic, partitions in list(six.iteritems(topics)):
                # If offsets for all partitions are 0, consider the topic as unsubscribed
                if not any(partitions.values()):
                    del self._kafka_groups[group][topic]
                    self.log.info(
                        "Removed group {group} topic {topic} from list of groups"
                        .format(group=group, topic=topic))

    def remove_partition_from_consumer(self, partition):
        deleted = self.active_partitions.pop(partition)
        # Terminate if there are no more partitions to consume
        if not self.active_partitions:
            self.log.info("Completed reading from all partitions")
            self._finished = True
            return
        # Reassign the remaining partitions to the consumer while saving the
        # position
        positions = [(p, self.consumer.position(p))
                     for p in self.active_partitions.values()]
        self.consumer.assign(list(self.active_partitions.values()))
        for topic_partition, position in positions:
            self.consumer.seek(topic_partition, position)
        self.log.info(
            "Completed reading from %s. Remaining partitions: %s",
            deleted,
            self.active_partitions,
        )

    def parse_consumer_offset_message(self, message):
        key = message.key
        ((key_schema, ), cur) = relative_unpack(b'>h', key, 0)
        if key_schema not in [0, 1]:
            raise InvalidMessageException(
            )  # This is not an offset commit message
        (group, cur) = read_short_string(key, cur)
        (topic, cur) = read_short_string(key, cur)
        ((partition, ), cur) = relative_unpack(b'>l', key, cur)
        if message.value:
            value = message.value
            ((value_schema, ), cur) = relative_unpack(b'>h', value, 0)
            if value_schema not in [0, 1]:
                raise InvalidMessageException()  # Unrecognized message value
            ((offset, ), cur) = relative_unpack(b'>q', value, cur)
        else:
            offset = None  # Offset was deleted
        return group.decode(), topic.decode(), partition, offset

    def process_consumer_offset_message(self, message):
        try:
            group, topic, partition, offset = self.parse_consumer_offset_message(
                message)
        except InvalidMessageException:
            return

        if offset is not None:
            self._kafka_groups[group][topic][partition] = offset
            self.log.info(
                "Updated group {group} topic {topic} and updated offset in list of groups"
                .format(
                    group=group,
                    topic=topic,
                ), )
        # TODO: check if we can ever find an offset commit message with message.value is None
        elif offset is None and group in self._kafka_groups and \
                topic in self._kafka_groups[group]:  # No offset means topic deletion
            del self._kafka_groups[group][topic]
            self.log.info(
                "Removed group {group} topic {topic} from list of groups".
                format(group=group, topic=topic))

    def get_current_watermarks(self, partitions=None):
        client = KafkaToolClient(self.kafka_config.broker_list)
        client.load_metadata_for_topics(CONSUMER_OFFSET_TOPIC)
        offsets = get_topics_watermarks(
            client,
            [CONSUMER_OFFSET_TOPIC],
        )
        partitions_set = set(tp.partition
                             for tp in partitions) if partitions else None
        return {
            part: offset
            for part, offset in six.iteritems(offsets[CONSUMER_OFFSET_TOPIC])
            if offset.highmark > offset.lowmark and (
                partitions is None or part in partitions_set)
        }

    def finished(self):
        return self._finished
Example #6
class KafkaGroupReader:
    def __init__(self, kafka_config):
        self.log = logging.getLogger(__name__)
        self.kafka_config = kafka_config
        self.kafka_groups = defaultdict(set)
        self.active_partitions = {}
        self._finished = False

    def read_group(self, group_id):
        partition_count = get_offset_topic_partition_count(self.kafka_config)
        partition = get_group_partition(group_id, partition_count)
        return self.read_groups(partition).get(group_id, [])

    def read_groups(self, partition=None):
        self.consumer = KafkaConsumer(
            group_id='offset_monitoring_consumer',
            bootstrap_servers=self.kafka_config.broker_list,
            auto_offset_reset='earliest',
            enable_auto_commit=False,
            consumer_timeout_ms=30000,
            fetch_max_wait_ms=2000,
            max_partition_fetch_bytes=10 * 1024 * 1024,  # 10MB
        )

        if partition is not None:
            self.active_partitions = {
                partition: TopicPartition(CONSUMER_OFFSET_TOPIC, partition),
            }
        else:
            self.active_partitions = {
                p: TopicPartition(CONSUMER_OFFSET_TOPIC, p)
                for p in self.consumer.partitions_for_topic(
                    CONSUMER_OFFSET_TOPIC)
            }
        self.watermarks = self.get_current_watermarks(
            self.active_partitions.values())
        # Active partitions are not empty. Remove the empty ones.
        self.active_partitions = {
            p: tp
            for p, tp in self.active_partitions.items()
            if tp.partition in self.watermarks
            and self.watermarks[tp.partition].highmark > 0 and self.watermarks[
                tp.partition].highmark > self.watermarks[tp.partition].lowmark
        }
        # Cannot consume if there are no active partitions
        if not self.active_partitions:
            return {}

        self.consumer.assign(self.active_partitions.values())
        self.log.info("Consuming from %s", self.active_partitions)
        while not self.finished():
            try:
                message = next(self.consumer)
            except StopIteration:
                continue
            # Stop when reaching the last message written to the
            # __consumer_offsets topic when KafkaGroupReader first started
            if message.offset >= self.watermarks[
                    message.partition].highmark - 1:
                self.remove_partition_from_consumer(message.partition)
            self.process_consumer_offset_message(message)

        return {
            group: topics
            for group, topics in self.kafka_groups.items() if topics
        }

    def remove_partition_from_consumer(self, partition):
        deleted = self.active_partitions.pop(partition)
        # Terminate if there are no more partitions to consume
        if not self.active_partitions:
            self.log.info("Completed reading from all partitions")
            self._finished = True
            return
        # Reassign the remaining partitions to the consumer while saving the
        # position
        positions = [(p, self.consumer.position(p))
                     for p in self.active_partitions.values()]
        self.consumer.assign(self.active_partitions.values())
        for topic_partition, position in positions:
            self.consumer.seek(topic_partition, position)
        self.log.info(
            "Completed reading from %s. Remaining partitions: %s",
            deleted,
            self.active_partitions,
        )

    def parse_consumer_offset_message(self, message):
        key = bytearray(message.key)
        ((key_schema, ), cur) = relative_unpack(b'>h', key, 0)
        if key_schema not in [0, 1]:
            raise InvalidMessageException(
            )  # This is not an offset commit message
        (group, cur) = read_short_string(key, cur)
        (topic, cur) = read_short_string(key, cur)
        ((partition, ), cur) = relative_unpack(b'>l', key, cur)
        if message.value:
            value = bytearray(message.value)
            ((value_schema, ), cur) = relative_unpack(b'>h', value, 0)
            if value_schema not in [0, 1]:
                raise InvalidMessageException()  # Unrecognized message value
            ((offset, ), cur) = relative_unpack(b'>q', value, cur)
        else:
            offset = None  # Offset was deleted
        return str(group), str(topic), partition, offset

    def process_consumer_offset_message(self, message):
        try:
            group, topic, partition, offset = self.parse_consumer_offset_message(
                message)
        except InvalidMessageException:
            return

        if offset and (group not in self.kafka_groups
                       or topic not in self.kafka_groups[group]):
            self.kafka_groups[group].add(topic)
            self.log.info("Added group %s topic %s to list of groups", group,
                          topic)
        elif not offset and group in self.kafka_groups and \
                topic in self.kafka_groups[group]:  # No offset means topic deletion
            self.kafka_groups[group].discard(topic)
            self.log.info("Removed group %s topic %s from list of groups",
                          group, topic)

    def get_current_watermarks(self, partitions=None):
        client = KafkaToolClient(self.kafka_config.broker_list)
        client.load_metadata_for_topics(CONSUMER_OFFSET_TOPIC)
        offsets = get_topics_watermarks(
            client,
            [CONSUMER_OFFSET_TOPIC],
        )
        partitions_set = set(tp.partition
                             for tp in partitions) if partitions else None
        return {
            part: offset
            for part, offset in offsets[CONSUMER_OFFSET_TOPIC].items()
            if offset.highmark > offset.lowmark and (
                partitions is None or part in partitions_set)
        }

    def finished(self):
        return self._finished
Example #7
def consume(args):
    settings = Global.settings
    writer = Global.writer
    reader = Global.reader

    schema = args.schema
    skip_error = args.skip_error
    auto_offset_reset = args.auto_offset_reset
    offset = args.offset

    topic = settings.kafka_topic
    tables_pk = {}
    schema_table = settings.schema_table.get(schema)
    tables = schema_table.get("tables")

    for table in tables:
        tables_pk[table] = reader.get_primary_key(schema, table)

    consumer = KafkaConsumer(
        bootstrap_servers=settings.kafka_server,
        value_deserializer=lambda x: json.loads(x, object_hook=object_hook),
        key_deserializer=lambda x: x.decode() if x else None,
        enable_auto_commit=False,
        group_id=schema,
        auto_offset_reset=auto_offset_reset,
    )
    partition = schema_table.get("kafka_partition")
    topic_partition = TopicPartition(topic, partition)
    consumer.assign([topic_partition])
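    # Optionally resume from an explicit offset instead of the committed position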
    if offset:
        consumer.seek(topic_partition, offset)
    event_list = {}
    is_insert = False
    last_time = 0
    len_event = 0
    logger.info(
        f"success consume topic:{topic},partitions:{partition},schema:{schema},tables:{tables}"
    )

    for msg in consumer:  # type:ConsumerRecord
        logger.debug(f"kafka msg:{msg}")
        event = msg.value
        event_unixtime = event["event_unixtime"] / 10**6
        table = event["table"]
        schema = event["schema"]
        action = event["action"]

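        # 'query' events (e.g. ALTER TABLE statements) are executed directly; row events are buffered for batch insert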
        if action == "query":
            alter_table = True
            query = event["values"]["query"]
        else:
            alter_table = False
            query = None
            event_list.setdefault(table, []).append(event)
            len_event += 1

        if last_time == 0:
            last_time = event_unixtime

        if len_event == settings.insert_num:
            is_insert = True
        else:
            if event_unixtime - last_time >= settings.insert_interval > 0:
                is_insert = True
        if is_insert or alter_table:
            data_dict = {}
            events_num = 0
            for table, items in event_list.items():
                for item in items:
                    action = item["action"]
                    action_core = item["action_core"]
                    data_dict.setdefault(table, {}).setdefault(
                        table + schema + action + action_core, []).append(item)
            for table, v in data_dict.items():
                tmp_data = []
                for k1, v1 in v.items():
                    events_num += len(v1)
                    tmp_data.append(v1)
                try:
                    result = writer.insert_event(tmp_data, schema, table,
                                                 tables_pk.get(table))
                    if not result:
                        logger.error("insert event error!")
                        if not skip_error:
                            exit()
                except Exception as e:
                    logger.error(f"insert event error!,error:{e}")
                    if not skip_error:
                        exit()
            if alter_table:
                try:
                    logger.info(f"execute query:{query}")
                    writer.execute(query)
                except Exception as e:
                    logger.error(f"execute query error!,error:{e}")
                    if not skip_error:
                        exit()
            consumer.commit()
            logger.info(f"commit success {events_num} events!")
            event_list = {}
            is_insert = False
            len_event = last_time = 0
Example #8
#*****************************************************************************************

# https://github.com/cuyu/python-demo/blob/master/demo_kafka.py
_TOPIC_NAME = 'anomaly'
_BROKERS = ['localhost:9092']  # ['localhost.com:9092', 'systest-auto-deployer:9092']
_GROUP_ID = 'my_group'

consumer = KafkaConsumer(
    group_id='ddd',
    auto_offset_reset='smallest',  # largest
    # When True, the consumer syncs the offset to ZooKeeper after consuming messages,
    # so that when a consumer fails, a new consumer can fetch the latest offset from ZooKeeper
    enable_auto_commit=False,
    bootstrap_servers=_BROKERS)
# consumer = KafkaConsumer(bootstrap_servers=_BROKERS)
tp = TopicPartition(_TOPIC_NAME, 0)
consumer.assign([tp])
print(consumer.committed(tp))
# consumer.subscribe(topics=[_TOPIC_NAME])
# # Subscribe to a regex topic pattern
# consumer.subscribe(pattern='^awesome.*')
print(consumer.topics())
# partition = TopicPartition(topic=_TOPIC_NAME, partition=consumer.partitions_for_topic(_TOPIC_NAME))
# consumer.seek_to_beginning()
# consumer.seek(TopicPartition(_TOPIC_NAME, 0), 0)
consumer.seek(tp, 50)  # start consuming from offset 50
a = []
for m in consumer:
    if len(a) < 5:
        print(m.offset)
        a.append(m.offset)
    else:
        break  # stop after collecting the first five offsets
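
Example #3 computes a partition's backlog by comparing the committed offset with the last offset obtained via seek_to_end/position. Below is a minimal, self-contained sketch of the same check using end_offsets, assuming a broker at localhost:9092 and reusing the 'anomaly' topic and 'my_group' group id from Example #8 (all placeholders):

from kafka import KafkaConsumer, TopicPartition

consumer = KafkaConsumer(bootstrap_servers=['localhost:9092'],
                         group_id='my_group',
                         enable_auto_commit=False)
tp = TopicPartition('anomaly', 0)
consumer.assign([tp])

committed = consumer.committed(tp) or 0      # last committed offset, 0 if nothing committed yet
end_offset = consumer.end_offsets([tp])[tp]  # offset of the next message to be produced
print(f'committed={committed}, end={end_offset}, lag={end_offset - committed}')
consumer.close()

Using end_offsets keeps the consumer's position untouched, avoiding the seek_to_end/position round trip from Example #3.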
Example #9
File: util.py  Project: Yelp/kafka-utils
class KafkaGroupReader:

    def __init__(self, kafka_config):
        self.log = logging.getLogger(__name__)
        self.kafka_config = kafka_config
        self._kafka_groups = defaultdict(lambda: defaultdict(dict))
        self.active_partitions = {}
        self._finished = False

    def read_group(self, group_id):
        partition_count = get_offset_topic_partition_count(self.kafka_config)
        partition = get_group_partition(group_id, partition_count)
        return self.read_groups(partition).get(group_id, [])

    def read_groups(self, partition=None):
        self.consumer = KafkaConsumer(
            group_id='offset_monitoring_consumer',
            bootstrap_servers=self.kafka_config.broker_list,
            auto_offset_reset='earliest',
            enable_auto_commit=False,
            consumer_timeout_ms=30000,
            fetch_max_wait_ms=2000,
            max_partition_fetch_bytes=10 * 1024 * 1024,  # 10MB
        )

        # Fetch metadata as partitions_for_topic only returns locally cached metadata
        # See https://github.com/dpkp/kafka-python/issues/1742
        self.consumer.topics()

        if partition is not None:
            self.active_partitions = {
                partition: TopicPartition(CONSUMER_OFFSET_TOPIC, partition),
            }
        else:
            self.active_partitions = {
                p: TopicPartition(CONSUMER_OFFSET_TOPIC, p)
                for p in self.consumer.partitions_for_topic(CONSUMER_OFFSET_TOPIC)
            }
        self.watermarks = self.get_current_watermarks(list(self.active_partitions.values()))
        # Active partitions are not empty. Remove the empty ones.
        self.active_partitions = {
            p: tp for p, tp in self.active_partitions.items()
            if tp.partition in self.watermarks and
            self.watermarks[tp.partition].highmark > 0 and
            self.watermarks[tp.partition].highmark > self.watermarks[tp.partition].lowmark
        }
        # Cannot consume if there are no active partitions
        if not self.active_partitions:
            return {}

        self.consumer.assign(list(self.active_partitions.values()))
        self.log.info("Consuming from %s", self.active_partitions)

        message_iterator = iter(self.consumer)

        while not self.finished():
            try:
                message = next(message_iterator)
            except StopIteration:
                continue
            # Stop when reaching the last message written to the
            # __consumer_offsets topic when KafkaGroupReader first started
            if message.offset >= self.watermarks[message.partition].highmark - 1:
                self.remove_partition_from_consumer(message.partition)
            self.process_consumer_offset_message(message)

        self._remove_unsubscribed_topics()

        return {
            group: topics.keys()
            for group, topics in six.iteritems(self._kafka_groups)
            if topics
        }

    def _remove_unsubscribed_topics(self):
        for group, topics in list(six.iteritems(self._kafka_groups)):
            for topic, partitions in list(six.iteritems(topics)):
                # If offsets for all partitions are 0, consider the topic as unsubscribed
                if not any(partitions.values()):
                    del self._kafka_groups[group][topic]
                    self.log.info("Removed group {group} topic {topic} from list of groups".format(group=group, topic=topic))

    def remove_partition_from_consumer(self, partition):
        deleted = self.active_partitions.pop(partition)
        # Terminate if there are no more partitions to consume
        if not self.active_partitions:
            self.log.info("Completed reading from all partitions")
            self._finished = True
            return
        # Reassign the remaining partitions to the consumer while saving the
        # position
        positions = [
            (p, self.consumer.position(p))
            for p in self.active_partitions.values()
        ]
        self.consumer.assign(list(self.active_partitions.values()))
        for topic_partition, position in positions:
            self.consumer.seek(topic_partition, position)
        self.log.info(
            "Completed reading from %s. Remaining partitions: %s",
            deleted,
            self.active_partitions,
        )

    def parse_consumer_offset_message(self, message):
        key = message.key
        ((key_schema,), cur) = relative_unpack(b'>h', key, 0)
        if key_schema not in [0, 1]:
            raise InvalidMessageException()   # This is not an offset commit message
        (group, cur) = read_short_string(key, cur)
        (topic, cur) = read_short_string(key, cur)
        ((partition,), cur) = relative_unpack(b'>l', key, cur)
        if message.value:
            value = message.value
            ((value_schema,), cur) = relative_unpack(b'>h', value, 0)
            if value_schema not in [0, 1]:
                raise InvalidMessageException()  # Unrecognized message value
            ((offset,), cur) = relative_unpack(b'>q', value, cur)
        else:
            offset = None  # Offset was deleted
        return group.decode(), topic.decode(), partition, offset

    def process_consumer_offset_message(self, message):
        try:
            group, topic, partition, offset = self.parse_consumer_offset_message(message)
        except InvalidMessageException:
            return

        if offset is not None:
            self._kafka_groups[group][topic][partition] = offset
            self.log.info(
                "Updated group {group} topic {topic} and updated offset in list of groups".format(
                    group=group,
                    topic=topic,
                ),
            )
        # TODO: check if we can ever find an offset commit message with message.value is None
        elif offset is None and group in self._kafka_groups and \
                topic in self._kafka_groups[group]:  # No offset means topic deletion
            del self._kafka_groups[group][topic]
            self.log.info("Removed group {group} topic {topic} from list of groups".format(group=group, topic=topic))

    def get_current_watermarks(self, partitions=None):
        client = KafkaToolClient(self.kafka_config.broker_list)
        client.load_metadata_for_topics(CONSUMER_OFFSET_TOPIC)
        offsets = get_topics_watermarks(
            client,
            [CONSUMER_OFFSET_TOPIC],
        )
        partitions_set = set(tp.partition for tp in partitions) if partitions else None
        return {part: offset for part, offset
                in six.iteritems(offsets[CONSUMER_OFFSET_TOPIC])
                if offset.highmark > offset.lowmark and
                (partitions is None or part in partitions_set)}

    def finished(self):
        return self._finished